X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-likely-bias", cl::init(0),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
94 "that all conditionals will be executed. For example for merging "
95 "the conditionals (a == b && c > d), if its known that a == b is "
96 "likely, then it is likely that if the conditionals are split "
97 "both sides will be executed, so it may be desirable to increase "
98 "the instruction cost threshold. Set to -1 to never merge likely "
99 "branches."),
100 cl::Hidden);
101
103 "x86-br-merging-unlikely-bias", cl::init(-1),
104 cl::desc(
105 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "unlikely, then it is unlikely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to decrease "
110 "the instruction cost threshold. Set to -1 to never merge unlikely "
111 "branches."),
112 cl::Hidden);
113
115 "mul-constant-optimization", cl::init(true),
116 cl::desc("Replace 'mul x, Const' with more effective instructions like "
117 "SHIFT, LEA, etc."),
118 cl::Hidden);
119
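// Usage sketch (an assumption about typical workflows, not part of this file):
// since these are cl::opt flags, they can be passed to tools that link in the
// X86 backend, e.g.
//   llc -mtriple=x86_64-- -x86-br-merging-base-cost=-1 test.ll
// which, per the description above, disables merging of multiple conditionals
// into a single branch.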
121 const X86Subtarget &STI)
122 : TargetLowering(TM), Subtarget(STI) {
123 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
124 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
125
126 // Set up the TargetLowering object.
127
128 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132
133 // For 64-bit, since we have so many registers, use the ILP scheduler.
134 // For 32-bit, use the register pressure specific scheduling.
135 // For Atom, always use ILP scheduling.
136 if (Subtarget.isAtom())
138 else if (Subtarget.is64Bit())
140 else
142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144
145 // Bypass expensive divides and use cheaper ones.
146 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
147 if (Subtarget.hasSlowDivide32())
148 addBypassSlowDiv(32, 8);
149 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
150 addBypassSlowDiv(64, 32);
151 }
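  // Effect sketch (hedged; the guarded divide is emitted by the generic
  // slow-division bypass logic): addBypassSlowDiv(32, 8) requests a runtime
  // check so that a 32-bit divide whose operands fit in 8 bits uses the much
  // cheaper 8-bit DIV, and addBypassSlowDiv(64, 32) similarly falls back to a
  // 32-bit divide when a 64-bit division's operands fit in 32 bits.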
152
153 // Setup Windows compiler runtime calls.
154 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
155 static const struct {
156 const RTLIB::Libcall Op;
157 const char * const Name;
158 const CallingConv::ID CC;
159 } LibraryCalls[] = {
160 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
161 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
162 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
163 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
164 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
165 };
166
167 for (const auto &LC : LibraryCalls) {
168 setLibcallName(LC.Op, LC.Name);
169 setLibcallCallingConv(LC.Op, LC.CC);
170 }
171 }
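  // Illustrative consequence (sketch, not taken from this file): with the
  // table above, 64-bit IR such as
  //   %q = sdiv i64 %a, %b
  // on 32-bit Windows is lowered to a call to the MSVC runtime helper
  // "_alldiv" using the X86_StdCall convention, since there is no single
  // 64-bit divide instruction available there.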
172
173 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
174 // MSVCRT doesn't have powi; fall back to pow
175 setLibcallName(RTLIB::POWI_F32, nullptr);
176 setLibcallName(RTLIB::POWI_F64, nullptr);
177 }
178
179 if (Subtarget.canUseCMPXCHG16B())
181 else if (Subtarget.canUseCMPXCHG8B())
183 else
185
186 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
187
189
190 // Set up the register classes.
191 addRegisterClass(MVT::i8, &X86::GR8RegClass);
192 addRegisterClass(MVT::i16, &X86::GR16RegClass);
193 addRegisterClass(MVT::i32, &X86::GR32RegClass);
194 if (Subtarget.is64Bit())
195 addRegisterClass(MVT::i64, &X86::GR64RegClass);
196
197 for (MVT VT : MVT::integer_valuetypes())
199
200 // We don't accept any truncstore of integer registers.
201 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
202 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
204 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
205 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
206 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
207
208 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
209
210 // SETOEQ and SETUNE require checking two conditions.
211 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
214 }
215
216 // Integer absolute.
217 if (Subtarget.canUseCMOV()) {
218 setOperationAction(ISD::ABS , MVT::i16 , Custom);
219 setOperationAction(ISD::ABS , MVT::i32 , Custom);
220 if (Subtarget.is64Bit())
221 setOperationAction(ISD::ABS , MVT::i64 , Custom);
222 }
223
224 // Absolute difference.
225 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
226 setOperationAction(Op , MVT::i8 , Custom);
227 setOperationAction(Op , MVT::i16 , Custom);
228 setOperationAction(Op , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(Op , MVT::i64 , Custom);
231 }
232
233 // Signed saturation subtraction.
237 if (Subtarget.is64Bit())
239
240 // Funnel shifts.
241 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
242 // For slow shld targets we only lower for code size.
243 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
244
245 setOperationAction(ShiftOp , MVT::i8 , Custom);
246 setOperationAction(ShiftOp , MVT::i16 , Custom);
247 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
248 if (Subtarget.is64Bit())
249 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
250 }
251
252 if (!Subtarget.useSoftFloat()) {
253 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
254 // operation.
259 // We have an algorithm for SSE2, and we turn this into a 64-bit
260 // FILD or VCVTUSI2SS/SD for other targets.
263 // We have an algorithm for SSE2->double, and we turn this into a
264 // 64-bit FILD followed by conditional FADD for other targets.
267
268 // Promote i8 SINT_TO_FP to larger SINT_TO_FPs, as X86 doesn't have
269 // this operation.
272 // SSE has no i16 to fp conversion, only i32. We promote in the handler
273 // to allow f80 to use i16 and f64 to use i16 with sse1 only
276 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
279 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
280 // are Legal, f80 is custom lowered.
283
284 // Promote i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
285 // this operation.
287 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
294 // are Legal, f80 is custom lowered.
297
298 // Handle FP_TO_UINT by promoting the destination to a larger signed
299 // conversion.
301 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310
315
316 if (!Subtarget.is64Bit()) {
319 }
320 }
321
322 if (Subtarget.hasSSE2()) {
323 // Custom lowering for saturating float to int conversions.
324 // We handle promotion to larger result types manually.
325 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
328 }
329 if (Subtarget.is64Bit()) {
332 }
333 }
334
335 // Handle address space casts between mixed sized pointers.
338
339 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
340 if (!Subtarget.hasSSE2()) {
343 if (Subtarget.is64Bit()) {
345 // Without SSE, i64->f64 goes through memory.
347 }
348 } else if (!Subtarget.is64Bit())
350
351 // Scalar integer divide and remainder are lowered to use operations that
352 // produce two results, to match the available instructions. This exposes
353 // the two-result form to trivial CSE, which is able to combine x/y and x%y
354 // into a single instruction.
355 //
356 // Scalar integer multiply-high is also lowered to use two-result
357 // operations, to match the available instructions. However, plain multiply
358 // (low) operations are left as Legal, as there are single-result
359 // instructions for this in x86. Using the two-result multiply instructions
360 // when both high and low results are needed must be arranged by dagcombine.
361 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
368 }
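  // Worked example (illustrative): IR like
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is CSE'd into a single ISD::SDIVREM node, which matches x86's IDIV
  // instruction that produces quotient and remainder together (EAX/EDX for
  // the i32 case).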
369
370 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
372 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
373 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377 if (Subtarget.is64Bit())
382
383 setOperationAction(ISD::FREM , MVT::f32 , Expand);
384 setOperationAction(ISD::FREM , MVT::f64 , Expand);
385 setOperationAction(ISD::FREM , MVT::f80 , Expand);
386 setOperationAction(ISD::FREM , MVT::f128 , Expand);
387
388 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
394 }
395
396 // Promote the i8 variants and force them on up to i32 which has a shorter
397 // encoding.
398 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
400 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
401 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
402 // promote that too.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
405
406 if (!Subtarget.hasBMI()) {
407 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
409 if (Subtarget.is64Bit()) {
410 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
412 }
413 }
414
415 if (Subtarget.hasLZCNT()) {
416 // When promoting the i8 variants, force them to i32 for a shorter
417 // encoding.
418 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
420 } else {
421 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
422 if (VT == MVT::i64 && !Subtarget.is64Bit())
423 continue;
426 }
427 }
428
431 // Special handling for half-precision floating point conversions.
432 // If we don't have F16C support, then lower half float conversions
433 // into library calls.
435 Op, MVT::f32,
436 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
437 // There's never any support for operations beyond MVT::f32.
438 setOperationAction(Op, MVT::f64, Expand);
439 setOperationAction(Op, MVT::f80, Expand);
440 setOperationAction(Op, MVT::f128, Expand);
441 }
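  // Practical effect (sketch): without F16C, IR such as
  //   %f = fpext half %h to float
  // becomes a call to the __extendhfsf2 helper (and fptrunc to half uses
  // __truncsfhf2), matching the libcall names registered further below; with
  // F16C the f32<->f16 conversions are custom-lowered to VCVTPH2PS/VCVTPS2PH.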
442
443 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
456 }
457
461 if (Subtarget.is64Bit())
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
473 }
474
476
477 if (!Subtarget.hasMOVBE())
479
480 // X86 wants to expand cmov itself.
481 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
486 }
487 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
488 if (VT == MVT::i64 && !Subtarget.is64Bit())
489 continue;
492 }
493
494 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
497
499 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
500 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
505 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
506
507 // Darwin ABI issue.
508 for (auto VT : { MVT::i32, MVT::i64 }) {
509 if (VT == MVT::i64 && !Subtarget.is64Bit())
510 continue;
517 }
518
519 // 64-bit shl, sra, srl (iff 32-bit x86)
520 for (auto VT : { MVT::i32, MVT::i64 }) {
521 if (VT == MVT::i64 && !Subtarget.is64Bit())
522 continue;
526 }
527
528 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
530
532
533 // Expand certain atomics
534 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 }
543
544 if (!Subtarget.is64Bit())
546
547 if (Subtarget.canUseCMPXCHG16B())
549
550 // FIXME - use subtarget debug flags
551 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
552 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
553 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
555 }
556
559
562
563 setOperationAction(ISD::TRAP, MVT::Other, Legal);
565 if (Subtarget.isTargetPS())
567 else
569
570 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
572 setOperationAction(ISD::VAEND , MVT::Other, Expand);
573 bool Is64Bit = Subtarget.is64Bit();
574 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
575 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
576
579
581
582 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
585
587
588 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
589 setOperationAction(ISD::FABS, VT, Action);
590 setOperationAction(ISD::FNEG, VT, Action);
592 setOperationAction(ISD::FREM, VT, Action);
593 setOperationAction(ISD::FMA, VT, Action);
594 setOperationAction(ISD::FMINNUM, VT, Action);
595 setOperationAction(ISD::FMAXNUM, VT, Action);
598 setOperationAction(ISD::FSIN, VT, Action);
599 setOperationAction(ISD::FCOS, VT, Action);
600 setOperationAction(ISD::FSINCOS, VT, Action);
601 setOperationAction(ISD::FSQRT, VT, Action);
602 setOperationAction(ISD::FPOW, VT, Action);
603 setOperationAction(ISD::FLOG, VT, Action);
604 setOperationAction(ISD::FLOG2, VT, Action);
605 setOperationAction(ISD::FLOG10, VT, Action);
606 setOperationAction(ISD::FEXP, VT, Action);
607 setOperationAction(ISD::FEXP2, VT, Action);
608 setOperationAction(ISD::FEXP10, VT, Action);
609 setOperationAction(ISD::FCEIL, VT, Action);
610 setOperationAction(ISD::FFLOOR, VT, Action);
612 setOperationAction(ISD::FRINT, VT, Action);
613 setOperationAction(ISD::BR_CC, VT, Action);
614 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::FROUND, VT, Action);
619 setOperationAction(ISD::FTRUNC, VT, Action);
620 setOperationAction(ISD::FLDEXP, VT, Action);
621 };
622
623 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
624 // f16, f32 and f64 use SSE.
625 // Set up the FP register classes.
626 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
627 : &X86::FR16RegClass);
628 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
629 : &X86::FR32RegClass);
630 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
631 : &X86::FR64RegClass);
632
633 // Disable f32->f64 extload as we can only generate this in one instruction
634 // under optsize. So it's easier to pattern match (fpext (load)) for that
635 // case instead of needing to emit 2 instructions for extload in the
636 // non-optsize case.
637 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
638
639 for (auto VT : { MVT::f32, MVT::f64 }) {
640 // Use ANDPD to simulate FABS.
642
643 // Use XORP to simulate FNEG.
645
646 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648
649 // These might be better off as horizontal vector ops.
652
653 // We don't support sin/cos/fmod
657 }
658
659 // Half type will be promoted by default.
660 setF16Action(MVT::f16, Promote);
668
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
716
717 // Use XORP to simulate FNEG.
719
720 if (UseX87)
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
727
728 // We don't support sin/cos/fmod
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
748
749 // Always expand sin/cos functions even though x87 has an instruction.
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
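  // Illustrative note: a "legal FP immediate" avoids a constant-pool load.
  // For example, with the x87 register file a constant 1.0 can be
  // materialized by a single FLD1, while under SSE only +0.0 is free (via
  // xorps/xorpd); every other FP constant is expanded to a load from memory.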
779
780 // Handle constrained floating-point operations of scalar.
793
794 // We don't support FMA.
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
803 {
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
822
834
835 // Handle constrained floating-point operations of scalar.
841 if (isTypeLegal(MVT::f16)) {
844 } else {
846 }
847 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
848 // as Custom.
850 }
851
852 // f128 uses xmm registers, but most operations require libcalls.
853 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
854 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
855 : &X86::VR128RegClass);
856
857 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
858
869
873
879 // No STRICT_FSINCOS
882
885 // We need to custom handle any FP_ROUND with an f128 input, but
886 // LegalizeDAG uses the result type to know when to run a custom handler.
887 // So we have to list all legal floating point result types here.
888 if (isTypeLegal(MVT::f32)) {
891 }
892 if (isTypeLegal(MVT::f64)) {
895 }
896 if (isTypeLegal(MVT::f80)) {
899 }
900
902
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
905 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
908 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
909 }
910
911 // Always use a library call for pow.
912 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
915 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
916
925
926 // Some FP actions are always expanded for vector types.
927 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
928 MVT::v4f32, MVT::v8f32, MVT::v16f32,
929 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
942 }
943
944 // First set operation action for all vector types to either promote
945 // (for widening) or expand (for scalarization). Then we will selectively
946 // turn on ones that can be effectively codegen'd.
986 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
987 setTruncStoreAction(InnerVT, VT, Expand);
988
991
992 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
993 // types, we have to deal with them whether we ask for Expansion or not.
994 // Setting Expand causes its own optimisation problems though, so leave
995 // them legal.
996 if (VT.getVectorElementType() == MVT::i1)
997 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
998
999 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1000 // split/scalarized right now.
1001 if (VT.getVectorElementType() == MVT::f16 ||
1002 VT.getVectorElementType() == MVT::bf16)
1003 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1004 }
1005 }
1006
1007 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1008 // with -msoft-float, disable use of MMX as well.
1009 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1010 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1011 // No operations on x86mmx supported, everything uses intrinsics.
1012 }
1013
1014 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1015 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1016 : &X86::VR128RegClass);
1017
1020
1021 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1022 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1029
1030 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1032
1038 }
1039
1040 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1041 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1042 : &X86::VR128RegClass);
1043
1044 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1045 // registers cannot be used even for integer operations.
1046 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1053 : &X86::VR128RegClass);
1054 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1060 }
1061
1062 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1063 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1068 }
1069
1070 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1071 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1072 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1073
1074 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1076 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1077 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1078 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1079 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1080 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1081 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1082 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1083 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1086
1087 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1088 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1089 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1090
1091 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1092 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1094
1095 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1096
1097 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1098 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1099 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1100 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1101 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1102 }
1103
1104 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1105 setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
1106 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1107 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1108 setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
1109 setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
1110
1121
1126
1127 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1131
1132 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1133 // setcc all the way to isel and prefer SETGT in some isel patterns.
1136 }
1137
1138 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1139 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1151 }
1152
1153 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1157
1158 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1159 continue;
1160
1163 }
1164 setF16Action(MVT::v8f16, Expand);
1165 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1166 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1167 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1168 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1169 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1170 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1172
1173 // Custom lower v2i64 and v2f64 selects.
1180
1187
1188 // Custom legalize these to avoid over promotion or custom promotion.
1189 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1194 }
1195
1200
1203
1206
1207 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1212
1217
1218 // We want to legalize this to an f64 load rather than an i64 load on
1219 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1220 // store.
1221 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1222 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1223 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1224 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1225 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1227
1228 // Add 32-bit vector stores to help vectorization opportunities.
1229 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1231
1235 if (!Subtarget.hasAVX512())
1237
1241
1243
1260
1261 // In the customized shift lowering, the legal v4i32/v2i64 cases
1262 // in AVX2 will be recognized.
1263 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1267 if (VT == MVT::v2i64) continue;
1272 }
1273
1279 }
1280
1281 if (Subtarget.hasGFNI()) {
1286 }
1287
1288 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1289 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1290 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1291 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1292
1293 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1296 }
1297
1298 // These might be better off as horizontal vector ops.
1303 }
1304
1305 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1306 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1309 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1313 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1319
1321 }
1322
1323 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1324 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1325 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1326 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1327 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1328 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1329 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1330 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1331
1332 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1335 }
1336
1340
1341 // FIXME: Do we need to handle scalar-to-vector here?
1342 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1343 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1344
1345 // We directly match byte blends in the backend as they match the VSELECT
1346 // condition form.
1348
1349 // SSE41 brings specific instructions for doing vector sign extend even in
1350 // cases where we don't have SRA.
1351 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1354 }
1355
1356 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1357 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1358 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1359 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1360 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1361 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1362 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1363 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1364 }
1365
1366 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1367 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1368 // do the pre and post work in the vector domain.
1371 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1372 // so that DAG combine doesn't try to turn it into uint_to_fp.
1375 }
1376 }
1377
1378 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1380 }
1381
1382 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1383 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1384 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1387 }
1388
1389 // XOP can efficiently perform BITREVERSE with VPPERM.
1390 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1392 }
1393
1394 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1395 bool HasInt256 = Subtarget.hasInt256();
1396
1397 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1398 : &X86::VR256RegClass);
1399 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1400 : &X86::VR256RegClass);
1401 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1402 : &X86::VR256RegClass);
1403 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1404 : &X86::VR256RegClass);
1405 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1406 : &X86::VR256RegClass);
1407 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1408 : &X86::VR256RegClass);
1409 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1410 : &X86::VR256RegClass);
1411
1412 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1425
1427
1431
1434 }
1435
1436 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1437 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1438
1439 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1440 // even though v8i16 is a legal type.
1441 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1442 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1443 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1444 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1448
1455
1467
1468 if (!Subtarget.hasAVX512())
1470
1471 // In the customized shift lowering, the legal v8i32/v4i64 cases
1472 // in AVX2 will be recognized.
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 if (VT == MVT::v4i64) continue;
1484 }
1485
1486 // These types need custom splitting if their input is a 128-bit vector.
1491
1495 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1496 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1499
1500 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1504 }
1505
1510
1511 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1516
1517 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1518 // setcc all the way to isel and prefer SETGT in some isel patterns.
1521 }
1522
1523 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1524 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1529
1530 if (Subtarget.hasAnyFMA()) {
1531 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1532 MVT::v2f64, MVT::v4f64 }) {
1535 }
1536 }
1537
1538 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1544 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1545 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1546 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1547
1548 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1549 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1550 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1551 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1552 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1553 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1554 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1555 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1556
1557 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1558 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1559
1560 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1561 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1562 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1563 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1564 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1565
1566 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1567 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1568 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1569 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1572 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1573 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1578
1579 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1580 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1581 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1582 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1583 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1584 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1585 }
1586
1587 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1590 }
1591
1592 if (HasInt256) {
1593 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1594 // when we have a 256-bit-wide blend with immediate.
1597
1598 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1599 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1600 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1601 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1602 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1603 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1604 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1605 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1606 }
1607 }
1608
1609 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1610 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1611 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1613 }
1614
1615 // Extract subvector is special because the value type
1616 // (result) is 128-bit but the source is 256-bit wide.
1617 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1618 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1620 }
1621
1622 // Custom lower several nodes for 256-bit types.
1623 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1624 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1634 }
1635 setF16Action(MVT::v16f16, Expand);
1636 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1637 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1639 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1640 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1641 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1642 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1643
1644 if (HasInt256) {
1646
1647 // Custom legalize 2x32 to get a little better code.
1650
1651 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1652 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1654 }
1655 }
1656
1657 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1658 Subtarget.hasF16C()) {
1659 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1662 }
1663 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1666 }
1667 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1668 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1669 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1670 }
1671 }
1672
1673 // This block controls legalization of the mask vector sizes that are
1674 // available with AVX512. 512-bit vectors are in a separate block controlled
1675 // by useAVX512Regs.
1676 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1677 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1678 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1679 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1680 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1681 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1682
1686
1687 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1688 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1689 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1690 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1691 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1692 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1693 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1694 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1699
1700 // There is no byte sized k-register load or store without AVX512DQ.
1701 if (!Subtarget.hasDQI()) {
1702 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1703 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1704 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1705 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1706
1711 }
1712
1713 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1714 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1718 }
1719
1720 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1722
1723 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1727
1734 }
1735
1736 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1738 }
1739 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1740 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1743 }
1744 }
1745
1746 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1747 // elements. 512-bits can be disabled based on prefer-vector-width and
1748 // required-vector-width function attributes.
1749 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1750 bool HasBWI = Subtarget.hasBWI();
1751
1752 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1753 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1754 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1755 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1756 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1757 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1758 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1759
1760 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1761 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1762 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1763 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1764 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1765 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1766 if (HasBWI)
1767 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1768 }
1769
1770 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1778 }
1779 setOperationAction(ISD::LRINT, MVT::v16f32,
1780 Subtarget.hasDQI() ? Legal : Custom);
1781 setOperationAction(ISD::LRINT, MVT::v8f64,
1782 Subtarget.hasDQI() ? Legal : Custom);
1783 if (Subtarget.hasDQI())
1784 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1785
1786 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1791 }
1792
1793 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1798 }
1799
1806
1818
1819 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1820 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1821 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1822 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1823 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1824 if (HasBWI)
1825 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1826
1827 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1828 // to 512-bit rather than use the AVX2 instructions so that we can use
1829 // k-masks.
1830 if (!Subtarget.hasVLX()) {
1831 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1832 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1835 }
1836 }
1837
1839 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1840 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1850
1851 if (HasBWI) {
1852 // Extends from v64i1 masks to 512-bit vectors.
1856 }
1857
1858 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1871
1873 }
1874
1875 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1878 }
1879
1880 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1884
1885 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1886 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1887 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1888 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1889
1890 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1891 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1892 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1893 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1894 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1895 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1896 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1897 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1898
1899 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1900 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1901
1902 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1912
1913 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1914 // setcc all the way to isel and prefer SETGT in some isel patterns.
1917 }
1918
1919 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1920 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1925
1926 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1933 }
1934
1935 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1936 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1939 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1940 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1942 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1947 }
1948
1949 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1950 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1951 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1952 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1953 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1954 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1955
1956 if (Subtarget.hasDQI()) {
1960 setOperationAction(Opc, MVT::v8i64, Custom);
1961 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1962 }
1963
1964 if (Subtarget.hasCDI()) {
1965 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1966 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1968 }
1969 } // Subtarget.hasCDI()
1970
1971 if (Subtarget.hasVPOPCNTDQ()) {
1972 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1974 }
1975
1976 // Extract subvector is special because the value type
1977 // (result) is 256-bit but the source is 512-bit wide.
1978 // 128-bit was made Legal under AVX1.
1979 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1980 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1982
1983 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1984 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1994 }
1995 setF16Action(MVT::v32f16, Expand);
2000 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2001 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2002
2003 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2008 }
2009 if (HasBWI) {
2010 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2013 }
2014 } else {
2015 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2016 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2017 }
2018
2019 if (Subtarget.hasVBMI2()) {
2020 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2023 }
2024
2025 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2026 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2027 }
2028 }// useAVX512Regs
2029
2030 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2031 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2032 MVT::v4i64}) {
2035 }
2036 }
2037
2038 // This block controls legalization for operations that don't have
2039 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2040 // narrower widths.
2041 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2042 // These operations are handled on non-VLX by artificially widening in
2043 // isel patterns.
2044
2048
2049 if (Subtarget.hasDQI()) {
2050 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2051 // v2f32 UINT_TO_FP is already custom under SSE2.
2054 "Unexpected operation action!");
2055 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2060 }
2061
2062 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2068 }
2069
2070 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2073 }
2074
2075 // Custom legalize 2x32 to get a little better code.
2078
2079 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2080 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2082
2083 if (Subtarget.hasDQI()) {
2087 setOperationAction(Opc, MVT::v2i64, Custom);
2088 setOperationAction(Opc, MVT::v4i64, Custom);
2089 }
2090 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2091 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2092 }
2093
2094 if (Subtarget.hasCDI()) {
2095 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2097 }
2098 } // Subtarget.hasCDI()
2099
2100 if (Subtarget.hasVPOPCNTDQ()) {
2101 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2103 }
2104 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2105 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2107 }
2108
2109 // This block controls legalization of v32i1/v64i1, which are available with
2110 // AVX512BW.
2111 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2112 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2113 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2114
2115 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2126 }
2127
2128 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2130
2131 // Extends from v32i1 masks to 256-bit vectors.
2135
2136 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2137 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2138 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2139 }
2140
2141 // These operations are handled on non-VLX by artificially widening in
2142 // isel patterns.
2143 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2144
2145 if (Subtarget.hasBITALG()) {
2146 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2148 }
2149 }
2150
2151 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2152 auto setGroup = [&] (MVT VT) {
2163
2176
2178
2181
2187
2193
2197 };
2198
2199 // AVX512_FP16 scalar operations
2200 setGroup(MVT::f16);
2214
2217
2218 if (Subtarget.useAVX512Regs()) {
2219 setGroup(MVT::v32f16);
2225 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2232
2237 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2239 MVT::v32i16);
2240 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2242 MVT::v32i16);
2243 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2245 MVT::v32i16);
2246 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2248 MVT::v32i16);
2249
2253
2254 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2255 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2256 }
2257
2258 if (Subtarget.hasVLX()) {
2259 setGroup(MVT::v8f16);
2260 setGroup(MVT::v16f16);
2261
2272
2283
2284 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2287
2291
2292 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2293 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2294 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2295 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2296
2297 // Need to custom widen these to prevent scalarization.
2298 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2299 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2300 }
2301 }
2302
2303 if (!Subtarget.useSoftFloat() &&
2304 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2305 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2306 : &X86::VR128RegClass);
2307 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2308 : &X86::VR256RegClass);
2309 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2310 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2311 // Set the operation action Custom to do the customization later.
2314 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2315 setF16Action(VT, Expand);
2320 }
2321 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2322 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2323 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2324 }
2326 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2327 }
2328
2329 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2330 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2331 setF16Action(MVT::v32bf16, Expand);
2332 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2333 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2335 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2339 }
2340
2341 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2342 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2343 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2344 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2345 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2346 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2347
2348 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2349 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2350 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2351 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2352 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2353
2354 if (Subtarget.hasBWI()) {
2355 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2356 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2357 }
2358
2359 if (Subtarget.hasFP16()) {
2360 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2369 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2378 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2383 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2388 }
2389 }
2390
2391 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2392 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2393 }
2394
2395 // We want to custom lower some of our intrinsics.
2399 if (!Subtarget.is64Bit()) {
2401 }
2402
2403 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2404 // handle type legalization for these operations here.
2405 //
2406 // FIXME: We really should do custom legalization for addition and
2407 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2408 // than generic legalization for 64-bit multiplication-with-overflow, though.
2409 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2410 if (VT == MVT::i64 && !Subtarget.is64Bit())
2411 continue;
2412 // Add/Sub/Mul with overflow operations are custom lowered.
2419
2420 // Support carry in as value rather than glue.
2426 }
2427
2428 if (!Subtarget.is64Bit()) {
2429 // These libcalls are not available in 32-bit.
2430 setLibcallName(RTLIB::SHL_I128, nullptr);
2431 setLibcallName(RTLIB::SRL_I128, nullptr);
2432 setLibcallName(RTLIB::SRA_I128, nullptr);
2433 setLibcallName(RTLIB::MUL_I128, nullptr);
2434 // The MULO libcall is not part of libgcc, only compiler-rt.
2435 setLibcallName(RTLIB::MULO_I64, nullptr);
2436 }
2437 // The MULO libcall is not part of libgcc, only compiler-rt.
2438 setLibcallName(RTLIB::MULO_I128, nullptr);
2439
2440 // Combine sin / cos into _sincos_stret if it is available.
2441 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2442 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2445 }
2446
2447 if (Subtarget.isTargetWin64()) {
2448 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2449 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2450 setOperationAction(ISD::SREM, MVT::i128, Custom);
2451 setOperationAction(ISD::UREM, MVT::i128, Custom);
2460 }
2461
2462 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2463 // is. We should promote the value to 64-bits to solve this.
2464 // This is what the CRT headers do - `fmodf` is an inline header
2465 // function casting to f64 and calling `fmod`.
2466 if (Subtarget.is32Bit() &&
2467 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2468 for (ISD::NodeType Op :
2478 if (isOperationExpand(Op, MVT::f32))
2479 setOperationAction(Op, MVT::f32, Promote);
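  // Resulting lowering (hypothetical example): on these targets a call such as
  // fmodf(x, y) is promoted so it is effectively computed as
  //   (float)fmod((double)x, (double)y)
  // mirroring what the CRT's inline fmodf wrapper does.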
2480
2481 // We have target-specific dag combine patterns for the following nodes:
2492 ISD::SHL,
2493 ISD::SRA,
2494 ISD::SRL,
2495 ISD::OR,
2496 ISD::AND,
2498 ISD::ADD,
2499 ISD::FADD,
2500 ISD::FSUB,
2501 ISD::FNEG,
2502 ISD::FMA,
2506 ISD::SUB,
2507 ISD::LOAD,
2508 ISD::LRINT,
2510 ISD::MLOAD,
2511 ISD::STORE,
2525 ISD::SETCC,
2526 ISD::MUL,
2527 ISD::XOR,
2535
2537
2538 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2540 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2542 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2544
2545 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2546  // that needs to be benchmarked and balanced with the potential use of vector
2547 // load/store types (PR33329, PR33914).
2550
2551 // Default loop alignment, which can be overridden by -align-loops.
2553
2554 // An out-of-order CPU can speculatively execute past a predictable branch,
2555 // but a conditional move could be stalled by an expensive earlier operation.
2556 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2557 EnableExtLdPromotion = true;
2559
2561
2562 // Default to having -disable-strictnode-mutation on
2563 IsStrictFPEnabled = true;
2564}
2565
2566// This has so far only been implemented for 64-bit MachO.
2568 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2569}
2570
2572 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2573 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2574}
2575
2577 const SDLoc &DL) const {
2578 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2579 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2580 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2581 return SDValue(Node, 0);
2582}
2583
2586 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2587 !Subtarget.hasBWI())
2588 return TypeSplitVector;
2589
2590 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2591 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2592 return TypeSplitVector;
2593
2594 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2595 VT.getVectorElementType() != MVT::i1)
2596 return TypeWidenVector;
2597
2599}
2600
2601FastISel *
2603 const TargetLibraryInfo *libInfo) const {
2604 return X86::createFastISel(funcInfo, libInfo);
2605}
2606
2607//===----------------------------------------------------------------------===//
2608// Other Lowering Hooks
2609//===----------------------------------------------------------------------===//
2610
2612 bool AssumeSingleUse) {
2613 if (!AssumeSingleUse && !Op.hasOneUse())
2614 return false;
2615 if (!ISD::isNormalLoad(Op.getNode()))
2616 return false;
2617
2618 // If this is an unaligned vector, make sure the target supports folding it.
2619 auto *Ld = cast<LoadSDNode>(Op.getNode());
2620 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2621 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2622 return false;
2623
2624 // TODO: If this is a non-temporal load and the target has an instruction
2625 // for it, it should not be folded. See "useNonTemporalLoad()".
2626
2627 return true;
2628}
2629
2631 const X86Subtarget &Subtarget,
2632 bool AssumeSingleUse) {
2633 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2634 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2635 return false;
2636
2637  // We cannot replace a wide volatile load with a broadcast-from-memory,
2638 // because that would narrow the load, which isn't legal for volatiles.
2639 auto *Ld = cast<LoadSDNode>(Op.getNode());
2640 return !Ld->isVolatile() ||
2641 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2642}
2643
2645 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2646}
2647
2649 if (Op.hasOneUse()) {
2650 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2651 return (ISD::ZERO_EXTEND == Opcode);
2652 }
2653 return false;
2654}
2655
2656static bool isLogicOp(unsigned Opcode) {
2657 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2658 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2659}
2660
2661static bool isTargetShuffle(unsigned Opcode) {
2662 switch(Opcode) {
2663 default: return false;
2664 case X86ISD::BLENDI:
2665 case X86ISD::PSHUFB:
2666 case X86ISD::PSHUFD:
2667 case X86ISD::PSHUFHW:
2668 case X86ISD::PSHUFLW:
2669 case X86ISD::SHUFP:
2670 case X86ISD::INSERTPS:
2671 case X86ISD::EXTRQI:
2672 case X86ISD::INSERTQI:
2673 case X86ISD::VALIGN:
2674 case X86ISD::PALIGNR:
2675 case X86ISD::VSHLDQ:
2676 case X86ISD::VSRLDQ:
2677 case X86ISD::MOVLHPS:
2678 case X86ISD::MOVHLPS:
2679 case X86ISD::MOVSHDUP:
2680 case X86ISD::MOVSLDUP:
2681 case X86ISD::MOVDDUP:
2682 case X86ISD::MOVSS:
2683 case X86ISD::MOVSD:
2684 case X86ISD::MOVSH:
2685 case X86ISD::UNPCKL:
2686 case X86ISD::UNPCKH:
2687 case X86ISD::VBROADCAST:
2688 case X86ISD::VPERMILPI:
2689 case X86ISD::VPERMILPV:
2690 case X86ISD::VPERM2X128:
2691 case X86ISD::SHUF128:
2692 case X86ISD::VPERMIL2:
2693 case X86ISD::VPERMI:
2694 case X86ISD::VPPERM:
2695 case X86ISD::VPERMV:
2696 case X86ISD::VPERMV3:
2697 case X86ISD::VZEXT_MOVL:
2698 return true;
2699 }
2700}
2701
2702static bool isTargetShuffleVariableMask(unsigned Opcode) {
2703 switch (Opcode) {
2704 default: return false;
2705 // Target Shuffles.
2706 case X86ISD::PSHUFB:
2707 case X86ISD::VPERMILPV:
2708 case X86ISD::VPERMIL2:
2709 case X86ISD::VPPERM:
2710 case X86ISD::VPERMV:
2711 case X86ISD::VPERMV3:
2712 return true;
2713 // 'Faux' Target Shuffles.
2714 case ISD::OR:
2715 case ISD::AND:
2716 case X86ISD::ANDNP:
2717 return true;
2718 }
2719}
2720
2723 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2725 int ReturnAddrIndex = FuncInfo->getRAIndex();
2726
2727 if (ReturnAddrIndex == 0) {
2728 // Set up a frame object for the return address.
2729 unsigned SlotSize = RegInfo->getSlotSize();
2730 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2731 -(int64_t)SlotSize,
2732 false);
2733 FuncInfo->setRAIndex(ReturnAddrIndex);
2734 }
2735
2736 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2737}
2738
2740 bool HasSymbolicDisplacement) {
2741  // The offset should fit into a 32-bit immediate field.
2742 if (!isInt<32>(Offset))
2743 return false;
2744
2745  // If we don't have a symbolic displacement, we don't have any extra
2746 // restrictions.
2747 if (!HasSymbolicDisplacement)
2748 return true;
2749
2750 // We can fold large offsets in the large code model because we always use
2751 // 64-bit offsets.
2752 if (CM == CodeModel::Large)
2753 return true;
2754
2755  // For the kernel code model we know that all objects reside in the negative
2756  // half of the 32-bit address space. We may not accept negative offsets, since
2757  // they may be just out of range, but we may accept pretty large positive ones.
2758 if (CM == CodeModel::Kernel)
2759 return Offset >= 0;
2760
2761  // For other non-large code models we assume that the last small object is 16MB
2762  // before the end of the 31-bit boundary. We may also accept pretty large
2763  // negative offsets, knowing that all objects are in the positive half of the
2764  // address space.
2765 return Offset < 16 * 1024 * 1024;
2766}
2767
2768 /// Return true if the condition is a signed comparison operation.
2769static bool isX86CCSigned(unsigned X86CC) {
2770 switch (X86CC) {
2771 default:
2772 llvm_unreachable("Invalid integer condition!");
2773 case X86::COND_E:
2774 case X86::COND_NE:
2775 case X86::COND_B:
2776 case X86::COND_A:
2777 case X86::COND_BE:
2778 case X86::COND_AE:
2779 return false;
2780 case X86::COND_G:
2781 case X86::COND_GE:
2782 case X86::COND_L:
2783 case X86::COND_LE:
2784 return true;
2785 }
2786}
2787
2789 switch (SetCCOpcode) {
2790 // clang-format off
2791 default: llvm_unreachable("Invalid integer condition!");
2792 case ISD::SETEQ: return X86::COND_E;
2793 case ISD::SETGT: return X86::COND_G;
2794 case ISD::SETGE: return X86::COND_GE;
2795 case ISD::SETLT: return X86::COND_L;
2796 case ISD::SETLE: return X86::COND_LE;
2797 case ISD::SETNE: return X86::COND_NE;
2798 case ISD::SETULT: return X86::COND_B;
2799 case ISD::SETUGT: return X86::COND_A;
2800 case ISD::SETULE: return X86::COND_BE;
2801 case ISD::SETUGE: return X86::COND_AE;
2802 // clang-format on
2803 }
2804}
2805
2806 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2807/// condition code, returning the condition code and the LHS/RHS of the
2808/// comparison to make.
2810 bool isFP, SDValue &LHS, SDValue &RHS,
2811 SelectionDAG &DAG) {
2812 if (!isFP) {
2813 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2814 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2815 // X > -1 -> X == 0, jump !sign.
2816 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2817 return X86::COND_NS;
2818 }
2819 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2820 // X < 0 -> X == 0, jump on sign.
2821 return X86::COND_S;
2822 }
2823 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2824 // X >= 0 -> X == 0, jump on !sign.
2825 return X86::COND_NS;
2826 }
2827 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2828 // X < 1 -> X <= 0
2829 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2830 return X86::COND_LE;
2831 }
2832 }
2833
2834 return TranslateIntegerX86CC(SetCCOpcode);
2835 }
2836
2837 // First determine if it is required or is profitable to flip the operands.
2838
2839 // If LHS is a foldable load, but RHS is not, flip the condition.
2840 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2841 !ISD::isNON_EXTLoad(RHS.getNode())) {
2842 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2843 std::swap(LHS, RHS);
2844 }
2845
2846 switch (SetCCOpcode) {
2847 default: break;
2848 case ISD::SETOLT:
2849 case ISD::SETOLE:
2850 case ISD::SETUGT:
2851 case ISD::SETUGE:
2852 std::swap(LHS, RHS);
2853 break;
2854 }
2855
2856 // On a floating point condition, the flags are set as follows:
2857 // ZF PF CF op
2858 // 0 | 0 | 0 | X > Y
2859 // 0 | 0 | 1 | X < Y
2860 // 1 | 0 | 0 | X == Y
2861 // 1 | 1 | 1 | unordered
2862 switch (SetCCOpcode) {
2863 // clang-format off
2864 default: llvm_unreachable("Condcode should be pre-legalized away");
2865 case ISD::SETUEQ:
2866 case ISD::SETEQ: return X86::COND_E;
2867 case ISD::SETOLT: // flipped
2868 case ISD::SETOGT:
2869 case ISD::SETGT: return X86::COND_A;
2870 case ISD::SETOLE: // flipped
2871 case ISD::SETOGE:
2872 case ISD::SETGE: return X86::COND_AE;
2873 case ISD::SETUGT: // flipped
2874 case ISD::SETULT:
2875 case ISD::SETLT: return X86::COND_B;
2876 case ISD::SETUGE: // flipped
2877 case ISD::SETULE:
2878 case ISD::SETLE: return X86::COND_BE;
2879 case ISD::SETONE:
2880 case ISD::SETNE: return X86::COND_NE;
2881 case ISD::SETUO: return X86::COND_P;
2882 case ISD::SETO: return X86::COND_NP;
2883 case ISD::SETOEQ:
2884 case ISD::SETUNE: return X86::COND_INVALID;
2885 // clang-format on
2886 }
2887}
2888
2889/// Is there a floating point cmov for the specific X86 condition code?
2890 /// The current x86 ISA includes the following FP cmov instructions:
2891 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
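/// There is no FP cmov for signed conditions such as COND_G or COND_L, so this
/// helper returns false for them.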
2892static bool hasFPCMov(unsigned X86CC) {
2893 switch (X86CC) {
2894 default:
2895 return false;
2896 case X86::COND_B:
2897 case X86::COND_BE:
2898 case X86::COND_E:
2899 case X86::COND_P:
2900 case X86::COND_A:
2901 case X86::COND_AE:
2902 case X86::COND_NE:
2903 case X86::COND_NP:
2904 return true;
2905 }
2906}
2907
2908static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2909 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2910 VT.is512BitVector();
2911}
2912
2914 const CallInst &I,
2915 MachineFunction &MF,
2916 unsigned Intrinsic) const {
2918 Info.offset = 0;
2919
2920 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2921 if (!IntrData) {
2922 switch (Intrinsic) {
2923 case Intrinsic::x86_aesenc128kl:
2924 case Intrinsic::x86_aesdec128kl:
2926 Info.ptrVal = I.getArgOperand(1);
2927 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2928 Info.align = Align(1);
2930 return true;
2931 case Intrinsic::x86_aesenc256kl:
2932 case Intrinsic::x86_aesdec256kl:
2934 Info.ptrVal = I.getArgOperand(1);
2935 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2936 Info.align = Align(1);
2938 return true;
2939 case Intrinsic::x86_aesencwide128kl:
2940 case Intrinsic::x86_aesdecwide128kl:
2942 Info.ptrVal = I.getArgOperand(0);
2943 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2944 Info.align = Align(1);
2946 return true;
2947 case Intrinsic::x86_aesencwide256kl:
2948 case Intrinsic::x86_aesdecwide256kl:
2950 Info.ptrVal = I.getArgOperand(0);
2951 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2952 Info.align = Align(1);
2954 return true;
2955 case Intrinsic::x86_cmpccxadd32:
2956 case Intrinsic::x86_cmpccxadd64:
2957 case Intrinsic::x86_atomic_bts:
2958 case Intrinsic::x86_atomic_btc:
2959 case Intrinsic::x86_atomic_btr: {
2961 Info.ptrVal = I.getArgOperand(0);
2962 unsigned Size = I.getType()->getScalarSizeInBits();
2963 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2964 Info.align = Align(Size);
2967 return true;
2968 }
2969 case Intrinsic::x86_atomic_bts_rm:
2970 case Intrinsic::x86_atomic_btc_rm:
2971 case Intrinsic::x86_atomic_btr_rm: {
2973 Info.ptrVal = I.getArgOperand(0);
2974 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2975 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2976 Info.align = Align(Size);
2979 return true;
2980 }
2981 case Intrinsic::x86_aadd32:
2982 case Intrinsic::x86_aadd64:
2983 case Intrinsic::x86_aand32:
2984 case Intrinsic::x86_aand64:
2985 case Intrinsic::x86_aor32:
2986 case Intrinsic::x86_aor64:
2987 case Intrinsic::x86_axor32:
2988 case Intrinsic::x86_axor64:
2989 case Intrinsic::x86_atomic_add_cc:
2990 case Intrinsic::x86_atomic_sub_cc:
2991 case Intrinsic::x86_atomic_or_cc:
2992 case Intrinsic::x86_atomic_and_cc:
2993 case Intrinsic::x86_atomic_xor_cc: {
2995 Info.ptrVal = I.getArgOperand(0);
2996 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2997 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2998 Info.align = Align(Size);
3001 return true;
3002 }
3003 }
3004 return false;
3005 }
3006
3007 switch (IntrData->Type) {
3010 case TRUNCATE_TO_MEM_VI32: {
3012 Info.ptrVal = I.getArgOperand(0);
3013 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3015 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3016 ScalarVT = MVT::i8;
3017 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3018 ScalarVT = MVT::i16;
3019 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3020 ScalarVT = MVT::i32;
3021
3022 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3023 Info.align = Align(1);
3025 break;
3026 }
3027 case GATHER:
3028 case GATHER_AVX2: {
3030 Info.ptrVal = nullptr;
3031 MVT DataVT = MVT::getVT(I.getType());
3032 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3033 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3034 IndexVT.getVectorNumElements());
3035 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3036 Info.align = Align(1);
3038 break;
3039 }
3040 case SCATTER: {
3042 Info.ptrVal = nullptr;
3043 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3044 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3045 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3046 IndexVT.getVectorNumElements());
3047 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3048 Info.align = Align(1);
3050 break;
3051 }
3052 default:
3053 return false;
3054 }
3055
3056 return true;
3057}
3058
3059/// Returns true if the target can instruction select the
3060/// specified FP immediate natively. If false, the legalizer will
3061/// materialize the FP immediate as a load from a constant pool.
3063 bool ForCodeSize) const {
3064 for (const APFloat &FPImm : LegalFPImmediates)
3065 if (Imm.bitwiseIsEqual(FPImm))
3066 return true;
3067 return false;
3068}
3069
3071 ISD::LoadExtType ExtTy,
3072 EVT NewVT) const {
3073 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3074
3075 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3076  // relocation targets a movq or addq instruction: don't let the load shrink.
3077 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3078 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3079 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3080 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3081
3082  // If this is a (1) AVX vector load with (2) multiple uses and (3) all of
3083 // those uses are extracted directly into a store, then the extract + store
3084 // can be store-folded. Therefore, it's probably not worth splitting the load.
3085 EVT VT = Load->getValueType(0);
3086 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3087 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3088 // Skip uses of the chain value. Result 0 of the node is the load value.
3089 if (UI.getUse().getResNo() != 0)
3090 continue;
3091
3092 // If this use is not an extract + store, it's probably worth splitting.
3093 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3094 UI->use_begin()->getOpcode() != ISD::STORE)
3095 return true;
3096 }
3097 // All non-chain uses are extract + store.
3098 return false;
3099 }
3100
3101 return true;
3102}
3103
3104/// Returns true if it is beneficial to convert a load of a constant
3105/// to just the constant itself.
3107 Type *Ty) const {
3108 assert(Ty->isIntegerTy());
3109
3110 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3111 if (BitSize == 0 || BitSize > 64)
3112 return false;
3113 return true;
3114}
3115
3117 // If we are using XMM registers in the ABI and the condition of the select is
3118 // a floating-point compare and we have blendv or conditional move, then it is
3119 // cheaper to select instead of doing a cross-register move and creating a
3120 // load that depends on the compare result.
3121 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3122 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3123}
3124
3126 // TODO: It might be a win to ease or lift this restriction, but the generic
3127 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3128 if (VT.isVector() && Subtarget.hasAVX512())
3129 return false;
3130
3131 return true;
3132}
3133
3135 SDValue C) const {
3136 // TODO: We handle scalars using custom code, but generic combining could make
3137 // that unnecessary.
3138 APInt MulC;
3139 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3140 return false;
3141
3142  // Find the type this will be legalized to. Otherwise we might prematurely
3143 // convert this to shl+add/sub and then still have to type legalize those ops.
3144 // Another choice would be to defer the decision for illegal types until
3145 // after type legalization. But constant splat vectors of i64 can't make it
3146 // through type legalization on 32-bit targets so we would need to special
3147 // case vXi64.
3148 while (getTypeAction(Context, VT) != TypeLegal)
3149 VT = getTypeToTransformTo(Context, VT);
3150
3151 // If vector multiply is legal, assume that's faster than shl + add/sub.
3152  // Multiply is a complex op with higher latency and lower throughput in
3153  // most implementations: sub-vXi32 vector multiplies are always fast,
3154  // vXi32 is only fast when the target has no slow PMULLD, and anything larger
3155  // (vXi64) is always going to be slow.
3156 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3157 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3158 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3159 return false;
3160
3161 // shl+add, shl+sub, shl+add+neg
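  // e.g. x*9 -> (x<<3)+x, x*7 -> (x<<3)-x, x*-3 -> x-(x<<2), x*-9 -> -((x<<3)+x).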
3162 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3163 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3164}
3165
3167 unsigned Index) const {
3169 return false;
3170
3171 // Mask vectors support all subregister combinations and operations that
3172  // extract half of a vector.
3173 if (ResVT.getVectorElementType() == MVT::i1)
3174 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3175 (Index == ResVT.getVectorNumElements()));
3176
3177 return (Index % ResVT.getVectorNumElements()) == 0;
3178}
3179
3181 unsigned Opc = VecOp.getOpcode();
3182
3183 // Assume target opcodes can't be scalarized.
3184 // TODO - do we have any exceptions?
3185 if (Opc >= ISD::BUILTIN_OP_END)
3186 return false;
3187
3188 // If the vector op is not supported, try to convert to scalar.
3189 EVT VecVT = VecOp.getValueType();
3190 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3191 return true;
3192
3193 // If the vector op is supported, but the scalar op is not, the transform may
3194 // not be worthwhile.
3195 EVT ScalarVT = VecVT.getScalarType();
3196 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3197}
3198
3200 bool) const {
3201 // TODO: Allow vectors?
3202 if (VT.isVector())
3203 return false;
3204 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3205}
3206
3208 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3209 return Subtarget.hasBMI() ||
3210 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3211}
3212
3214 // Speculate ctlz only if we can directly use LZCNT.
3215 return Subtarget.hasLZCNT();
3216}
3217
3219 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3220 // expensive than a straight movsd. On the other hand, it's important to
3221 // shrink long double fp constant since fldt is very slow.
3222 return !Subtarget.hasSSE2() || VT == MVT::f80;
3223}
3224
3226 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3227 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3228}
3229
3231 const SelectionDAG &DAG,
3232 const MachineMemOperand &MMO) const {
3233 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3234 BitcastVT.getVectorElementType() == MVT::i1)
3235 return false;
3236
3237 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3238 return false;
3239
3240 // If both types are legal vectors, it's always ok to convert them.
3241 if (LoadVT.isVector() && BitcastVT.isVector() &&
3242 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3243 return true;
3244
3245 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3246}
3247
3249 const MachineFunction &MF) const {
3250  // Do not merge to float value size (128 bits) if no implicit
3251 // float attribute is set.
3252 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3253
3254 if (NoFloat) {
3255 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3256 return (MemVT.getSizeInBits() <= MaxIntSize);
3257 }
3258 // Make sure we don't merge greater than our preferred vector
3259 // width.
3260 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3261 return false;
3262
3263 return true;
3264}
3265
3267 return Subtarget.hasFastLZCNT();
3268}
3269
3271 const Instruction &AndI) const {
3272 return true;
3273}
3274
3276 EVT VT = Y.getValueType();
3277
3278 if (VT.isVector())
3279 return false;
3280
3281 if (!Subtarget.hasBMI())
3282 return false;
3283
3284 // There are only 32-bit and 64-bit forms for 'andn'.
3285 if (VT != MVT::i32 && VT != MVT::i64)
3286 return false;
3287
3288 return !isa<ConstantSDNode>(Y);
3289}
3290
3292 EVT VT = Y.getValueType();
3293
3294 if (!VT.isVector())
3295 return hasAndNotCompare(Y);
3296
3297 // Vector.
3298
3299 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3300 return false;
3301
3302 if (VT == MVT::v4i32)
3303 return true;
3304
3305 return Subtarget.hasSSE2();
3306}
3307
3309 return X.getValueType().isScalarInteger(); // 'bt'
3310}
3311
3315 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3316 SelectionDAG &DAG) const {
3317  // Does the baseline recommend not performing the fold by default?
3319 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3320 return false;
3321 // For scalars this transform is always beneficial.
3322 if (X.getValueType().isScalarInteger())
3323 return true;
3324  // If all the shift amounts are identical, then the transform is beneficial even
3325 // with rudimentary SSE2 shifts.
3326 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3327 return true;
3328  // If we have AVX2 with its powerful shift operations, then it's also good.
3329 if (Subtarget.hasAVX2())
3330 return true;
3331  // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3332 return NewShiftOpcode == ISD::SHL;
3333}
3334
3336 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3337 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3338 if (!VT.isInteger())
3339 return ShiftOpc;
3340
3341 bool PreferRotate = false;
3342 if (VT.isVector()) {
3343    // For vectors, if we have rotate instruction support, then it's definitely
3344    // best. Otherwise it's not clear which is best, so just don't make changes.
3345 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3346 VT.getScalarType() == MVT::i64);
3347 } else {
3348    // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3349    // rotate unless we have a zext mask+shr.
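    // For example, for i32 with ShiftOrRotateAmt == 24 the remaining mask is
    // 8 bits, which zero-extension handles for free, so we do not prefer a
    // rotate in that case.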
3350 PreferRotate = Subtarget.hasBMI2();
3351 if (!PreferRotate) {
3352 unsigned MaskBits =
3353 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3354 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3355 }
3356 }
3357
3358 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3359 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3360
3361 if (PreferRotate && MayTransformRotate)
3362 return ISD::ROTL;
3363
3364    // For vectors we don't really get much benefit from swapping around constants.
3365 // Maybe we could check if the DAG has the flipped node already in the
3366 // future.
3367 if (VT.isVector())
3368 return ShiftOpc;
3369
3370    // See if it is beneficial to swap the shift type.
3371 if (ShiftOpc == ISD::SHL) {
3372 // If the current setup has imm64 mask, then inverse will have
3373 // at least imm32 mask (or be zext i32 -> i64).
3374 if (VT == MVT::i64)
3375 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3376 : ShiftOpc;
3377
3378      // We can only benefit if the mask requires at least 7 bits. We
3379      // don't want to replace shl by 1, 2 or 3, as those can be implemented
3380      // with lea/add.
3381 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3382 }
3383
3384 if (VT == MVT::i64)
3385 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3386 // extremely efficient.
3387 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3388
3389 // Keep small shifts as shl so we can generate add/lea.
3390 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3391 }
3392
3393  // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3394 // (PreferRotate will be set in the latter case).
3395 if (PreferRotate || VT.isVector())
3396 return ShiftOpc;
3397
3398 // Non-vector type and we have a zext mask with SRL.
3399 return ISD::SRL;
3400}
3401
3404 const Value *Lhs,
3405 const Value *Rhs) const {
3406 using namespace llvm::PatternMatch;
3407 int BaseCost = BrMergingBaseCostThresh.getValue();
3408 // a == b && a == c is a fast pattern on x86.
3410 if (BaseCost >= 0 && Opc == Instruction::And &&
3411 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3412 Pred == ICmpInst::ICMP_EQ &&
3413 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3414 Pred == ICmpInst::ICMP_EQ)
3415 BaseCost += 1;
3416 return {BaseCost, BrMergingLikelyBias.getValue(),
3417 BrMergingUnlikelyBias.getValue()};
3418}
3419
3421 return N->getOpcode() != ISD::FP_EXTEND;
3422}
3423
3425 const SDNode *N, CombineLevel Level) const {
3426 assert(((N->getOpcode() == ISD::SHL &&
3427 N->getOperand(0).getOpcode() == ISD::SRL) ||
3428 (N->getOpcode() == ISD::SRL &&
3429 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3430 "Expected shift-shift mask");
3431 // TODO: Should we always create i64 masks? Or only folded immediates?
3432 EVT VT = N->getValueType(0);
3433 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3434 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3435 // Only fold if the shift values are equal - so it folds to AND.
3436 // TODO - we should fold if either is a non-uniform vector but we don't do
3437 // the fold for non-splats yet.
3438 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3439 }
3441}
3442
3444 EVT VT = Y.getValueType();
3445
3446 // For vectors, we don't have a preference, but we probably want a mask.
3447 if (VT.isVector())
3448 return false;
3449
3450 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3451 if (VT == MVT::i64 && !Subtarget.is64Bit())
3452 return false;
3453
3454 return true;
3455}
3456
3459 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3461 !Subtarget.isOSWindows())
3464 ExpansionFactor);
3465}
3466
3468 // Any legal vector type can be splatted more efficiently than
3469 // loading/spilling from memory.
3470 return isTypeLegal(VT);
3471}
3472
3474 MVT VT = MVT::getIntegerVT(NumBits);
3475 if (isTypeLegal(VT))
3476 return VT;
3477
3478 // PMOVMSKB can handle this.
3479 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3480 return MVT::v16i8;
3481
3482 // VPMOVMSKB can handle this.
3483 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3484 return MVT::v32i8;
3485
3486 // TODO: Allow 64-bit type for 32-bit target.
3487 // TODO: 512-bit types should be allowed, but make sure that those
3488 // cases are handled in combineVectorSizedSetCCEquality().
3489
3491}
3492
3493/// Val is the undef sentinel value or equal to the specified value.
3494static bool isUndefOrEqual(int Val, int CmpVal) {
3495 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3496}
3497
3498/// Return true if every element in Mask is the undef sentinel value or equal to
3499/// the specified value.
3500static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3501 return llvm::all_of(Mask, [CmpVal](int M) {
3502 return (M == SM_SentinelUndef) || (M == CmpVal);
3503 });
3504}
3505
3506/// Return true if every element in Mask, beginning from position Pos and ending
3507/// in Pos+Size is the undef sentinel value or equal to the specified value.
3508static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3509 unsigned Size) {
3510 return llvm::all_of(Mask.slice(Pos, Size),
3511 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3512}
3513
3514/// Val is either the undef or zero sentinel value.
3515static bool isUndefOrZero(int Val) {
3516 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3517}
3518
3519/// Return true if every element in Mask, beginning from position Pos and ending
3520/// in Pos+Size is the undef sentinel value.
3521static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3522 return llvm::all_of(Mask.slice(Pos, Size),
3523 [](int M) { return M == SM_SentinelUndef; });
3524}
3525
3526/// Return true if the mask creates a vector whose lower half is undefined.
3528 unsigned NumElts = Mask.size();
3529 return isUndefInRange(Mask, 0, NumElts / 2);
3530}
3531
3532/// Return true if the mask creates a vector whose upper half is undefined.
3534 unsigned NumElts = Mask.size();
3535 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3536}
3537
3539 /// Return true if Val falls within the specified range [Low, Hi).
3539static bool isInRange(int Val, int Low, int Hi) {
3540 return (Val >= Low && Val < Hi);
3541}
3542
3543/// Return true if the value of any element in Mask falls within the specified
3544 /// range [Low, Hi).
3545static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3546 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3547}
3548
3549/// Return true if the value of any element in Mask is the zero sentinel value.
3550static bool isAnyZero(ArrayRef<int> Mask) {
3551 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3552}
3553
3554/// Return true if Val is undef or if its value falls within the
3555 /// specified range [Low, Hi).
3556static bool isUndefOrInRange(int Val, int Low, int Hi) {
3557 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3558}
3559
3560/// Return true if every element in Mask is undef or if its value
3561 /// falls within the specified range [Low, Hi).
3562static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3563 return llvm::all_of(
3564 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3565}
3566
3567/// Return true if Val is undef, zero or if its value falls within the
3568 /// specified range [Low, Hi).
3569static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3570 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3571}
3572
3573/// Return true if every element in Mask is undef, zero or if its value
3574 /// falls within the specified range [Low, Hi).
3575static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3576 return llvm::all_of(
3577 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3578}
3579
3580 /// Return true if every element in Mask is an in-place blend/select mask or is
3581/// undef.
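/// For example, with 4 elements, {0, 5, 2, 7} is an in-place blend mask, while
/// {1, 5, 2, 7} is not.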
3583 unsigned NumElts = Mask.size();
3584 for (auto [I, M] : enumerate(Mask))
3585 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3586 return false;
3587 return true;
3588}
3589
3590/// Return true if every element in Mask, beginning
3591/// from position Pos and ending in Pos + Size, falls within the specified
3592/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
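/// For example, Mask = {4, -1, 6, 7} matches Pos = 0, Size = 4, Low = 4 with
/// the default Step of 1.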
3593static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3594 unsigned Size, int Low, int Step = 1) {
3595 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3596 if (!isUndefOrEqual(Mask[i], Low))
3597 return false;
3598 return true;
3599}
3600
3601/// Return true if every element in Mask, beginning
3602/// from position Pos and ending in Pos+Size, falls within the specified
3603 /// sequential range [Low, Low+Size), or is undef or is zero.
3605 unsigned Size, int Low,
3606 int Step = 1) {
3607 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3608 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3609 return false;
3610 return true;
3611}
3612
3613/// Return true if every element in Mask, beginning
3614/// from position Pos and ending in Pos+Size is undef or is zero.
3615static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3616 unsigned Size) {
3617 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3618}
3619
3620/// Return true if every element of a single input is referenced by the shuffle
3621 /// mask, i.e. it just permutes them all.
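/// For example, {3, 2, 1, 0} references every element of a 4-element input,
/// while {0, 0, 1, 1} does not.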
3623 unsigned NumElts = Mask.size();
3624 APInt DemandedElts = APInt::getZero(NumElts);
3625 for (int M : Mask)
3626 if (isInRange(M, 0, NumElts))
3627 DemandedElts.setBit(M);
3628 return DemandedElts.isAllOnes();
3629}
3630
3631/// Helper function to test whether a shuffle mask could be
3632/// simplified by widening the elements being shuffled.
3633///
3634/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3635/// leaves it in an unspecified state.
3636///
3637/// NOTE: This must handle normal vector shuffle masks and *target* vector
3638/// shuffle masks. The latter have the special property of a '-2' representing
3639/// a zero-ed lane of a vector.
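/// For example, the mask {0, 1, 6, 7} widens to {0, 3} and {-1, 3, -2, -2}
/// widens to {1, -2}, but {1, 2, 4, 5} cannot be widened.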
3641 SmallVectorImpl<int> &WidenedMask) {
3642 WidenedMask.assign(Mask.size() / 2, 0);
3643 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3644 int M0 = Mask[i];
3645 int M1 = Mask[i + 1];
3646
3647    // If both elements are undef, it's trivial.
3648 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3649 WidenedMask[i / 2] = SM_SentinelUndef;
3650 continue;
3651 }
3652
3653 // Check for an undef mask and a mask value properly aligned to fit with
3654 // a pair of values. If we find such a case, use the non-undef mask's value.
3655 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3656 WidenedMask[i / 2] = M1 / 2;
3657 continue;
3658 }
3659 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3660 WidenedMask[i / 2] = M0 / 2;
3661 continue;
3662 }
3663
3664 // When zeroing, we need to spread the zeroing across both lanes to widen.
3665 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3666 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3668 WidenedMask[i / 2] = SM_SentinelZero;
3669 continue;
3670 }
3671 return false;
3672 }
3673
3674 // Finally check if the two mask values are adjacent and aligned with
3675 // a pair.
3676 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3677 WidenedMask[i / 2] = M0 / 2;
3678 continue;
3679 }
3680
3681 // Otherwise we can't safely widen the elements used in this shuffle.
3682 return false;
3683 }
3684 assert(WidenedMask.size() == Mask.size() / 2 &&
3685 "Incorrect size of mask after widening the elements!");
3686
3687 return true;
3688}
3689
3691 const APInt &Zeroable,
3692 bool V2IsZero,
3693 SmallVectorImpl<int> &WidenedMask) {
3694 // Create an alternative mask with info about zeroable elements.
3695 // Here we do not set undef elements as zeroable.
3696 SmallVector<int, 64> ZeroableMask(Mask);
3697 if (V2IsZero) {
3698 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3699 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3700 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3701 ZeroableMask[i] = SM_SentinelZero;
3702 }
3703 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3704}
3705
3707 SmallVector<int, 32> WidenedMask;
3708 return canWidenShuffleElements(Mask, WidenedMask);
3709}
3710
3711// Attempt to narrow/widen shuffle mask until it matches the target number of
3712// elements.
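// For example, the 2-element mask {1, 0} scales up to the 4-element mask
// {2, 3, 0, 1}, and the 4-element mask {0, 1, 6, 7} scales down to {0, 3}.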
3713static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3714 SmallVectorImpl<int> &ScaledMask) {
3715 unsigned NumSrcElts = Mask.size();
3716 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3717 "Illegal shuffle scale factor");
3718
3719 // Narrowing is guaranteed to work.
3720 if (NumDstElts >= NumSrcElts) {
3721 int Scale = NumDstElts / NumSrcElts;
3722 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3723 return true;
3724 }
3725
3726 // We have to repeat the widening until we reach the target size, but we can
3727 // split out the first widening as it sets up ScaledMask for us.
3728 if (canWidenShuffleElements(Mask, ScaledMask)) {
3729 while (ScaledMask.size() > NumDstElts) {
3730 SmallVector<int, 16> WidenedMask;
3731 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3732 return false;
3733 ScaledMask = std::move(WidenedMask);
3734 }
3735 return true;
3736 }
3737
3738 return false;
3739}
3740
3741static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3742 SmallVector<int, 32> ScaledMask;
3743 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3744}
3745
3746/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3748 return isNullConstant(Elt) || isNullFPConstant(Elt);
3749}
3750
3751// Build a vector of constants.
3752// Use an UNDEF node if MaskElt == -1.
3753 // Split 64-bit constants in 32-bit mode.
3755 const SDLoc &dl, bool IsMask = false) {
3756
3758 bool Split = false;
3759
3760 MVT ConstVecVT = VT;
3761 unsigned NumElts = VT.getVectorNumElements();
3762 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3763 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3764 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3765 Split = true;
3766 }
3767
3768 MVT EltVT = ConstVecVT.getVectorElementType();
3769 for (unsigned i = 0; i < NumElts; ++i) {
3770 bool IsUndef = Values[i] < 0 && IsMask;
3771 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3772 DAG.getConstant(Values[i], dl, EltVT);
3773 Ops.push_back(OpNode);
3774 if (Split)
3775 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3776 DAG.getConstant(0, dl, EltVT));
3777 }
3778 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3779 if (Split)
3780 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3781 return ConstsNode;
3782}
3783
3784static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3785 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3786 assert(Bits.size() == Undefs.getBitWidth() &&
3787 "Unequal constant and undef arrays");
3789 bool Split = false;
3790
3791 MVT ConstVecVT = VT;
3792 unsigned NumElts = VT.getVectorNumElements();
3793 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3794 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3795 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3796 Split = true;
3797 }
3798
3799 MVT EltVT = ConstVecVT.getVectorElementType();
3800 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3801 if (Undefs[i]) {
3802 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3803 continue;
3804 }
3805 const APInt &V = Bits[i];
3806 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3807 if (Split) {
3808 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3809 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3810 } else if (EltVT == MVT::f32) {
3812 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3813 } else if (EltVT == MVT::f64) {
3815 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3816 } else {
3817 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3818 }
3819 }
3820
3821 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3822 return DAG.getBitcast(VT, ConstsNode);
3823}
3824
3826 SelectionDAG &DAG, const SDLoc &dl) {
3827 APInt Undefs = APInt::getZero(Bits.size());
3828 return getConstVector(Bits, Undefs, VT, DAG, dl);
3829}
3830
3831/// Returns a vector of specified type with all zero elements.
3832static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3833 SelectionDAG &DAG, const SDLoc &dl) {
3834 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3835 VT.getVectorElementType() == MVT::i1) &&
3836 "Unexpected vector type");
3837
3838 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3839 // type. This ensures they get CSE'd. But if the integer type is not
3840 // available, use a floating-point +0.0 instead.
3841 SDValue Vec;
3842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3843 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3844 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3845 } else if (VT.isFloatingPoint() &&
3847 Vec = DAG.getConstantFP(+0.0, dl, VT);
3848 } else if (VT.getVectorElementType() == MVT::i1) {
3849 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3850 "Unexpected vector type");
3851 Vec = DAG.getConstant(0, dl, VT);
3852 } else {
3853 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3854 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3855 }
3856 return DAG.getBitcast(VT, Vec);
3857}
3858
3859 // Helper to determine if the ops are all extracted subvectors that come from a
3860 // single source. If we allow commuting, they don't have to be in order (Lo/Hi).
3861static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3862 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3863 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3864 LHS.getValueType() != RHS.getValueType() ||
3865 LHS.getOperand(0) != RHS.getOperand(0))
3866 return SDValue();
3867
3868 SDValue Src = LHS.getOperand(0);
3869 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3870 return SDValue();
3871
3872 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3873 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3874 RHS.getConstantOperandAPInt(1) == NumElts) ||
3875 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3876 LHS.getConstantOperandAPInt(1) == NumElts))
3877 return Src;
3878
3879 return SDValue();
3880}
3881
3882static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3883 const SDLoc &dl, unsigned vectorWidth) {
3884 EVT VT = Vec.getValueType();
3885 EVT ElVT = VT.getVectorElementType();
3886 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3887 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3888 VT.getVectorNumElements() / Factor);
3889
3890 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3891 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3892 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3893
3894 // This is the index of the first element of the vectorWidth-bit chunk
3895 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3896 IdxVal &= ~(ElemsPerChunk - 1);
3897
3898 // If the input is a buildvector just emit a smaller one.
3899 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3900 return DAG.getBuildVector(ResultVT, dl,
3901 Vec->ops().slice(IdxVal, ElemsPerChunk));
3902
3903 // Check if we're extracting the upper undef of a widening pattern.
3904 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3905 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3906 isNullConstant(Vec.getOperand(2)))
3907 return DAG.getUNDEF(ResultVT);
3908
3909 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3910 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3911}
3912
3913/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3914/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3915/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3916/// instructions or a simple subregister reference. Idx is an index in the
3917/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3918/// lowering EXTRACT_VECTOR_ELT operations easier.
3919static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3920 SelectionDAG &DAG, const SDLoc &dl) {
3922 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3923 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3924}
3925
3926/// Generate a DAG to grab 256-bits from a 512-bit vector.
3927static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3928 SelectionDAG &DAG, const SDLoc &dl) {
3929 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3930 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3931}
3932
3933static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3934 SelectionDAG &DAG, const SDLoc &dl,
3935 unsigned vectorWidth) {
3936 assert((vectorWidth == 128 || vectorWidth == 256) &&
3937 "Unsupported vector width");
3938  // Inserting UNDEF is a no-op: just return Result.
3939 if (Vec.isUndef())
3940 return Result;
3941 EVT VT = Vec.getValueType();
3942 EVT ElVT = VT.getVectorElementType();
3943 EVT ResultVT = Result.getValueType();
3944
3945 // Insert the relevant vectorWidth bits.
3946 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3947 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3948
3949 // This is the index of the first element of the vectorWidth-bit chunk
3950 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3951 IdxVal &= ~(ElemsPerChunk - 1);
3952
3953 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3954 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3955}
3956
3957/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3958/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3959/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3960/// simple superregister reference. Idx is an index in the 128 bits
3961/// we want. It need not be aligned to a 128-bit boundary. That makes
3962/// lowering INSERT_VECTOR_ELT operations easier.
3963static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3964 SelectionDAG &DAG, const SDLoc &dl) {
3965 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3966 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3967}
3968
3969/// Widen a vector to a larger size with the same scalar type, with the new
3970/// elements either zero or undef.
3971static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3972 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3973 const SDLoc &dl) {
3975 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3976 "Unsupported vector widening type");
3977 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3978 : DAG.getUNDEF(VT);
3979 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3980 DAG.getIntPtrConstant(0, dl));
3981}
3982
3983/// Widen a vector to a larger size with the same scalar type, with the new
3984/// elements either zero or undef.
3985static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3986 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3987 const SDLoc &dl, unsigned WideSizeInBits) {
3988 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3989 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3990 "Unsupported vector widening type");
3991 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3992 MVT SVT = Vec.getSimpleValueType().getScalarType();
3993 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3994 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3995}
3996
3997/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3998/// and bitcast with integer types.
3999static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4000 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4001 unsigned NumElts = VT.getVectorNumElements();
4002 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4003 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4004 return VT;
4005}
4006
4007/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4008/// bitcast with integer types.
4009static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4010 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4011 const SDLoc &dl) {
4012 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4013 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4014}
4015
4016// Helper function to collect subvector ops that are concatenated together,
4017 // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4018// The subvectors in Ops are guaranteed to be the same type.
4020 SelectionDAG &DAG) {
4021 assert(Ops.empty() && "Expected an empty ops vector");
4022
4023 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4024 Ops.append(N->op_begin(), N->op_end());
4025 return true;
4026 }
4027
4028 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4029 SDValue Src = N->getOperand(0);
4030 SDValue Sub = N->getOperand(1);
4031 const APInt &Idx = N->getConstantOperandAPInt(2);
4032 EVT VT = Src.getValueType();
4033 EVT SubVT = Sub.getValueType();
4034
4035 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4036 // insert_subvector(undef, x, lo)
4037 if (Idx == 0 && Src.isUndef()) {
4038 Ops.push_back(Sub);
4039 Ops.push_back(DAG.getUNDEF(SubVT));
4040 return true;
4041 }
4042 if (Idx == (VT.getVectorNumElements() / 2)) {
4043 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4044 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4045 Src.getOperand(1).getValueType() == SubVT &&
4046 isNullConstant(Src.getOperand(2))) {
4047 // Attempt to recurse into inner (matching) concats.
4048 SDValue Lo = Src.getOperand(1);
4049 SDValue Hi = Sub;
4050 SmallVector<SDValue, 2> LoOps, HiOps;
4051 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4052 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4053 LoOps.size() == HiOps.size()) {
4054 Ops.append(LoOps);
4055 Ops.append(HiOps);
4056 return true;
4057 }
4058 Ops.push_back(Lo);
4059 Ops.push_back(Hi);
4060 return true;
4061 }
4062 // insert_subvector(x, extract_subvector(x, lo), hi)
4063 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4064 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4065 Ops.append(2, Sub);
4066 return true;
4067 }
4068 // insert_subvector(undef, x, hi)
4069 if (Src.isUndef()) {
4070 Ops.push_back(DAG.getUNDEF(SubVT));
4071 Ops.push_back(Sub);
4072 return true;
4073 }
4074 }
4075 }
4076 }
4077
4078 return false;
4079}
4080
4081// Helper to check if \p V can be split into subvectors and the upper subvectors
4082 // are all undef, in which case return the lower subvector.
4084 SelectionDAG &DAG) {
4085 SmallVector<SDValue> SubOps;
4086 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4087 return SDValue();
4088
4089 unsigned NumSubOps = SubOps.size();
4090 unsigned HalfNumSubOps = NumSubOps / 2;
4091 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4092
4093 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4094 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4095 return SDValue();
4096
4097 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4098 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4099 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4100}
4101
4102// Helper to check if we can access all the constituent subvectors without any
4103// extract ops.
4106 return collectConcatOps(N, Ops, DAG);
4107}
4108
4109static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4110 const SDLoc &dl) {
4111 EVT VT = Op.getValueType();
4112 unsigned NumElems = VT.getVectorNumElements();
4113 unsigned SizeInBits = VT.getSizeInBits();
4114 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4115 "Can't split odd sized vector");
4116
4117  // If this is a splat value (with no undefs) then use the lower subvector,
4118 // which should be a free extraction.
4119 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4120 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4121 return std::make_pair(Lo, Lo);
4122
4123 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4124 return std::make_pair(Lo, Hi);
4125}
4126
4127/// Break an operation into 2 half sized ops and then concatenate the results.
4129 unsigned NumOps = Op.getNumOperands();
4130 EVT VT = Op.getValueType();
4131
4132  // Split each vector operand into Lo/Hi halves.
4133 SmallVector<SDValue> LoOps(NumOps, SDValue());
4134 SmallVector<SDValue> HiOps(NumOps, SDValue());
4135 for (unsigned I = 0; I != NumOps; ++I) {
4136 SDValue SrcOp = Op.getOperand(I);
4137 if (!SrcOp.getValueType().isVector()) {
4138 LoOps[I] = HiOps[I] = SrcOp;
4139 continue;
4140 }
4141 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4142 }
4143
4144 EVT LoVT, HiVT;
4145 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4146 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4147 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4148 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4149}
4150
4151 /// Break a unary integer operation into 2 half-sized ops and then
4152/// concatenate the result back.
4154 const SDLoc &dl) {
4155 // Make sure we only try to split 256/512-bit types to avoid creating
4156 // narrow vectors.
4157 EVT VT = Op.getValueType();
4158 (void)VT;
4159 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4160 Op.getOperand(0).getValueType().is512BitVector()) &&
4161 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4162 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4163 VT.getVectorNumElements() &&
4164 "Unexpected VTs!");
4165 return splitVectorOp(Op, DAG, dl);
4166}
4167
4168/// Break a binary integer operation into 2 half sized ops and then
4169/// concatenate the result back.
4171 const SDLoc &dl) {
4172 // Assert that all the types match.
4173 EVT VT = Op.getValueType();
4174 (void)VT;
4175 assert(Op.getOperand(0).getValueType() == VT &&
4176 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4177 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4178 return splitVectorOp(Op, DAG, dl);
4179}
4180
4181 // Helper for splitting the operands of an operation to a legal target size and
4182 // applying a function to each part.
4183// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4184// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4185// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4186// The argument Builder is a function that will be applied on each split part:
4187// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
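// For example, a v32i16 operation on an AVX2 target without BWI is split into
// two v16i16 halves, Builder is applied to each half, and the results are
// concatenated back together.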
4188template <typename F>
4190 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4191 F Builder, bool CheckBWI = true) {
4192 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4193 unsigned NumSubs = 1;
4194 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4195 (!CheckBWI && Subtarget.useAVX512Regs())) {
4196 if (VT.getSizeInBits() > 512) {
4197 NumSubs = VT.getSizeInBits() / 512;
4198 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4199 }
4200 } else if (Subtarget.hasAVX2()) {
4201 if (VT.getSizeInBits() > 256) {
4202 NumSubs = VT.getSizeInBits() / 256;
4203 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4204 }
4205 } else {
4206 if (VT.getSizeInBits() > 128) {
4207 NumSubs = VT.getSizeInBits() / 128;
4208 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4209 }
4210 }
4211
4212 if (NumSubs == 1)
4213 return Builder(DAG, DL, Ops);
4214
4215 SmallVector<SDValue, 4> Subs;
4216 for (unsigned i = 0; i != NumSubs; ++i) {
4217 SmallVector<SDValue, 2> SubOps;
4218 for (SDValue Op : Ops) {
4219 EVT OpVT = Op.getValueType();
4220 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4221 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4222 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4223 }
4224 Subs.push_back(Builder(DAG, DL, SubOps));
4225 }
4226 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4227}
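
// Illustrative sketch (hypothetical Op0/Op1 and builder, modelled on how this
// helper is used elsewhere in this file): the lambda is invoked once per
// legal-width chunk, so a v8i64 request becomes four 128-bit nodes on plain
// SSE2, two 256-bit nodes on AVX2, or a single 512-bit node on AVX512F.
//
//   auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT VT = Ops[0].getSimpleValueType();
//     return DAG.getNode(X86ISD::PMULUDQ, DL, VT, Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, MVT::v8i64, {Op0, Op1},
//                                PMULUDQBuilder, /*CheckBWI=*/false);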
4228
4229// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4230// targets.
4231static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4232 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4233 const X86Subtarget &Subtarget) {
4234 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4235 MVT SVT = VT.getScalarType();
4236
4237 // If we have a 32/64 splatted constant, splat it to DstTy to
4238 // encourage a foldable broadcast'd operand.
4239 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4240 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4241 // AVX512 broadcasts 32/64-bit operands.
4242 // TODO: Support float once getAVX512Node is used by fp-ops.
4243 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4245 return SDValue();
4246 // If we're not widening, don't bother if we're not bitcasting.
4247 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4248 return SDValue();
4249 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4250 APInt SplatValue, SplatUndef;
4251 unsigned SplatBitSize;
4252 bool HasAnyUndefs;
4253 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4254 HasAnyUndefs, OpEltSizeInBits) &&
4255 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4256 return DAG.getConstant(SplatValue, DL, DstVT);
4257 }
4258 return SDValue();
4259 };
4260
4261 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4262
4263 MVT DstVT = VT;
4264 if (Widen)
4265 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4266
4267 // Canonicalize src operands.
4268 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4269 for (SDValue &Op : SrcOps) {
4270 MVT OpVT = Op.getSimpleValueType();
4271 // Just pass through scalar operands.
4272 if (!OpVT.isVector())
4273 continue;
4274 assert(OpVT == VT && "Vector type mismatch");
4275
4276 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4277 Op = BroadcastOp;
4278 continue;
4279 }
4280
4281 // Just widen the subvector by inserting into an undef wide vector.
4282 if (Widen)
4283 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4284 }
4285
4286 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4287
4288 // Perform the 512-bit op then extract the bottom subvector.
4289 if (Widen)
4290 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4291 return Res;
4292}
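
// Illustrative sketch (hypothetical Src operand): on an AVX512F target without
// VLX, a 256-bit node is widened to 512 bits, emitted there, and the low half
// extracted back out; scalar operands such as the immediate pass through.
//
//   SDValue R = getAVX512Node(X86ISD::VSHLI, DL, MVT::v8i32,
//                             {Src, DAG.getTargetConstant(4, DL, MVT::i8)},
//                             DAG, Subtarget);
//   // R = extract_subvector(vshli(widen(Src) : v16i32, 4), 0) : v8i32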
4293
4294/// Insert i1-subvector to i1-vector.
4295static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4296 const X86Subtarget &Subtarget) {
4297
4298 SDLoc dl(Op);
4299 SDValue Vec = Op.getOperand(0);
4300 SDValue SubVec = Op.getOperand(1);
4301 SDValue Idx = Op.getOperand(2);
4302 unsigned IdxVal = Op.getConstantOperandVal(2);
4303
4304 // Inserting undef is a nop. We can just return the original vector.
4305 if (SubVec.isUndef())
4306 return Vec;
4307
4308 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4309 return Op;
4310
4311 MVT OpVT = Op.getSimpleValueType();
4312 unsigned NumElems = OpVT.getVectorNumElements();
4313 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4314
4315 // Extend to natively supported kshift.
4316 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4317
4318 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4319 // if necessary.
4320 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4321 // May need to promote to a legal type.
4322 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4323 DAG.getConstant(0, dl, WideOpVT),
4324 SubVec, Idx);
4325 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4326 }
4327
4328 MVT SubVecVT = SubVec.getSimpleValueType();
4329 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4330 assert(IdxVal + SubVecNumElems <= NumElems &&
4331 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4332 "Unexpected index value in INSERT_SUBVECTOR");
4333
4334 SDValue Undef = DAG.getUNDEF(WideOpVT);
4335
4336 if (IdxVal == 0) {
4337 // Zero lower bits of the Vec
4338 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4339 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4340 ZeroIdx);
4341 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4342 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4343 // Merge them together, SubVec should be zero extended.
4344 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4345 DAG.getConstant(0, dl, WideOpVT),
4346 SubVec, ZeroIdx);
4347 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4348 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4349 }
4350
4351 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4352 Undef, SubVec, ZeroIdx);
4353
4354 if (Vec.isUndef()) {
4355 assert(IdxVal != 0 && "Unexpected index");
4356 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4357 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4358 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4359 }
4360
4361 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4362 assert(IdxVal != 0 && "Unexpected index");
4363 // If upper elements of Vec are known undef, then just shift into place.
4364 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4365 [](SDValue V) { return V.isUndef(); })) {
4366 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4367 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4368 } else {
4369 NumElems = WideOpVT.getVectorNumElements();
4370 unsigned ShiftLeft = NumElems - SubVecNumElems;
4371 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4372 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4373 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4374 if (ShiftRight != 0)
4375 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4376 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4377 }
4378 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4379 }
4380
4381 // Simple case when we put subvector in the upper part
4382 if (IdxVal + SubVecNumElems == NumElems) {
4383 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4384 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4385 if (SubVecNumElems * 2 == NumElems) {
4386 // Special case, use legal zero extending insert_subvector. This allows
4387 // isel to optimize when bits are known zero.
4388 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4389 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4390 DAG.getConstant(0, dl, WideOpVT),
4391 Vec, ZeroIdx);
4392 } else {
4393 // Otherwise use explicit shifts to zero the bits.
4394 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4395 Undef, Vec, ZeroIdx);
4396 NumElems = WideOpVT.getVectorNumElements();
4397 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4398 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4399 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4400 }
4401 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4402 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4403 }
4404
4405 // Inserting into the middle is more complicated.
4406
4407 NumElems = WideOpVT.getVectorNumElements();
4408
4409 // Widen the vector if needed.
4410 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4411
4412 unsigned ShiftLeft = NumElems - SubVecNumElems;
4413 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4414
4415 // Do an optimization for the most frequently used types.
4416 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4417 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4418 Mask0.flipAllBits();
4419 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4420 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4421 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4422 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4423 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4424 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4425 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4426 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4427
4428 // Reduce to original width if needed.
4429 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4430 }
4431
4432 // Clear the upper bits of the subvector and move it to its insert position.
4433 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4434 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4435 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4436 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4437
4438 // Isolate the bits below the insertion point.
4439 unsigned LowShift = NumElems - IdxVal;
4440 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4441 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4442 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4443 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4444
4445 // Isolate the bits after the last inserted bit.
4446 unsigned HighShift = IdxVal + SubVecNumElems;
4447 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4448 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4449 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4450 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4451
4452 // Now OR all 3 pieces together.
4453 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4454 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4455
4456 // Reduce to original width if needed.
4457 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4458}
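
// Illustrative sketch: inserting a v8i1 subvector into the top of a v16i1 mask
// takes the "upper part" path above, producing roughly
//
//   // insert_subvector(Vec : v16i1, Sub : v8i1, 8)
//   //   --> or(kshiftl(Sub', 8),
//   //          insert_subvector(zero, extract_subvector(Vec, 0), 0))
//
// where Sub' is Sub widened to v16i1, so isel can see the zero-extension of
// the low half explicitly.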
4459
4460static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4461 const SDLoc &dl) {
4462 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4463 EVT SubVT = V1.getValueType();
4464 EVT SubSVT = SubVT.getScalarType();
4465 unsigned SubNumElts = SubVT.getVectorNumElements();
4466 unsigned SubVectorWidth = SubVT.getSizeInBits();
4467 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4468 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4469 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4470}
4471
4472/// Returns a vector of specified type with all bits set.
4473/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4474/// Then bitcast to their original type, ensuring they get CSE'd.
4475static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4476 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4477 "Expected a 128/256/512-bit vector type");
4478 unsigned NumElts = VT.getSizeInBits() / 32;
4479 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4480 return DAG.getBitcast(VT, Vec);
4481}
4482
4483static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4484 SDValue In, SelectionDAG &DAG) {
4485 EVT InVT = In.getValueType();
4486 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4487 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4488 ISD::ZERO_EXTEND == Opcode) &&
4489 "Unknown extension opcode");
4490
4491 // For 256-bit vectors, we only need the lower (128-bit) input half.
4492 // For 512-bit vectors, we only need the lower input half or quarter.
4493 if (InVT.getSizeInBits() > 128) {
4494 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4495 "Expected VTs to be the same size!");
4496 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4497 In = extractSubVector(In, 0, DAG, DL,
4498 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4499 InVT = In.getValueType();
4500 }
4501
4502 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4503 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4504
4505 return DAG.getNode(Opcode, DL, VT, In);
4506}
4507
4508// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4509static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4510 SDValue Mask, SelectionDAG &DAG) {
4511 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4512 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4513 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4514}
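
// Illustrative note: the scalar equivalent is (LHS & Mask) | (RHS & ~Mask);
// X86ISD::ANDNP complements its *first* operand, hence ANDNP(Mask, RHS).
//
//   SDValue Blend = getBitSelect(DL, MVT::v4i32, A, B, M, DAG); // hypothetical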
4515
4516static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4517 bool Lo, bool Unary) {
4518 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4519 "Illegal vector type to unpack");
4520 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4521 int NumElts = VT.getVectorNumElements();
4522 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4523 for (int i = 0; i < NumElts; ++i) {
4524 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4525 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4526 Pos += (Unary ? 0 : NumElts * (i % 2));
4527 Pos += (Lo ? 0 : NumEltsInLane / 2);
4528 Mask.push_back(Pos);
4529 }
4530}
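
// Illustrative values: for v8i16 this produces
//   Lo, binary: <0, 8, 1, 9, 2, 10, 3, 11>   (punpcklwd pattern)
//   Hi, binary: <4, 12, 5, 13, 6, 14, 7, 15> (punpckhwd pattern)
//   Lo, unary : <0, 0, 1, 1, 2, 2, 3, 3>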
4531
4532/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4533/// imposed by AVX and specific to the unary pattern. Example:
4534/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4535/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4536static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4537 bool Lo) {
4538 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4539 int NumElts = VT.getVectorNumElements();
4540 for (int i = 0; i < NumElts; ++i) {
4541 int Pos = i / 2;
4542 Pos += (Lo ? 0 : NumElts / 2);
4543 Mask.push_back(Pos);
4544 }
4545}
4546
4547// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4548static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4549 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4550 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
4551 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4552 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4553 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4554 int M = Mask[I];
4555 if (M < 0)
4556 continue;
4557 SDValue V = (M < NumElts) ? V1 : V2;
4558 if (V.isUndef())
4559 continue;
4560 Ops[I] = V.getOperand(M % NumElts);
4561 }
4562 return DAG.getBuildVector(VT, dl, Ops);
4563 }
4564
4565 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4566}
4567
4568/// Returns a vector_shuffle node for an unpackl operation.
4569static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4570 SDValue V1, SDValue V2) {
4571 SmallVector<int, 8> Mask;
4572 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4573 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4574}
4575
4576/// Returns a vector_shuffle node for an unpackh operation.
4577static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4578 SDValue V1, SDValue V2) {
4579 SmallVector<int, 8> Mask;
4580 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4581 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4582}
4583
4584/// Returns a node that packs the LHS + RHS nodes together at half width.
4585/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4586/// TODO: Add subvector splitting if/when we have a need for it.
4587static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4588 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4589 bool PackHiHalf = false) {
4590 MVT OpVT = LHS.getSimpleValueType();
4591 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4592 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4593 assert(OpVT == RHS.getSimpleValueType() &&
4594 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4595 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4596 "Unexpected PACK operand types");
4597 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4598 "Unexpected PACK result type");
4599
4600 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4601 if (EltSizeInBits == 32) {
4602 SmallVector<int> PackMask;
4603 int Offset = PackHiHalf ? 1 : 0;
4604 int NumElts = VT.getVectorNumElements();
4605 for (int I = 0; I != NumElts; I += 4) {
4606 PackMask.push_back(I + Offset);
4607 PackMask.push_back(I + Offset + 2);
4608 PackMask.push_back(I + Offset + NumElts);
4609 PackMask.push_back(I + Offset + NumElts + 2);
4610 }
4611 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4612 DAG.getBitcast(VT, RHS), PackMask);
4613 }
4614
4615 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4616 if (!PackHiHalf) {
4617 if (UsePackUS &&
4618 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4619 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4620 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4621
4622 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4623 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4624 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4625 }
4626
4627 // Fallback to sign/zero extending the requested half and pack.
4628 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4629 if (UsePackUS) {
4630 if (PackHiHalf) {
4631 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4632 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4633 } else {
4634 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4635 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4636 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4637 };
4638 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4639 };
4640
4641 if (!PackHiHalf) {
4642 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4643 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4644 }
4645 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4646 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4647 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4648}
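
// Illustrative sketch (hypothetical v8i32 LHS/RHS): packing down to v16i16.
// If the inputs already have <= 16 significant bits this emits a single
// PACKSS/PACKUS; otherwise the requested halves are shifted or masked into
// place first.
//
//   SDValue Packed = getPack(DAG, Subtarget, dl, MVT::v16i16, LHS, RHS);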
4649
4650/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4651/// This produces a shuffle where the low element of V2 is swizzled into the
4652/// zero/undef vector, landing at element Idx.
4653/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4654static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4655 bool IsZero,
4656 const X86Subtarget &Subtarget,
4657 SelectionDAG &DAG) {
4658 MVT VT = V2.getSimpleValueType();
4659 SDValue V1 = IsZero
4660 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4661 int NumElems = VT.getVectorNumElements();
4662 SmallVector<int, 16> MaskVec(NumElems);
4663 for (int i = 0; i != NumElems; ++i)
4664 // If this is the insertion idx, put the low elt of V2 here.
4665 MaskVec[i] = (i == Idx) ? NumElems : i;
4666 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4667}
4668
4669static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4670 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4671 Ptr.getOpcode() == X86ISD::WrapperRIP)
4672 Ptr = Ptr.getOperand(0);
4673 return dyn_cast<ConstantPoolSDNode>(Ptr);
4674}
4675
4676// TODO: Add support for non-zero offsets.
4677static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4678 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4679 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4680 return nullptr;
4681 return CNode->getConstVal();
4682}
4683
4684static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4685 if (!Load || !ISD::isNormalLoad(Load))
4686 return nullptr;
4687 return getTargetConstantFromBasePtr(Load->getBasePtr());
4688}
4689
4690static const Constant *getTargetConstantFromNode(SDValue Op) {
4691 Op = peekThroughBitcasts(Op);
4692 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4693}
4694
4695const Constant *
4696X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4697 assert(LD && "Unexpected null LoadSDNode");
4698 return getTargetConstantFromNode(LD);
4699}
4700
4701// Extract raw constant bits from constant pools.
4702static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4703 APInt &UndefElts,
4704 SmallVectorImpl<APInt> &EltBits,
4705 bool AllowWholeUndefs = true,
4706 bool AllowPartialUndefs = false) {
4707 assert(EltBits.empty() && "Expected an empty EltBits vector");
4708
4710
4711 EVT VT = Op.getValueType();
4712 unsigned SizeInBits = VT.getSizeInBits();
4713 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4714 unsigned NumElts = SizeInBits / EltSizeInBits;
4715
4716 // Bitcast a source array of element bits to the target size.
4717 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4718 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4719 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4720 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4721 "Constant bit sizes don't match");
4722
4723 // Don't split if we don't allow undef bits.
4724 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4725 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4726 return false;
4727
4728 // If we're already the right size, don't bother bitcasting.
4729 if (NumSrcElts == NumElts) {
4730 UndefElts = UndefSrcElts;
4731 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4732 return true;
4733 }
4734
4735 // Extract all the undef/constant element data and pack into single bitsets.
4736 APInt UndefBits(SizeInBits, 0);
4737 APInt MaskBits(SizeInBits, 0);
4738
4739 for (unsigned i = 0; i != NumSrcElts; ++i) {
4740 unsigned BitOffset = i * SrcEltSizeInBits;
4741 if (UndefSrcElts[i])
4742 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4743 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4744 }
4745
4746 // Split the undef/constant single bitset data into the target elements.
4747 UndefElts = APInt(NumElts, 0);
4748 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4749
4750 for (unsigned i = 0; i != NumElts; ++i) {
4751 unsigned BitOffset = i * EltSizeInBits;
4752 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4753
4754 // Only treat an element as UNDEF if all bits are UNDEF.
4755 if (UndefEltBits.isAllOnes()) {
4756 if (!AllowWholeUndefs)
4757 return false;
4758 UndefElts.setBit(i);
4759 continue;
4760 }
4761
4762 // If only some bits are UNDEF then treat them as zero (or bail if not
4763 // supported).
4764 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4765 return false;
4766
4767 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4768 }
4769 return true;
4770 };
4771
4772 // Collect constant bits and insert into mask/undef bit masks.
4773 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4774 unsigned UndefBitIndex) {
4775 if (!Cst)
4776 return false;
4777 if (isa<UndefValue>(Cst)) {
4778 Undefs.setBit(UndefBitIndex);
4779 return true;
4780 }
4781 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4782 Mask = CInt->getValue();
4783 return true;
4784 }
4785 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4786 Mask = CFP->getValueAPF().bitcastToAPInt();
4787 return true;
4788 }
4789 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4790 Type *Ty = CDS->getType();
4791 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4792 Type *EltTy = CDS->getElementType();
4793 bool IsInteger = EltTy->isIntegerTy();
4794 bool IsFP =
4795 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4796 if (!IsInteger && !IsFP)
4797 return false;
4798 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4799 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4800 if (IsInteger)
4801 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4802 else
4803 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4804 I * EltBits);
4805 return true;
4806 }
4807 return false;
4808 };
4809
4810 // Handle UNDEFs.
4811 if (Op.isUndef()) {
4812 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4813 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4814 return CastBitData(UndefSrcElts, SrcEltBits);
4815 }
4816
4817 // Extract scalar constant bits.
4818 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4819 APInt UndefSrcElts = APInt::getZero(1);
4820 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4821 return CastBitData(UndefSrcElts, SrcEltBits);
4822 }
4823 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4824 APInt UndefSrcElts = APInt::getZero(1);
4825 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4826 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4827 return CastBitData(UndefSrcElts, SrcEltBits);
4828 }
4829
4830 // Extract constant bits from build vector.
4831 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4832 BitVector Undefs;
4833 SmallVector<APInt> SrcEltBits;
4834 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4835 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4836 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4837 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4838 if (Undefs[I])
4839 UndefSrcElts.setBit(I);
4840 return CastBitData(UndefSrcElts, SrcEltBits);
4841 }
4842 }
4843
4844 // Extract constant bits from constant pool vector.
4845 if (auto *Cst = getTargetConstantFromNode(Op)) {
4846 Type *CstTy = Cst->getType();
4847 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4848 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4849 return false;
4850
4851 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4852 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4853 if ((SizeInBits % SrcEltSizeInBits) != 0)
4854 return false;
4855
4856 APInt UndefSrcElts(NumSrcElts, 0);
4857 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4858 for (unsigned i = 0; i != NumSrcElts; ++i)
4859 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4860 UndefSrcElts, i))
4861 return false;
4862
4863 return CastBitData(UndefSrcElts, SrcEltBits);
4864 }
4865
4866 // Extract constant bits from a broadcasted constant pool scalar.
4867 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4868 EltSizeInBits <= VT.getScalarSizeInBits()) {
4869 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4870 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4871 return false;
4872
4873 SDValue Ptr = MemIntr->getBasePtr();
4874 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4875 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4876 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4877
4878 APInt UndefSrcElts(NumSrcElts, 0);
4879 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4880 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4881 if (UndefSrcElts[0])
4882 UndefSrcElts.setBits(0, NumSrcElts);
4883 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4884 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4885 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4886 return CastBitData(UndefSrcElts, SrcEltBits);
4887 }
4888 }
4889 }
4890
4891 // Extract constant bits from a subvector broadcast.
4892 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4893 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4894 SDValue Ptr = MemIntr->getBasePtr();
4895 // The source constant may be larger than the subvector broadcast,
4896 // ensure we extract the correct subvector constants.
4897 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4898 Type *CstTy = Cst->getType();
4899 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4900 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4901 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4902 (SizeInBits % SubVecSizeInBits) != 0)
4903 return false;
4904 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4905 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4906 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4907 APInt UndefSubElts(NumSubElts, 0);
4908 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4909 APInt(CstEltSizeInBits, 0));
4910 for (unsigned i = 0; i != NumSubElts; ++i) {
4911 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4912 UndefSubElts, i))
4913 return false;
4914 for (unsigned j = 1; j != NumSubVecs; ++j)
4915 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4916 }
4917 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4918 UndefSubElts);
4919 return CastBitData(UndefSubElts, SubEltBits);
4920 }
4921 }
4922
4923 // Extract a rematerialized scalar constant insertion.
4924 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4925 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4926 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4927 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4928 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4929
4930 APInt UndefSrcElts(NumSrcElts, 0);
4931 SmallVector<APInt, 64> SrcEltBits;
4932 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4933 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4934 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4935 return CastBitData(UndefSrcElts, SrcEltBits);
4936 }
4937
4938 // Insert constant bits from a base and sub vector sources.
4939 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4940 // If we bitcast to larger elements we might lose track of undefs, so to
4941 // be safe don't allow any.
4942 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4943 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4944
4945 APInt UndefSrcElts, UndefSubElts;
4946 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4947 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4948 UndefSubElts, EltSubBits,
4949 AllowWholeUndefs && AllowUndefs,
4950 AllowPartialUndefs && AllowUndefs) &&
4951 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4952 UndefSrcElts, EltSrcBits,
4953 AllowWholeUndefs && AllowUndefs,
4954 AllowPartialUndefs && AllowUndefs)) {
4955 unsigned BaseIdx = Op.getConstantOperandVal(2);
4956 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4957 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4958 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4959 return CastBitData(UndefSrcElts, EltSrcBits);
4960 }
4961 }
4962
4963 // Extract constant bits from a subvector's source.
4964 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4965 // TODO - support extract_subvector through bitcasts.
4966 if (EltSizeInBits != VT.getScalarSizeInBits())
4967 return false;
4968
4969 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4970 UndefElts, EltBits, AllowWholeUndefs,
4971 AllowPartialUndefs)) {
4972 EVT SrcVT = Op.getOperand(0).getValueType();
4973 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4974 unsigned NumSubElts = VT.getVectorNumElements();
4975 unsigned BaseIdx = Op.getConstantOperandVal(1);
4976 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4977 if ((BaseIdx + NumSubElts) != NumSrcElts)
4978 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4979 if (BaseIdx != 0)
4980 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4981 return true;
4982 }
4983 }
4984
4985 // Extract constant bits from shuffle node sources.
4986 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4987 // TODO - support shuffle through bitcasts.
4988 if (EltSizeInBits != VT.getScalarSizeInBits())
4989 return false;
4990
4991 ArrayRef<int> Mask = SVN->getMask();
4992 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4993 llvm::any_of(Mask, [](int M) { return M < 0; }))
4994 return false;
4995
4996 APInt UndefElts0, UndefElts1;
4997 SmallVector<APInt, 32> EltBits0, EltBits1;
4998 if (isAnyInRange(Mask, 0, NumElts) &&
4999 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5000 UndefElts0, EltBits0, AllowWholeUndefs,
5001 AllowPartialUndefs))
5002 return false;
5003 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5004 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5005 UndefElts1, EltBits1, AllowWholeUndefs,
5006 AllowPartialUndefs))
5007 return false;
5008
5009 UndefElts = APInt::getZero(NumElts);
5010 for (int i = 0; i != (int)NumElts; ++i) {
5011 int M = Mask[i];
5012 if (M < 0) {
5013 UndefElts.setBit(i);
5014 EltBits.push_back(APInt::getZero(EltSizeInBits));
5015 } else if (M < (int)NumElts) {
5016 if (UndefElts0[M])
5017 UndefElts.setBit(i);
5018 EltBits.push_back(EltBits0[M]);
5019 } else {
5020 if (UndefElts1[M - NumElts])
5021 UndefElts.setBit(i);
5022 EltBits.push_back(EltBits1[M - NumElts]);
5023 }
5024 }
5025 return true;
5026 }
5027
5028 return false;
5029}
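
// Illustrative sketch (hypothetical Src node): the same constant data can be
// re-sliced into a different element width, e.g. a v4i32 splat of 0x0000FFFF
// read back as sixteen 8-bit lanes.
//
//   APInt UndefElts;
//   SmallVector<APInt> EltBits;
//   if (getTargetConstantBitsFromNode(Src, 8, UndefElts, EltBits))
//     ; // EltBits = {0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, ...}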
5030
5031namespace llvm {
5032namespace X86 {
5033bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5034 APInt UndefElts;
5035 SmallVector<APInt, 16> EltBits;
5036 if (getTargetConstantBitsFromNode(
5037 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5038 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5039 int SplatIndex = -1;
5040 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5041 if (UndefElts[i])
5042 continue;
5043 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5044 SplatIndex = -1;
5045 break;
5046 }
5047 SplatIndex = i;
5048 }
5049 if (0 <= SplatIndex) {
5050 SplatVal = EltBits[SplatIndex];
5051 return true;
5052 }
5053 }
5054
5055 return false;
5056}
5057} // namespace X86
5058} // namespace llvm
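
// Illustrative usage (hypothetical caller and operand): X86::isConstantSplat
// is how DAG combines test for a uniform immediate regardless of how the
// constant is represented.
//
//   APInt SplatVal;
//   if (X86::isConstantSplat(Op1, SplatVal, /*AllowPartialUndefs=*/false) &&
//       SplatVal.isSignedIntN(8)) {
//     // ... fold to an 8-bit immediate form ...
//   }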
5059
5060static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5061 unsigned MaskEltSizeInBits,
5062 SmallVectorImpl<uint64_t> &RawMask,
5063 APInt &UndefElts) {
5064 // Extract the raw target constant bits.
5065 SmallVector<APInt, 64> EltBits;
5066 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5067 EltBits, /* AllowWholeUndefs */ true,
5068 /* AllowPartialUndefs */ false))
5069 return false;
5070
5071 // Insert the extracted elements into the mask.
5072 for (const APInt &Elt : EltBits)
5073 RawMask.push_back(Elt.getZExtValue());
5074
5075 return true;
5076}
5077
5078// Match not(xor X, -1) -> X.
5079// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5080// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5081// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5082static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5083 V = peekThroughBitcasts(V);
5084 if (V.getOpcode() == ISD::XOR &&
5085 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5086 isAllOnesConstant(V.getOperand(1))))
5087 return V.getOperand(0);
5088 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5089 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5090 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5091 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5092 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5093 Not, V.getOperand(1));
5094 }
5095 }
5096 if (V.getOpcode() == X86ISD::PCMPGT &&
5097 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5098 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5099 V.getOperand(0).hasOneUse()) {
5100 APInt UndefElts;
5101 SmallVector<APInt> EltBits;
5102 if (getTargetConstantBitsFromNode(V.getOperand(0),
5103 V.getScalarValueSizeInBits(), UndefElts,
5104 EltBits)) {
5105 // Don't fold min_signed_value -> (min_signed_value - 1)
5106 bool MinSigned = false;
5107 for (APInt &Elt : EltBits) {
5108 MinSigned |= Elt.isMinSignedValue();
5109 Elt -= 1;
5110 }
5111 if (!MinSigned) {
5112 SDLoc DL(V);
5113 MVT VT = V.getSimpleValueType();
5114 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5115 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5116 }
5117 }
5118 }
5119 SmallVector<SDValue, 2> CatOps;
5120 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5121 for (SDValue &CatOp : CatOps) {
5122 SDValue NotCat = IsNOT(CatOp, DAG);
5123 if (!NotCat) return SDValue();
5124 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5125 }
5126 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5127 }
5128 return SDValue();
5129}
5130
5131/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5132/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5133/// Note: This ignores saturation, so inputs must be checked first.
5134static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5135 bool Unary, unsigned NumStages = 1) {
5136 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5137 unsigned NumElts = VT.getVectorNumElements();
5138 unsigned NumLanes = VT.getSizeInBits() / 128;
5139 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5140 unsigned Offset = Unary ? 0 : NumElts;
5141 unsigned Repetitions = 1u << (NumStages - 1);
5142 unsigned Increment = 1u << NumStages;
5143 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5144
5145 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5146 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5147 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5148 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5149 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5150 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5151 }
5152 }
5153}
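
// Illustrative values for v16i8 (one 128-bit lane):
//   single-stage, binary: <0, 2, 4, ..., 14, 16, 18, ..., 30>
//     i.e. the PACKUSWB/PACKSSWB pattern of two v8i16 inputs viewed as bytes.
//   two-stage, unary    : <0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12>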
5154
5155// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5156static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5157 APInt &DemandedLHS, APInt &DemandedRHS) {
5158 int NumLanes = VT.getSizeInBits() / 128;
5159 int NumElts = DemandedElts.getBitWidth();
5160 int NumInnerElts = NumElts / 2;
5161 int NumEltsPerLane = NumElts / NumLanes;
5162 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5163
5164 DemandedLHS = APInt::getZero(NumInnerElts);
5165 DemandedRHS = APInt::getZero(NumInnerElts);
5166
5167 // Map DemandedElts to the packed operands.
5168 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5169 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5170 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5171 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5172 if (DemandedElts[OuterIdx])
5173 DemandedLHS.setBit(InnerIdx);
5174 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5175 DemandedRHS.setBit(InnerIdx);
5176 }
5177 }
5178}
5179
5180// Split the demanded elts of a HADD/HSUB node between its operands.
5181static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5182 APInt &DemandedLHS, APInt &DemandedRHS) {
5183 int NumLanes = VT.getSizeInBits() / 128;
5184 int NumElts = DemandedElts.getBitWidth();
5185 int NumEltsPerLane = NumElts / NumLanes;
5186 int HalfEltsPerLane = NumEltsPerLane / 2;
5187
5188 DemandedLHS = APInt::getZero(NumElts);
5189 DemandedRHS = APInt::getZero(NumElts);
5190
5191 // Map DemandedElts to the horizontal operands.
5192 for (int Idx = 0; Idx != NumElts; ++Idx) {
5193 if (!DemandedElts[Idx])
5194 continue;
5195 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5196 int LocalIdx = Idx % NumEltsPerLane;
5197 if (LocalIdx < HalfEltsPerLane) {
5198 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5199 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5200 } else {
5201 LocalIdx -= HalfEltsPerLane;
5202 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5203 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5204 }
5205 }
5206}
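
// Illustrative mapping: for a v8i32 HADD, demanding only result element 5
// (lane 1, low half, local index 1) marks source elements 6 and 7 of the LHS
// as demanded and nothing from the RHS.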
5207
5208/// Calculates the shuffle mask corresponding to the target-specific opcode.
5209/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5210/// operands in \p Ops, and returns true.
5211/// Sets \p IsUnary to true if only one source is used. Note that this will set
5212/// IsUnary for shuffles which use a single input multiple times, and in those
5213/// cases it will adjust the mask to only have indices within that single input.
5214/// It is an error to call this with non-empty Mask/Ops vectors.
5215static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5216 SmallVectorImpl<SDValue> &Ops,
5217 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5218 if (!isTargetShuffle(N.getOpcode()))
5219 return false;
5220
5221 MVT VT = N.getSimpleValueType();
5222 unsigned NumElems = VT.getVectorNumElements();
5223 unsigned MaskEltSize = VT.getScalarSizeInBits();
5224 SmallVector<uint64_t, 32> RawMask;
5225 APInt RawUndefs;
5226 uint64_t ImmN;
5227
5228 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5229 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5230
5231 IsUnary = false;
5232 bool IsFakeUnary = false;
5233 switch (N.getOpcode()) {
5234 case X86ISD::BLENDI:
5235 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5236 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5237 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5238 DecodeBLENDMask(NumElems, ImmN, Mask);
5239 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5240 break;
5241 case X86ISD::SHUFP:
5242 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5243 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5244 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5245 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5246 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5247 break;
5248 case X86ISD::INSERTPS:
5249 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5250 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5251 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5252 DecodeINSERTPSMask(ImmN, Mask);
5253 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5254 break;
5255 case X86ISD::EXTRQI:
5256 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5257 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5258 isa<ConstantSDNode>(N.getOperand(2))) {
5259 int BitLen = N.getConstantOperandVal(1);
5260 int BitIdx = N.getConstantOperandVal(2);
5261 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5262 IsUnary = true;
5263 }
5264 break;
5265 case X86ISD::INSERTQI:
5266 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5267 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5268 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5269 isa<ConstantSDNode>(N.getOperand(3))) {
5270 int BitLen = N.getConstantOperandVal(2);
5271 int BitIdx = N.getConstantOperandVal(3);
5272 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5273 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5274 }
5275 break;
5276 case X86ISD::UNPCKH:
5277 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5278 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5279 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5280 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5281 break;
5282 case X86ISD::UNPCKL:
5283 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5284 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5285 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5286 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5287 break;
5288 case X86ISD::MOVHLPS:
5289 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5290 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5291 DecodeMOVHLPSMask(NumElems, Mask);
5292 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5293 break;
5294 case X86ISD::MOVLHPS:
5295 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5296 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5297 DecodeMOVLHPSMask(NumElems, Mask);
5298 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5299 break;
5300 case X86ISD::VALIGN:
5301 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5302 "Only 32-bit and 64-bit elements are supported!");
5303 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5304 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5305 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5306 DecodeVALIGNMask(NumElems, ImmN, Mask);
5307 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5308 Ops.push_back(N.getOperand(1));
5309 Ops.push_back(N.getOperand(0));
5310 break;
5311 case X86ISD::PALIGNR:
5312 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5313 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5314 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5315 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5316 DecodePALIGNRMask(NumElems, ImmN, Mask);
5317 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5318 Ops.push_back(N.getOperand(1));
5319 Ops.push_back(N.getOperand(0));
5320 break;
5321 case X86ISD::VSHLDQ:
5322 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5323 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5324 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5325 DecodePSLLDQMask(NumElems, ImmN, Mask);
5326 IsUnary = true;
5327 break;
5328 case X86ISD::VSRLDQ:
5329 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5330 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5331 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5332 DecodePSRLDQMask(NumElems, ImmN, Mask);
5333 IsUnary = true;
5334 break;
5335 case X86ISD::PSHUFD:
5336 case X86ISD::VPERMILPI:
5337 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5338 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5339 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5340 IsUnary = true;
5341 break;
5342 case X86ISD::PSHUFHW:
5343 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5344 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5345 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5346 IsUnary = true;
5347 break;
5348 case X86ISD::PSHUFLW:
5349 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5350 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5351 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5352 IsUnary = true;
5353 break;
5354 case X86ISD::VZEXT_MOVL:
5355 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5356 DecodeZeroMoveLowMask(NumElems, Mask);
5357 IsUnary = true;
5358 break;
5359 case X86ISD::VBROADCAST:
5360 // We only decode broadcasts of same-sized vectors, peeking through to
5361 // extracted subvectors is likely to cause hasOneUse issues with
5362 // SimplifyDemandedBits etc.
5363 if (N.getOperand(0).getValueType() == VT) {
5364 DecodeVectorBroadcast(NumElems, Mask);
5365 IsUnary = true;
5366 break;
5367 }
5368 return false;
5369 case X86ISD::VPERMILPV: {
5370 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5371 IsUnary = true;
5372 SDValue MaskNode = N.getOperand(1);
5373 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5374 RawUndefs)) {
5375 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5376 break;
5377 }
5378 return false;
5379 }
5380 case X86ISD::PSHUFB: {
5381 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5382 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5383 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5384 IsUnary = true;
5385 SDValue MaskNode = N.getOperand(1);
5386 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5387 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5388 break;
5389 }
5390 return false;
5391 }
5392 case X86ISD::VPERMI:
5393 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5394 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5395 DecodeVPERMMask(NumElems, ImmN, Mask);
5396 IsUnary = true;
5397 break;
5398 case X86ISD::MOVSS:
5399 case X86ISD::MOVSD:
5400 case X86ISD::MOVSH:
5401 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5402 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5403 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5404 break;
5405 case X86ISD::VPERM2X128:
5406 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5407 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5408 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5409 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5410 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5411 break;
5412 case X86ISD::SHUF128:
5413 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5414 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5415 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5416 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5417 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5418 break;
5419 case X86ISD::MOVSLDUP:
5420 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5421 DecodeMOVSLDUPMask(NumElems, Mask);
5422 IsUnary = true;
5423 break;
5424 case X86ISD::MOVSHDUP:
5425 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5426 DecodeMOVSHDUPMask(NumElems, Mask);
5427 IsUnary = true;
5428 break;
5429 case X86ISD::MOVDDUP:
5430 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5431 DecodeMOVDDUPMask(NumElems, Mask);
5432 IsUnary = true;
5433 break;
5434 case X86ISD::VPERMIL2: {
5435 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5436 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5437 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5438 SDValue MaskNode = N.getOperand(2);
5439 SDValue CtrlNode = N.getOperand(3);
5440 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5441 unsigned CtrlImm = CtrlOp->getZExtValue();
5442 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5443 RawUndefs)) {
5444 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5445 Mask);
5446 break;
5447 }
5448 }
5449 return false;
5450 }
5451 case X86ISD::VPPERM: {
5452 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5453 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5454 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5455 SDValue MaskNode = N.getOperand(2);
5456 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5457 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5458 break;
5459 }
5460 return false;
5461 }
5462 case X86ISD::VPERMV: {
5463 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5464 IsUnary = true;
5465 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5466 Ops.push_back(N.getOperand(1));
5467 SDValue MaskNode = N.getOperand(0);
5468 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5469 RawUndefs)) {
5470 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5471 break;
5472 }
5473 return false;
5474 }
5475 case X86ISD::VPERMV3: {
5476 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5477 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5478 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5479 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5480 Ops.push_back(N.getOperand(0));
5481 Ops.push_back(N.getOperand(2));
5482 SDValue MaskNode = N.getOperand(1);
5483 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5484 RawUndefs)) {
5485 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5486 break;
5487 }
5488 return false;
5489 }
5490 default:
5491 llvm_unreachable("unknown target shuffle node");
5492 }
5493
5494 // Empty mask indicates the decode failed.
5495 if (Mask.empty())
5496 return false;
5497
5498 // Check if we're getting a shuffle mask with zero'd elements.
5499 if (!AllowSentinelZero && isAnyZero(Mask))
5500 return false;
5501
5502 // If we have a fake unary shuffle, the shuffle mask is spread across two
5503 // inputs that are actually the same node. Re-map the mask to always point
5504 // into the first input.
5505 if (IsFakeUnary)
5506 for (int &M : Mask)
5507 if (M >= (int)Mask.size())
5508 M -= Mask.size();
5509
5510 // If we didn't already add operands in the opcode-specific code, default to
5511 // adding 1 or 2 operands starting at 0.
5512 if (Ops.empty()) {
5513 Ops.push_back(N.getOperand(0));
5514 if (!IsUnary || IsFakeUnary)
5515 Ops.push_back(N.getOperand(1));
5516 }
5517
5518 return true;
5519}
5520
5521// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5522static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5523 SmallVectorImpl<SDValue> &Ops,
5524 SmallVectorImpl<int> &Mask) {
5525 bool IsUnary;
5526 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5527}
5528
5529/// Compute whether each element of a shuffle is zeroable.
5530///
5531/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5532/// Either it is an undef element in the shuffle mask, the element of the input
5533/// referenced is undef, or the element of the input referenced is known to be
5534/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5535/// as many lanes with this technique as possible to simplify the remaining
5536/// shuffle.
5537static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5538 SDValue V1, SDValue V2,
5539 APInt &KnownUndef, APInt &KnownZero) {
5540 int Size = Mask.size();
5541 KnownUndef = KnownZero = APInt::getZero(Size);
5542
5543 V1 = peekThroughBitcasts(V1);
5544 V2 = peekThroughBitcasts(V2);
5545
5546 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5547 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5548
5549 int VectorSizeInBits = V1.getValueSizeInBits();
5550 int ScalarSizeInBits = VectorSizeInBits / Size;
5551 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5552
5553 for (int i = 0; i < Size; ++i) {
5554 int M = Mask[i];
5555 // Handle the easy cases.
5556 if (M < 0) {
5557 KnownUndef.setBit(i);
5558 continue;
5559 }
5560 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5561 KnownZero.setBit(i);
5562 continue;
5563 }
5564
5565 // Determine shuffle input and normalize the mask.
5566 SDValue V = M < Size ? V1 : V2;
5567 M %= Size;
5568
5569 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5570 if (V.getOpcode() != ISD::BUILD_VECTOR)
5571 continue;
5572
5573 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5574 // the (larger) source element must be UNDEF/ZERO.
5575 if ((Size % V.getNumOperands()) == 0) {
5576 int Scale = Size / V->getNumOperands();
5577 SDValue Op = V.getOperand(M / Scale);
5578 if (Op.isUndef())
5579 KnownUndef.setBit(i);
5580 if (X86::isZeroNode(Op))
5581 KnownZero.setBit(i);
5582 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5583 APInt Val = Cst->getAPIntValue();
5584 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5585 if (Val == 0)
5586 KnownZero.setBit(i);
5587 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5588 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5589 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5590 if (Val == 0)
5591 KnownZero.setBit(i);
5592 }
5593 continue;
5594 }
5595
5596 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5597 // elements must be UNDEF or ZERO.
5598 if ((V.getNumOperands() % Size) == 0) {
5599 int Scale = V->getNumOperands() / Size;
5600 bool AllUndef = true;
5601 bool AllZero = true;
5602 for (int j = 0; j < Scale; ++j) {
5603 SDValue Op = V.getOperand((M * Scale) + j);
5604 AllUndef &= Op.isUndef();
5605 AllZero &= X86::isZeroNode(Op);
5606 }
5607 if (AllUndef)
5608 KnownUndef.setBit(i);
5609 if (AllZero)
5610 KnownZero.setBit(i);
5611 continue;
5612 }
5613 }
5614}
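
// Illustrative example: with V1 = build_vector <0, undef, x, y> (v4i32),
// V2 = all-zeros and Mask = <0, 1, 4, 7>, this computes
//   KnownZero  = 0b1101  (elements 0, 2 and 3)
//   KnownUndef = 0b0010  (element 1)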
5615
5616/// Decode a target shuffle mask and inputs and see if any values are
5617/// known to be undef or zero from their inputs.
5618/// Returns true if the target shuffle mask was decoded.
5619/// FIXME: Merge this with computeZeroableShuffleElements?
5620static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5621 SmallVectorImpl<SDValue> &Ops,
5622 APInt &KnownUndef, APInt &KnownZero) {
5623 bool IsUnary;
5624 if (!isTargetShuffle(N.getOpcode()))
5625 return false;
5626
5627 MVT VT = N.getSimpleValueType();
5628 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5629 return false;
5630
5631 int Size = Mask.size();
5632 SDValue V1 = Ops[0];
5633 SDValue V2 = IsUnary ? V1 : Ops[1];
5634 KnownUndef = KnownZero = APInt::getZero(Size);
5635
5636 V1 = peekThroughBitcasts(V1);
5637 V2 = peekThroughBitcasts(V2);
5638
5639 assert((VT.getSizeInBits() % Size) == 0 &&
5640 "Illegal split of shuffle value type");
5641 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5642
5643 // Extract known constant input data.
5644 APInt UndefSrcElts[2];
5645 SmallVector<APInt, 32> SrcEltBits[2];
5646 bool IsSrcConstant[2] = {
5647 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5648 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5649 /*AllowPartialUndefs*/ false),
5650 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5651 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5652 /*AllowPartialUndefs*/ false)};
5653
5654 for (int i = 0; i < Size; ++i) {
5655 int M = Mask[i];
5656
5657 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5658 if (M < 0) {
5659 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5660 if (SM_SentinelUndef == M)
5661 KnownUndef.setBit(i);
5662 if (SM_SentinelZero == M)
5663 KnownZero.setBit(i);
5664 continue;
5665 }
5666
5667 // Determine shuffle input and normalize the mask.
5668 unsigned SrcIdx = M / Size;
5669 SDValue V = M < Size ? V1 : V2;
5670 M %= Size;
5671
5672 // We are referencing an UNDEF input.
5673 if (V.isUndef()) {
5674 KnownUndef.setBit(i);
5675 continue;
5676 }
5677
5678 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5679 // TODO: We currently only set UNDEF for integer types - floats use the same
5680 // registers as vectors and many of the scalar folded loads rely on the
5681 // SCALAR_TO_VECTOR pattern.
5682 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5683 (Size % V.getValueType().getVectorNumElements()) == 0) {
5684 int Scale = Size / V.getValueType().getVectorNumElements();
5685 int Idx = M / Scale;
5686 if (Idx != 0 && !VT.isFloatingPoint())
5687 KnownUndef.setBit(i);
5688 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5689 KnownZero.setBit(i);
5690 continue;
5691 }
5692
5693 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5694 // base vectors.
5695 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5696 SDValue Vec = V.getOperand(0);
5697 int NumVecElts = Vec.getValueType().getVectorNumElements();
5698 if (Vec.isUndef() && Size == NumVecElts) {
5699 int Idx = V.getConstantOperandVal(2);
5700 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5701 if (M < Idx || (Idx + NumSubElts) <= M)
5702 KnownUndef.setBit(i);
5703 }
5704 continue;
5705 }
5706
5707 // Attempt to extract from the source's constant bits.
5708 if (IsSrcConstant[SrcIdx]) {
5709 if (UndefSrcElts[SrcIdx][M])
5710 KnownUndef.setBit(i);
5711 else if (SrcEltBits[SrcIdx][M] == 0)
5712 KnownZero.setBit(i);
5713 }
5714 }
5715
5716 assert(VT.getVectorNumElements() == (unsigned)Size &&
5717 "Different mask size from vector size!");
5718 return true;
5719}
5720
5721// Replace target shuffle mask elements with known undef/zero sentinels.
5722 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5723 const APInt &KnownUndef,
5724 const APInt &KnownZero,
5725 bool ResolveKnownZeros = true) {
5726 unsigned NumElts = Mask.size();
5727 assert(KnownUndef.getBitWidth() == NumElts &&
5728 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5729
5730 for (unsigned i = 0; i != NumElts; ++i) {
5731 if (KnownUndef[i])
5732 Mask[i] = SM_SentinelUndef;
5733 else if (ResolveKnownZeros && KnownZero[i])
5734 Mask[i] = SM_SentinelZero;
5735 }
5736}
5737
5738// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5739 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5740 APInt &KnownUndef,
5741 APInt &KnownZero) {
5742 unsigned NumElts = Mask.size();
5743 KnownUndef = KnownZero = APInt::getZero(NumElts);
5744
5745 for (unsigned i = 0; i != NumElts; ++i) {
5746 int M = Mask[i];
5747 if (SM_SentinelUndef == M)
5748 KnownUndef.setBit(i);
5749 if (SM_SentinelZero == M)
5750 KnownZero.setBit(i);
5751 }
5752}
5753
5754// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5755 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5756 SDValue Cond, bool IsBLENDV = false) {
5757 EVT CondVT = Cond.getValueType();
5758 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5759 unsigned NumElts = CondVT.getVectorNumElements();
5760
5761 APInt UndefElts;
5762 SmallVector<APInt, 32> EltBits;
5763 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5764 /*AllowWholeUndefs*/ true,
5765 /*AllowPartialUndefs*/ false))
5766 return false;
5767
5768 Mask.resize(NumElts, SM_SentinelUndef);
5769
5770 for (int i = 0; i != (int)NumElts; ++i) {
5771 Mask[i] = i;
5772 // Arbitrarily choose from the 2nd operand if the select condition element
5773 // is undef.
5774 // TODO: Can we do better by matching patterns such as even/odd?
5775 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5776 (IsBLENDV && EltBits[i].isNonNegative()))
5777 Mask[i] += NumElts;
5778 }
5779
5780 return true;
5781}
5782
5783// Forward declaration (for getFauxShuffleMask recursive check).
5784static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5785 SmallVectorImpl<SDValue> &Inputs,
5786 SmallVectorImpl<int> &Mask,
5787 const SelectionDAG &DAG, unsigned Depth,
5788 bool ResolveKnownElts);
5789
5790// Attempt to decode ops that could be represented as a shuffle mask.
5791 // The decoded shuffle mask may contain a different number of elements than
5792 // the destination value type.
5793// TODO: Merge into getTargetShuffleInputs()
5794static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5795 SmallVectorImpl<int> &Mask,
5796 SmallVectorImpl<SDValue> &Ops,
5797 const SelectionDAG &DAG, unsigned Depth,
5798 bool ResolveKnownElts) {
5799 Mask.clear();
5800 Ops.clear();
5801
5802 MVT VT = N.getSimpleValueType();
5803 unsigned NumElts = VT.getVectorNumElements();
5804 unsigned NumSizeInBits = VT.getSizeInBits();
5805 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5806 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5807 return false;
5808 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5809 unsigned NumSizeInBytes = NumSizeInBits / 8;
5810 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5811
5812 unsigned Opcode = N.getOpcode();
5813 switch (Opcode) {
5814 case ISD::VECTOR_SHUFFLE: {
5815 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5816 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5817 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5818 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5819 Ops.push_back(N.getOperand(0));
5820 Ops.push_back(N.getOperand(1));
5821 return true;
5822 }
5823 return false;
5824 }
5825 case ISD::AND:
5826 case X86ISD::ANDNP: {
5827 // Attempt to decode as a per-byte mask.
5828 APInt UndefElts;
5829 SmallVector<APInt, 32> EltBits;
5830 SDValue N0 = N.getOperand(0);
5831 SDValue N1 = N.getOperand(1);
5832 bool IsAndN = (X86ISD::ANDNP == Opcode);
5833 uint64_t ZeroMask = IsAndN ? 255 : 0;
5834 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5835 /*AllowWholeUndefs*/ false,
5836 /*AllowPartialUndefs*/ false))
5837 return false;
5838 // We can't assume an undef src element gives an undef dst - the other src
5839 // might be zero.
5840 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5841 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5842 const APInt &ByteBits = EltBits[i];
5843 if (ByteBits != 0 && ByteBits != 255)
5844 return false;
5845 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5846 }
5847 Ops.push_back(IsAndN ? N1 : N0);
5848 return true;
5849 }
5850 case ISD::OR: {
5851 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5852 // is a valid shuffle index.
5853 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5854 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5855 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5856 return false;
5857
5858 SmallVector<int, 64> SrcMask0, SrcMask1;
5859 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5860 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5861 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5862 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5863 Depth + 1, true) ||
5864 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5865 Depth + 1, true))
5866 return false;
5867
5868 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5869 SmallVector<int, 64> Mask0, Mask1;
5870 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5871 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5872 for (int i = 0; i != (int)MaskSize; ++i) {
5873 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5874 // loops converting between OR and BLEND shuffles due to
5875 // canWidenShuffleElements merging away undef elements, meaning we
5876 // fail to recognise the OR as the undef element isn't known zero.
5877 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5878 Mask.push_back(SM_SentinelZero);
5879 else if (Mask1[i] == SM_SentinelZero)
5880 Mask.push_back(i);
5881 else if (Mask0[i] == SM_SentinelZero)
5882 Mask.push_back(i + MaskSize);
5883 else
5884 return false;
5885 }
5886 Ops.push_back(N0);
5887 Ops.push_back(N1);
5888 return true;
5889 }
5890 case ISD::INSERT_SUBVECTOR: {
5891 SDValue Src = N.getOperand(0);
5892 SDValue Sub = N.getOperand(1);
5893 EVT SubVT = Sub.getValueType();
5894 unsigned NumSubElts = SubVT.getVectorNumElements();
5895 if (!N->isOnlyUserOf(Sub.getNode()))
5896 return false;
5897 SDValue SubBC = peekThroughBitcasts(Sub);
5898 uint64_t InsertIdx = N.getConstantOperandVal(2);
5899 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5900 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5901 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5902 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5903 SDValue SubBCSrc = SubBC.getOperand(0);
5904 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5905 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5906 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5907 "Subvector valuetype mismatch");
5908 InsertIdx *= (MaxElts / NumElts);
5909 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5910 NumSubElts *= (MaxElts / NumElts);
5911 bool SrcIsUndef = Src.isUndef();
5912 for (int i = 0; i != (int)MaxElts; ++i)
5913 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5914 for (int i = 0; i != (int)NumSubElts; ++i)
5915 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5916 if (!SrcIsUndef)
5917 Ops.push_back(Src);
5918 Ops.push_back(SubBCSrc);
5919 return true;
5920 }
5921 // Handle CONCAT(SUB0, SUB1).
5922 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5923 // cross lane shuffles.
5924 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5925 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5926 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5927 Src.getOperand(0).isUndef() &&
5928 Src.getOperand(1).getValueType() == SubVT &&
5929 Src.getConstantOperandVal(2) == 0) {
5930 for (int i = 0; i != (int)NumSubElts; ++i)
5931 Mask.push_back(i);
5932 for (int i = 0; i != (int)NumSubElts; ++i)
5933 Mask.push_back(i + NumElts);
5934 Ops.push_back(Src.getOperand(1));
5935 Ops.push_back(Sub);
5936 return true;
5937 }
5938 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5939 SmallVector<int, 64> SubMask;
5940 SmallVector<SDValue, 2> SubInputs;
5941 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5942 EVT SubSrcVT = SubSrc.getValueType();
5943 if (!SubSrcVT.isVector())
5944 return false;
5945
5946 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5947 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5948 Depth + 1, ResolveKnownElts))
5949 return false;
5950
5951 // Subvector shuffle inputs must not be larger than the subvector.
5952 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5953 return SubVT.getFixedSizeInBits() <
5954 SubInput.getValueSizeInBits().getFixedValue();
5955 }))
5956 return false;
5957
5958 if (SubMask.size() != NumSubElts) {
5959 assert(((SubMask.size() % NumSubElts) == 0 ||
5960 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5961 if ((NumSubElts % SubMask.size()) == 0) {
5962 int Scale = NumSubElts / SubMask.size();
5963 SmallVector<int,64> ScaledSubMask;
5964 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5965 SubMask = ScaledSubMask;
5966 } else {
5967 int Scale = SubMask.size() / NumSubElts;
5968 NumSubElts = SubMask.size();
5969 NumElts *= Scale;
5970 InsertIdx *= Scale;
5971 }
5972 }
5973 Ops.push_back(Src);
5974 Ops.append(SubInputs.begin(), SubInputs.end());
5975 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5976 Mask.append(NumElts, SM_SentinelZero);
5977 else
5978 for (int i = 0; i != (int)NumElts; ++i)
5979 Mask.push_back(i);
5980 for (int i = 0; i != (int)NumSubElts; ++i) {
5981 int M = SubMask[i];
5982 if (0 <= M) {
5983 int InputIdx = M / NumSubElts;
5984 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5985 }
5986 Mask[i + InsertIdx] = M;
5987 }
5988 return true;
5989 }
5990 case X86ISD::PINSRB:
5991 case X86ISD::PINSRW:
5992 case ISD::SCALAR_TO_VECTOR:
5993 case ISD::INSERT_VECTOR_ELT: {
5994 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5995 // vector, for matching src/dst vector types.
5996 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5997
5998 unsigned DstIdx = 0;
5999 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6000 // Check we have an in-range constant insertion index.
6001 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6002 N.getConstantOperandAPInt(2).uge(NumElts))
6003 return false;
6004 DstIdx = N.getConstantOperandVal(2);
6005
6006 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6007 if (X86::isZeroNode(Scl)) {
6008 Ops.push_back(N.getOperand(0));
6009 for (unsigned i = 0; i != NumElts; ++i)
6010 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6011 return true;
6012 }
6013 }
6014
6015 // Peek through trunc/aext/zext/bitcast.
6016 // TODO: aext shouldn't require SM_SentinelZero padding.
6017 // TODO: handle shift of scalars.
6018 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6019 while (Scl.getOpcode() == ISD::TRUNCATE ||
6020 Scl.getOpcode() == ISD::ANY_EXTEND ||
6021 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6022 (Scl.getOpcode() == ISD::BITCAST &&
6023 Scl.getScalarValueSizeInBits() ==
6024 Scl.getOperand(0).getScalarValueSizeInBits())) {
6025 Scl = Scl.getOperand(0);
6026 MinBitsPerElt =
6027 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6028 }
6029 if ((MinBitsPerElt % 8) != 0)
6030 return false;
6031
6032 // Attempt to find the source vector the scalar was extracted from.
6033 SDValue SrcExtract;
6034 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6035 Scl.getOpcode() == X86ISD::PEXTRW ||
6036 Scl.getOpcode() == X86ISD::PEXTRB) &&
6037 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6038 SrcExtract = Scl;
6039 }
6040 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6041 return false;
6042
6043 SDValue SrcVec = SrcExtract.getOperand(0);
6044 EVT SrcVT = SrcVec.getValueType();
6045 if (!SrcVT.getScalarType().isByteSized())
6046 return false;
6047 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6048 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6049 unsigned DstByte = DstIdx * NumBytesPerElt;
6050 MinBitsPerElt =
6051 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6052
6053 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6054 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6055 Ops.push_back(SrcVec);
6056 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6057 } else {
6058 Ops.push_back(SrcVec);
6059 Ops.push_back(N.getOperand(0));
6060 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6061 Mask.push_back(NumSizeInBytes + i);
6062 }
6063
6064 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6065 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6066 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6067 Mask[DstByte + i] = SrcByte + i;
6068 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6069 Mask[DstByte + i] = SM_SentinelZero;
6070 return true;
6071 }
6072 case X86ISD::PACKSS:
6073 case X86ISD::PACKUS: {
6074 SDValue N0 = N.getOperand(0);
6075 SDValue N1 = N.getOperand(1);
6076 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6077 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6078 "Unexpected input value type");
6079
6080 APInt EltsLHS, EltsRHS;
6081 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6082
6083 // If we know input saturation won't happen (or we don't care for particular
6084 // lanes), we can treat this as a truncation shuffle.
6085 bool Offset0 = false, Offset1 = false;
6086 if (Opcode == X86ISD::PACKSS) {
6087 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6088 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6089 (!(N1.isUndef() || EltsRHS.isZero()) &&
6090 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6091 return false;
6092 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6093 // PACKSS then it was likely being used for sign-extension for a
6094 // truncation, so just peek through and adjust the mask accordingly.
6095 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6096 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6097 Offset0 = true;
6098 N0 = N0.getOperand(0);
6099 }
6100 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6101 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6102 Offset1 = true;
6103 N1 = N1.getOperand(0);
6104 }
6105 } else {
6106 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6107 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6108 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6109 (!(N1.isUndef() || EltsRHS.isZero()) &&
6110 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6111 return false;
6112 }
6113
6114 bool IsUnary = (N0 == N1);
6115
6116 Ops.push_back(N0);
6117 if (!IsUnary)
6118 Ops.push_back(N1);
6119
6120 createPackShuffleMask(VT, Mask, IsUnary);
6121
6122 if (Offset0 || Offset1) {
6123 for (int &M : Mask)
6124 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6125 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6126 ++M;
6127 }
6128 return true;
6129 }
6130 case ISD::VSELECT:
6131 case X86ISD::BLENDV: {
6132 SDValue Cond = N.getOperand(0);
6133 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6134 Ops.push_back(N.getOperand(1));
6135 Ops.push_back(N.getOperand(2));
6136 return true;
6137 }
6138 return false;
6139 }
6140 case X86ISD::VTRUNC: {
6141 SDValue Src = N.getOperand(0);
6142 EVT SrcVT = Src.getValueType();
6143 // Truncated source must be a simple vector.
6144 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6145 (SrcVT.getScalarSizeInBits() % 8) != 0)
6146 return false;
6147 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6148 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6149 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6150 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6151 for (unsigned i = 0; i != NumSrcElts; ++i)
6152 Mask.push_back(i * Scale);
6153 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6154 Ops.push_back(Src);
6155 return true;
6156 }
6157 case X86ISD::VSHLI:
6158 case X86ISD::VSRLI: {
6159 uint64_t ShiftVal = N.getConstantOperandVal(1);
6160 // Out of range bit shifts are guaranteed to be zero.
6161 if (NumBitsPerElt <= ShiftVal) {
6162 Mask.append(NumElts, SM_SentinelZero);
6163 return true;
6164 }
6165
6166 // We can only decode 'whole byte' bit shifts as shuffles.
6167 if ((ShiftVal % 8) != 0)
6168 break;
6169
6170 uint64_t ByteShift = ShiftVal / 8;
6171 Ops.push_back(N.getOperand(0));
6172
6173 // Clear mask to all zeros and insert the shifted byte indices.
6174 Mask.append(NumSizeInBytes, SM_SentinelZero);
6175
6176 if (X86ISD::VSHLI == Opcode) {
6177 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6178 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6179 Mask[i + j] = i + j - ByteShift;
6180 } else {
6181 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6182 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6183 Mask[i + j - ByteShift] = i + j;
6184 }
6185 return true;
6186 }
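// A worked example of the byte-level mask built above (assumed v4i32 input):
// for X86ISD::VSHLI with ShiftVal = 16, ByteShift = 2 and the 16-byte mask is
//   {Z, Z, 0, 1,  Z, Z, 4, 5,  Z, Z, 8, 9,  Z, Z, 12, 13}
// (Z = SM_SentinelZero), i.e. each 32-bit lane's two low source bytes move up
// by two byte positions and the vacated low bytes are known zero.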
6187 case X86ISD::VROTLI:
6188 case X86ISD::VROTRI: {
6189 // We can only decode 'whole byte' bit rotates as shuffles.
6190 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6191 if ((RotateVal % 8) != 0)
6192 return false;
6193 Ops.push_back(N.getOperand(0));
6194 int Offset = RotateVal / 8;
6195 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6196 for (int i = 0; i != (int)NumElts; ++i) {
6197 int BaseIdx = i * NumBytesPerElt;
6198 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6199 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6200 }
6201 }
6202 return true;
6203 }
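// A worked example (assumed v4i32 input): X86ISD::VROTLI by 8 bits gives
// Offset = 4 - 1 = 3, so each dword's byte mask is {3, 0, 1, 2} - byte 0 of
// the result is byte 3 of the source, matching a 32-bit rotate-left by 8 on a
// little-endian lane. X86ISD::VROTRI by 8 would instead produce {1, 2, 3, 0}.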
6204 case X86ISD::VBROADCAST: {
6205 SDValue Src = N.getOperand(0);
6206 if (!Src.getSimpleValueType().isVector()) {
6207 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6208 !isNullConstant(Src.getOperand(1)) ||
6209 Src.getOperand(0).getValueType().getScalarType() !=
6210 VT.getScalarType())
6211 return false;
6212 Src = Src.getOperand(0);
6213 }
6214 Ops.push_back(Src);
6215 Mask.append(NumElts, 0);
6216 return true;
6217 }
6218 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6219 SDValue Src = N.getOperand(0);
6220 EVT SrcVT = Src.getValueType();
6221 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6222
6223 // Extended source must be a simple vector.
6224 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6225 (NumBitsPerSrcElt % 8) != 0)
6226 return false;
6227
6228 // We can only handle all-signbits extensions.
6229 APInt DemandedSrcElts =
6230 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6231 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6232 return false;
6233
6234 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6235 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6236 for (unsigned I = 0; I != NumElts; ++I)
6237 Mask.append(Scale, I);
6238 Ops.push_back(Src);
6239 return true;
6240 }
6241 case ISD::ZERO_EXTEND:
6242 case ISD::ANY_EXTEND:
6243 case ISD::ZERO_EXTEND_VECTOR_INREG:
6244 case ISD::ANY_EXTEND_VECTOR_INREG: {
6245 SDValue Src = N.getOperand(0);
6246 EVT SrcVT = Src.getValueType();
6247
6248 // Extended source must be a simple vector.
6249 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6250 (SrcVT.getScalarSizeInBits() % 8) != 0)
6251 return false;
6252
6253 bool IsAnyExtend =
6254 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6255 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6256 IsAnyExtend, Mask);
6257 Ops.push_back(Src);
6258 return true;
6259 }
6260 }
6261
6262 return false;
6263}
6264
6265/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6267 SmallVectorImpl<int> &Mask) {
6268 int MaskWidth = Mask.size();
6269 SmallVector<SDValue, 16> UsedInputs;
6270 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6271 int lo = UsedInputs.size() * MaskWidth;
6272 int hi = lo + MaskWidth;
6273
6274 // Strip UNDEF input usage.
6275 if (Inputs[i].isUndef())
6276 for (int &M : Mask)
6277 if ((lo <= M) && (M < hi))
6278 M = SM_SentinelUndef;
6279
6280 // Check for unused inputs.
6281 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6282 for (int &M : Mask)
6283 if (lo <= M)
6284 M -= MaskWidth;
6285 continue;
6286 }
6287
6288 // Check for repeated inputs.
6289 bool IsRepeat = false;
6290 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6291 if (UsedInputs[j] != Inputs[i])
6292 continue;
6293 for (int &M : Mask)
6294 if (lo <= M)
6295 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6296 IsRepeat = true;
6297 break;
6298 }
6299 if (IsRepeat)
6300 continue;
6301
6302 UsedInputs.push_back(Inputs[i]);
6303 }
6304 Inputs = UsedInputs;
6305}
6306
6307/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6308/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6309/// Returns true if the target shuffle mask was decoded.
6310static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6311 SmallVectorImpl<SDValue> &Inputs,
6312 SmallVectorImpl<int> &Mask,
6313 APInt &KnownUndef, APInt &KnownZero,
6314 const SelectionDAG &DAG, unsigned Depth,
6315 bool ResolveKnownElts) {
6316 if (Depth >= SelectionDAG::MaxRecursionDepth)
6317 return false; // Limit search depth.
6318
6319 EVT VT = Op.getValueType();
6320 if (!VT.isSimple() || !VT.isVector())
6321 return false;
6322
6323 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6324 if (ResolveKnownElts)
6325 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6326 return true;
6327 }
6328 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6329 ResolveKnownElts)) {
6330 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6331 return true;
6332 }
6333 return false;
6334}
6335
6336static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6337 SmallVectorImpl<SDValue> &Inputs,
6338 SmallVectorImpl<int> &Mask,
6339 const SelectionDAG &DAG, unsigned Depth,
6340 bool ResolveKnownElts) {
6341 APInt KnownUndef, KnownZero;
6342 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6343 KnownZero, DAG, Depth, ResolveKnownElts);
6344}
6345
6346 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6347 SmallVectorImpl<int> &Mask,
6348 const SelectionDAG &DAG, unsigned Depth = 0,
6349 bool ResolveKnownElts = true) {
6350 EVT VT = Op.getValueType();
6351 if (!VT.isSimple() || !VT.isVector())
6352 return false;
6353
6354 unsigned NumElts = Op.getValueType().getVectorNumElements();
6355 APInt DemandedElts = APInt::getAllOnes(NumElts);
6356 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6357 ResolveKnownElts);
6358}
6359
6360// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6361static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6362 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6363 SelectionDAG &DAG) {
6364 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6365 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6366 "Unknown broadcast load type");
6367
6368 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6369 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6370 return SDValue();
6371
6372 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6373 TypeSize::getFixed(Offset), DL);
6374 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6375 SDValue Ops[] = {Mem->getChain(), Ptr};
6376 SDValue BcstLd = DAG.getMemIntrinsicNode(
6377 Opcode, DL, Tys, Ops, MemVT,
6378 DAG.getMachineFunction().getMachineMemOperand(
6379 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6380 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6381 return BcstLd;
6382}
6383
6384/// Returns the scalar element that will make up the i'th
6385/// element of the result of the vector shuffle.
6386 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6387 SelectionDAG &DAG, unsigned Depth) {
6388 if (Depth >= SelectionDAG::MaxRecursionDepth)
6389 return SDValue(); // Limit search depth.
6390
6391 EVT VT = Op.getValueType();
6392 unsigned Opcode = Op.getOpcode();
6393 unsigned NumElems = VT.getVectorNumElements();
6394
6395 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6396 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6397 int Elt = SV->getMaskElt(Index);
6398
6399 if (Elt < 0)
6400 return DAG.getUNDEF(VT.getVectorElementType());
6401
6402 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6403 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6404 }
6405
6406 // Recurse into target specific vector shuffles to find scalars.
6407 if (isTargetShuffle(Opcode)) {
6408 MVT ShufVT = VT.getSimpleVT();
6409 MVT ShufSVT = ShufVT.getVectorElementType();
6410 int NumElems = (int)ShufVT.getVectorNumElements();
6411 SmallVector<int, 16> ShuffleMask;
6412 SmallVector<SDValue, 16> ShuffleOps;
6413 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6414 return SDValue();
6415
6416 int Elt = ShuffleMask[Index];
6417 if (Elt == SM_SentinelZero)
6418 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6419 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6420 if (Elt == SM_SentinelUndef)
6421 return DAG.getUNDEF(ShufSVT);
6422
6423 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6424 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6425 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6426 }
6427
6428 // Recurse into insert_subvector base/sub vector to find scalars.
6429 if (Opcode == ISD::INSERT_SUBVECTOR) {
6430 SDValue Vec = Op.getOperand(0);
6431 SDValue Sub = Op.getOperand(1);
6432 uint64_t SubIdx = Op.getConstantOperandVal(2);
6433 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6434
6435 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6436 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6437 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6438 }
6439
6440 // Recurse into concat_vectors sub vector to find scalars.
6441 if (Opcode == ISD::CONCAT_VECTORS) {
6442 EVT SubVT = Op.getOperand(0).getValueType();
6443 unsigned NumSubElts = SubVT.getVectorNumElements();
6444 uint64_t SubIdx = Index / NumSubElts;
6445 uint64_t SubElt = Index % NumSubElts;
6446 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6447 }
6448
6449 // Recurse into extract_subvector src vector to find scalars.
6450 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6451 SDValue Src = Op.getOperand(0);
6452 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6453 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6454 }
6455
6456 // We only peek through bitcasts of the same vector width.
6457 if (Opcode == ISD::BITCAST) {
6458 SDValue Src = Op.getOperand(0);
6459 EVT SrcVT = Src.getValueType();
6460 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6461 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6462 return SDValue();
6463 }
6464
6465 // Actual nodes that may contain scalar elements
6466
6467 // For insert_vector_elt - either return the index matching scalar or recurse
6468 // into the base vector.
6469 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6470 isa<ConstantSDNode>(Op.getOperand(2))) {
6471 if (Op.getConstantOperandAPInt(2) == Index)
6472 return Op.getOperand(1);
6473 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6474 }
6475
6476 if (Opcode == ISD::SCALAR_TO_VECTOR)
6477 return (Index == 0) ? Op.getOperand(0)
6478 : DAG.getUNDEF(VT.getVectorElementType());
6479
6480 if (Opcode == ISD::BUILD_VECTOR)
6481 return Op.getOperand(Index);
6482
6483 return SDValue();
6484}
6485
6486// Use PINSRB/PINSRW/PINSRD to create a build vector.
6487 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6488 const APInt &NonZeroMask,
6489 unsigned NumNonZero, unsigned NumZero,
6490 SelectionDAG &DAG,
6491 const X86Subtarget &Subtarget) {
6492 MVT VT = Op.getSimpleValueType();
6493 unsigned NumElts = VT.getVectorNumElements();
6494 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6495 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6496 "Illegal vector insertion");
6497
6498 SDValue V;
6499 bool First = true;
6500
6501 for (unsigned i = 0; i < NumElts; ++i) {
6502 bool IsNonZero = NonZeroMask[i];
6503 if (!IsNonZero)
6504 continue;
6505
6506 // If the build vector contains zeros or our first insertion is not the
6507 // first index, then insert into a zero vector to break any register
6508 // dependency; else use SCALAR_TO_VECTOR.
6509 if (First) {
6510 First = false;
6511 if (NumZero || 0 != i)
6512 V = getZeroVector(VT, Subtarget, DAG, DL);
6513 else {
6514 assert(0 == i && "Expected insertion into zero-index");
6515 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6516 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6517 V = DAG.getBitcast(VT, V);
6518 continue;
6519 }
6520 }
6521 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6522 DAG.getIntPtrConstant(i, DL));
6523 }
6524
6525 return V;
6526}
6527
6528/// Custom lower build_vector of v16i8.
6529 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6530 const APInt &NonZeroMask,
6531 unsigned NumNonZero, unsigned NumZero,
6532 SelectionDAG &DAG,
6533 const X86Subtarget &Subtarget) {
6534 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6535 return SDValue();
6536
6537 // SSE4.1 - use PINSRB to insert each byte directly.
6538 if (Subtarget.hasSSE41())
6539 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6540 DAG, Subtarget);
6541
6542 SDValue V;
6543
6544 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6545 // If both of the lowest two 16-bit halves are non-zero, then convert to MOVD.
6546 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6547 !NonZeroMask.extractBits(2, 2).isZero()) {
6548 for (unsigned I = 0; I != 4; ++I) {
6549 if (!NonZeroMask[I])
6550 continue;
6551 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6552 if (I != 0)
6553 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6554 DAG.getConstant(I * 8, DL, MVT::i8));
6555 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6556 }
6557 assert(V && "Failed to fold v16i8 vector to zero");
6558 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6559 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6560 V = DAG.getBitcast(MVT::v8i16, V);
6561 }
6562 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6563 bool ThisIsNonZero = NonZeroMask[i];
6564 bool NextIsNonZero = NonZeroMask[i + 1];
6565 if (!ThisIsNonZero && !NextIsNonZero)
6566 continue;
6567
6568 SDValue Elt;
6569 if (ThisIsNonZero) {
6570 if (NumZero || NextIsNonZero)
6571 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6572 else
6573 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6574 }
6575
6576 if (NextIsNonZero) {
6577 SDValue NextElt = Op.getOperand(i + 1);
6578 if (i == 0 && NumZero)
6579 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6580 else
6581 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6582 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6583 DAG.getConstant(8, DL, MVT::i8));
6584 if (ThisIsNonZero)
6585 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6586 else
6587 Elt = NextElt;
6588 }
6589
6590 // If our first insertion is not the first index or zeros are needed, then
6591 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6592 // elements undefined).
6593 if (!V) {
6594 if (i != 0 || NumZero)
6595 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6596 else {
6597 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6598 V = DAG.getBitcast(MVT::v8i16, V);
6599 continue;
6600 }
6601 }
6602 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6603 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6604 DAG.getIntPtrConstant(i / 2, DL));
6605 }
6606
6607 return DAG.getBitcast(MVT::v16i8, V);
6608}
6609
6610/// Custom lower build_vector of v8i16.
6611 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6612 const APInt &NonZeroMask,
6613 unsigned NumNonZero, unsigned NumZero,
6614 SelectionDAG &DAG,
6615 const X86Subtarget &Subtarget) {
6616 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6617 return SDValue();
6618
6619 // Use PINSRW to insert each element directly.
6620 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6621 Subtarget);
6622}
6623
6624/// Custom lower build_vector of v4i32 or v4f32.
6625 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6626 SelectionDAG &DAG,
6627 const X86Subtarget &Subtarget) {
6628 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6629 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6630 // Because we're creating a less complicated build vector here, we may enable
6631 // further folding of the MOVDDUP via shuffle transforms.
6632 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6633 Op.getOperand(0) == Op.getOperand(2) &&
6634 Op.getOperand(1) == Op.getOperand(3) &&
6635 Op.getOperand(0) != Op.getOperand(1)) {
6636 MVT VT = Op.getSimpleValueType();
6637 MVT EltVT = VT.getVectorElementType();
6638 // Create a new build vector with the first 2 elements followed by undef
6639 // padding, bitcast to v2f64, duplicate, and bitcast back.
6640 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6641 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6642 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6643 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6644 return DAG.getBitcast(VT, Dup);
6645 }
6646
6647 // Find all zeroable elements.
6648 std::bitset<4> Zeroable, Undefs;
6649 for (int i = 0; i < 4; ++i) {
6650 SDValue Elt = Op.getOperand(i);
6651 Undefs[i] = Elt.isUndef();
6652 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6653 }
6654 assert(Zeroable.size() - Zeroable.count() > 1 &&
6655 "We expect at least two non-zero elements!");
6656
6657 // We only know how to deal with build_vector nodes where elements are either
6658 // zeroable or extract_vector_elt with constant index.
6659 SDValue FirstNonZero;
6660 unsigned FirstNonZeroIdx;
6661 for (unsigned i = 0; i < 4; ++i) {
6662 if (Zeroable[i])
6663 continue;
6664 SDValue Elt = Op.getOperand(i);
6665 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6666 !isa<ConstantSDNode>(Elt.getOperand(1)))
6667 return SDValue();
6668 // Make sure that this node is extracting from a 128-bit vector.
6669 MVT VT = Elt.getOperand(0).getSimpleValueType();
6670 if (!VT.is128BitVector())
6671 return SDValue();
6672 if (!FirstNonZero.getNode()) {
6673 FirstNonZero = Elt;
6674 FirstNonZeroIdx = i;
6675 }
6676 }
6677
6678 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6679 SDValue V1 = FirstNonZero.getOperand(0);
6680 MVT VT = V1.getSimpleValueType();
6681
6682 // See if this build_vector can be lowered as a blend with zero.
6683 SDValue Elt;
6684 unsigned EltMaskIdx, EltIdx;
6685 int Mask[4];
6686 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6687 if (Zeroable[EltIdx]) {
6688 // The zero vector will be on the right hand side.
6689 Mask[EltIdx] = EltIdx+4;
6690 continue;
6691 }
6692
6693 Elt = Op->getOperand(EltIdx);
6694 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6695 EltMaskIdx = Elt.getConstantOperandVal(1);
6696 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6697 break;
6698 Mask[EltIdx] = EltIdx;
6699 }
6700
6701 if (EltIdx == 4) {
6702 // Let the shuffle legalizer deal with blend operations.
6703 SDValue VZeroOrUndef = (Zeroable == Undefs)
6704 ? DAG.getUNDEF(VT)
6705 : getZeroVector(VT, Subtarget, DAG, DL);
6706 if (V1.getSimpleValueType() != VT)
6707 V1 = DAG.getBitcast(VT, V1);
6708 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6709 }
6710
6711 // See if we can lower this build_vector to a INSERTPS.
6712 if (!Subtarget.hasSSE41())
6713 return SDValue();
6714
6715 SDValue V2 = Elt.getOperand(0);
6716 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6717 V1 = SDValue();
6718
6719 bool CanFold = true;
6720 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6721 if (Zeroable[i])
6722 continue;
6723
6724 SDValue Current = Op->getOperand(i);
6725 SDValue SrcVector = Current->getOperand(0);
6726 if (!V1.getNode())
6727 V1 = SrcVector;
6728 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6729 }
6730
6731 if (!CanFold)
6732 return SDValue();
6733
6734 assert(V1.getNode() && "Expected at least two non-zero elements!");
6735 if (V1.getSimpleValueType() != MVT::v4f32)
6736 V1 = DAG.getBitcast(MVT::v4f32, V1);
6737 if (V2.getSimpleValueType() != MVT::v4f32)
6738 V2 = DAG.getBitcast(MVT::v4f32, V2);
6739
6740 // Ok, we can emit an INSERTPS instruction.
6741 unsigned ZMask = Zeroable.to_ulong();
6742
6743 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6744 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6745 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6746 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6747 return DAG.getBitcast(VT, Result);
6748}
6749
6750/// Return a vector logical shift node.
6751static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6752 SelectionDAG &DAG, const TargetLowering &TLI,
6753 const SDLoc &dl) {
6754 assert(VT.is128BitVector() && "Unknown type for VShift");
6755 MVT ShVT = MVT::v16i8;
6756 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6757 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6758 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6759 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6760 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6761}
6762
6763 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6764 SelectionDAG &DAG) {
6765
6766 // Check if the scalar load can be widened into a vector load. And if
6767 // the address is "base + cst" see if the cst can be "absorbed" into
6768 // the shuffle mask.
6769 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6770 SDValue Ptr = LD->getBasePtr();
6771 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6772 return SDValue();
6773 EVT PVT = LD->getValueType(0);
6774 if (PVT != MVT::i32 && PVT != MVT::f32)
6775 return SDValue();
6776
6777 int FI = -1;
6778 int64_t Offset = 0;
6779 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6780 FI = FINode->getIndex();
6781 Offset = 0;
6782 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6783 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6784 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6785 Offset = Ptr.getConstantOperandVal(1);
6786 Ptr = Ptr.getOperand(0);
6787 } else {
6788 return SDValue();
6789 }
6790
6791 // FIXME: 256-bit vector instructions don't require a strict alignment,
6792 // improve this code to support it better.
6793 Align RequiredAlign(VT.getSizeInBits() / 8);
6794 SDValue Chain = LD->getChain();
6795 // Make sure the stack object alignment is at least 16 or 32.
6796 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6797 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6798 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6799 if (MFI.isFixedObjectIndex(FI)) {
6800 // Can't change the alignment. FIXME: It's possible to compute
6801 // the exact stack offset and reference FI + adjust offset instead,
6802 // if someone *really* cares about this; that's the way to implement it.
6803 return SDValue();
6804 } else {
6805 MFI.setObjectAlignment(FI, RequiredAlign);
6806 }
6807 }
6808
6809 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6810 // Ptr + (Offset & ~15).
6811 if (Offset < 0)
6812 return SDValue();
6813 if ((Offset % RequiredAlign.value()) & 3)
6814 return SDValue();
6815 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6816 if (StartOffset) {
6817 SDLoc DL(Ptr);
6818 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6819 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6820 }
6821
6822 int EltNo = (Offset - StartOffset) >> 2;
6823 unsigned NumElems = VT.getVectorNumElements();
6824
6825 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6826 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6827 LD->getPointerInfo().getWithOffset(StartOffset));
6828
6829 SmallVector<int, 8> Mask(NumElems, EltNo);
6830
6831 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6832 }
6833
6834 return SDValue();
6835}
6836
6837 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
6838static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6839 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6840 auto *BaseLd = cast<LoadSDNode>(Elt);
6841 if (!BaseLd->isSimple())
6842 return false;
6843 Ld = BaseLd;
6844 ByteOffset = 0;
6845 return true;
6846 }
6847
6848 switch (Elt.getOpcode()) {
6849 case ISD::BITCAST:
6850 case ISD::TRUNCATE:
6851 case ISD::SCALAR_TO_VECTOR:
6852 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6853 case ISD::SRL:
6854 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6855 uint64_t Amt = AmtC->getZExtValue();
6856 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6857 ByteOffset += Amt / 8;
6858 return true;
6859 }
6860 }
6861 break;
6862 case ISD::EXTRACT_VECTOR_ELT:
6863 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6864 SDValue Src = Elt.getOperand(0);
6865 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6866 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6867 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6868 findEltLoadSrc(Src, Ld, ByteOffset)) {
6869 uint64_t Idx = IdxC->getZExtValue();
6870 ByteOffset += Idx * (SrcSizeInBits / 8);
6871 return true;
6872 }
6873 }
6874 break;
6875 }
6876
6877 return false;
6878}
6879
6880/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6881/// elements can be replaced by a single large load which has the same value as
6882/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6883///
6884/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6885 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6886 const SDLoc &DL, SelectionDAG &DAG,
6887 const X86Subtarget &Subtarget,
6888 bool IsAfterLegalize) {
6889 if ((VT.getScalarSizeInBits() % 8) != 0)
6890 return SDValue();
6891
6892 unsigned NumElems = Elts.size();
6893
6894 int LastLoadedElt = -1;
6895 APInt LoadMask = APInt::getZero(NumElems);
6896 APInt ZeroMask = APInt::getZero(NumElems);
6897 APInt UndefMask = APInt::getZero(NumElems);
6898
6899 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6900 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6901
6902 // For each element in the initializer, see if we've found a load, zero or an
6903 // undef.
6904 for (unsigned i = 0; i < NumElems; ++i) {
6905 SDValue Elt = peekThroughBitcasts(Elts[i]);
6906 if (!Elt.getNode())
6907 return SDValue();
6908 if (Elt.isUndef()) {
6909 UndefMask.setBit(i);
6910 continue;
6911 }
6912 if (X86::isZeroNode(Elt)) {
6913 ZeroMask.setBit(i);
6914 continue;
6915 }
6916
6917 // Each loaded element must be the correct fractional portion of the
6918 // requested vector load.
6919 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6920 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6921 return SDValue();
6922
6923 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6924 return SDValue();
6925 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6926 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6927 return SDValue();
6928
6929 LoadMask.setBit(i);
6930 LastLoadedElt = i;
6931 }
6932 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6933 NumElems &&
6934 "Incomplete element masks");
6935
6936 // Handle Special Cases - all undef or undef/zero.
6937 if (UndefMask.popcount() == NumElems)
6938 return DAG.getUNDEF(VT);
6939 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6940 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6941 : DAG.getConstantFP(0.0, DL, VT);
6942
6943 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6944 int FirstLoadedElt = LoadMask.countr_zero();
6945 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6946 EVT EltBaseVT = EltBase.getValueType();
6947 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6948 "Register/Memory size mismatch");
6949 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6950 assert(LDBase && "Did not find base load for merging consecutive loads");
6951 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6952 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6953 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6954 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6955 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6956
6957 // TODO: Support offsetting the base load.
6958 if (ByteOffsets[FirstLoadedElt] != 0)
6959 return SDValue();
6960
6961 // Check to see if the element's load is consecutive to the base load
6962 // or offset from a previous (already checked) load.
6963 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6964 LoadSDNode *Ld = Loads[EltIdx];
6965 int64_t ByteOffset = ByteOffsets[EltIdx];
6966 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6967 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6968 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6969 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6970 }
6971 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6972 EltIdx - FirstLoadedElt);
6973 };
6974
6975 // Consecutive loads can contain UNDEFs but not ZERO elements.
6976 // Consecutive loads with UNDEF and ZERO elements require an
6977 // additional shuffle stage to clear the ZERO elements.
6978 bool IsConsecutiveLoad = true;
6979 bool IsConsecutiveLoadWithZeros = true;
6980 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6981 if (LoadMask[i]) {
6982 if (!CheckConsecutiveLoad(LDBase, i)) {
6983 IsConsecutiveLoad = false;
6984 IsConsecutiveLoadWithZeros = false;
6985 break;
6986 }
6987 } else if (ZeroMask[i]) {
6988 IsConsecutiveLoad = false;
6989 }
6990 }
6991
6992 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6993 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6994 assert(LDBase->isSimple() &&
6995 "Cannot merge volatile or atomic loads.");
6996 SDValue NewLd =
6997 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6998 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6999 MMOFlags);
7000 for (auto *LD : Loads)
7001 if (LD)
7002 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7003 return NewLd;
7004 };
7005
7006 // Check if the base load is entirely dereferenceable.
7007 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7008 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7009
7010 // LOAD - all consecutive load/undefs (must start/end with a load or be
7011 // entirely dereferenceable). If we have found an entire vector of loads and
7012 // undefs, then return a large load of the entire vector width starting at the
7013 // base pointer. If the vector contains zeros, then attempt to shuffle those
7014 // elements.
7015 if (FirstLoadedElt == 0 &&
7016 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7017 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7018 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7019 return SDValue();
7020
7021 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7022 // will lower to regular temporal loads and use the cache.
7023 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7024 VT.is256BitVector() && !Subtarget.hasInt256())
7025 return SDValue();
7026
7027 if (NumElems == 1)
7028 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7029
7030 if (!ZeroMask)
7031 return CreateLoad(VT, LDBase);
7032
7033 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7034 // vector and a zero vector to clear out the zero elements.
7035 if (!IsAfterLegalize && VT.isVector()) {
7036 unsigned NumMaskElts = VT.getVectorNumElements();
7037 if ((NumMaskElts % NumElems) == 0) {
7038 unsigned Scale = NumMaskElts / NumElems;
7039 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7040 for (unsigned i = 0; i < NumElems; ++i) {
7041 if (UndefMask[i])
7042 continue;
7043 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7044 for (unsigned j = 0; j != Scale; ++j)
7045 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7046 }
7047 SDValue V = CreateLoad(VT, LDBase);
7048 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7049 : DAG.getConstantFP(0.0, DL, VT);
7050 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7051 }
7052 }
7053 }
7054
7055 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7056 if (VT.is256BitVector() || VT.is512BitVector()) {
7057 unsigned HalfNumElems = NumElems / 2;
7058 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7059 EVT HalfVT =
7060 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7061 SDValue HalfLD =
7062 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7063 DAG, Subtarget, IsAfterLegalize);
7064 if (HalfLD)
7065 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7066 HalfLD, DAG.getIntPtrConstant(0, DL));
7067 }
7068 }
7069
7070 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7071 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7072 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7073 LoadSizeInBits == 64) &&
7074 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7075 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7076 : MVT::getIntegerVT(LoadSizeInBits);
7077 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7078 // Allow v4f32 on SSE1 only targets.
7079 // FIXME: Add more isel patterns so we can just use VT directly.
7080 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7081 VecVT = MVT::v4f32;
7082 if (TLI.isTypeLegal(VecVT)) {
7083 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7084 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7085 SDValue ResNode = DAG.getMemIntrinsicNode(
7086 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7087 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7088 for (auto *LD : Loads)
7089 if (LD)
7090 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7091 return DAG.getBitcast(VT, ResNode);
7092 }
7093 }
7094
7095 // BROADCAST - match the smallest possible repetition pattern, load that
7096 // scalar/subvector element and then broadcast to the entire vector.
7097 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7098 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7099 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7100 unsigned RepeatSize = SubElems * BaseSizeInBits;
7101 unsigned ScalarSize = std::min(RepeatSize, 64u);
7102 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7103 continue;
7104
7105 // Don't attempt a 1:N subvector broadcast - it should be caught by
7106 // combineConcatVectorOps, otherwise it will cause infinite loops.
7107 if (RepeatSize > ScalarSize && SubElems == 1)
7108 continue;
7109
7110 bool Match = true;
7111 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7112 for (unsigned i = 0; i != NumElems && Match; ++i) {
7113 if (!LoadMask[i])
7114 continue;
7115 SDValue Elt = peekThroughBitcasts(Elts[i]);
7116 if (RepeatedLoads[i % SubElems].isUndef())
7117 RepeatedLoads[i % SubElems] = Elt;
7118 else
7119 Match &= (RepeatedLoads[i % SubElems] == Elt);
7120 }
7121
7122 // We must have loads at both ends of the repetition.
7123 Match &= !RepeatedLoads.front().isUndef();
7124 Match &= !RepeatedLoads.back().isUndef();
7125 if (!Match)
7126 continue;
7127
7128 EVT RepeatVT =
7129 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7130 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7131 : EVT::getFloatingPointVT(ScalarSize);
7132 if (RepeatSize > ScalarSize)
7133 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7134 RepeatSize / ScalarSize);
7135 EVT BroadcastVT =
7136 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7137 VT.getSizeInBits() / ScalarSize);
7138 if (TLI.isTypeLegal(BroadcastVT)) {
7139 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7140 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7141 SDValue Broadcast = RepeatLoad;
7142 if (RepeatSize > ScalarSize) {
7143 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7144 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7145 } else {
7146 if (!Subtarget.hasAVX2() &&
7147 !X86::mayFoldLoadIntoBroadcastFromMem(
7148 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7149 Subtarget,
7150 /*AssumeSingleUse=*/true))
7151 return SDValue();
7152 Broadcast =
7153 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7154 }
7155 return DAG.getBitcast(VT, Broadcast);
7156 }
7157 }
7158 }
7159 }
7160
7161 return SDValue();
7162}
7163
7164 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
7165 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7166 // are consecutive, non-overlapping, and in the right order.
7167 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7168 SelectionDAG &DAG,
7169 const X86Subtarget &Subtarget,
7170 bool IsAfterLegalize) {
7171 SmallVector<SDValue, 64> Elts;
7172 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7173 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7174 Elts.push_back(Elt);
7175 continue;
7176 }
7177 return SDValue();
7178 }
7179 assert(Elts.size() == VT.getVectorNumElements());
7180 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7181 IsAfterLegalize);
7182}
7183
7184 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7185 const APInt &Undefs, LLVMContext &C) {
7186 unsigned ScalarSize = VT.getScalarSizeInBits();
7187 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7188
7189 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7190 if (VT.isFloatingPoint()) {
7191 if (ScalarSize == 16)
7192 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7193 if (ScalarSize == 32)
7194 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7195 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7196 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7197 }
7198 return Constant::getIntegerValue(Ty, Val);
7199 };
7200
7201 SmallVector<Constant *, 32> ConstantVec;
7202 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7203 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7204 : getConstantScalar(Bits[I]));
7205
7206 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7207}
7208
7209static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7210 unsigned SplatBitSize, LLVMContext &C) {
7211 unsigned ScalarSize = VT.getScalarSizeInBits();
7212
7213 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7214 if (VT.isFloatingPoint()) {
7215 if (ScalarSize == 16)
7216 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7217 if (ScalarSize == 32)
7218 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7219 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7220 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7221 }
7222 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7223 };
7224
7225 if (ScalarSize == SplatBitSize)
7226 return getConstantScalar(SplatValue);
7227
7228 unsigned NumElm = SplatBitSize / ScalarSize;
7229 SmallVector<Constant *, 32> ConstantVec;
7230 for (unsigned I = 0; I != NumElm; ++I) {
7231 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7232 ConstantVec.push_back(getConstantScalar(Val));
7233 }
7234 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7235}
7236
7237 static bool isFoldableUseOfShuffle(SDNode *N) {
7238 for (auto *U : N->uses()) {
7239 unsigned Opc = U->getOpcode();
7240 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7241 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7242 return false;
7243 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7244 return false;
7245 if (isTargetShuffle(Opc))
7246 return true;
7247 if (Opc == ISD::BITCAST) // Ignore bitcasts
7248 return isFoldableUseOfShuffle(U);
7249 if (N->hasOneUse()) {
7250 // TODO: There may be some general way to know if an SDNode can
7251 // be folded. For now we only know whether an MI is foldable.
7252 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7253 return false;
7254 return true;
7255 }
7256 }
7257 return false;
7258}
7259
7260/// Attempt to use the vbroadcast instruction to generate a splat value
7261/// from a splat BUILD_VECTOR which uses:
7262/// a. A single scalar load, or a constant.
7263/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7264///
7265/// The VBROADCAST node is returned when a pattern is found,
7266/// or SDValue() otherwise.
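/// As a rough sketch, a splat such as
///   (v8f32 build_vector (load addr), (load addr), ..., (load addr))
/// or a repeated constant pattern is expected to become a single
/// X86ISD::VBROADCAST / VBROADCAST_LOAD of the scalar on AVX targets.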
7267 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7268 const SDLoc &dl,
7269 const X86Subtarget &Subtarget,
7270 SelectionDAG &DAG) {
7271 // VBROADCAST requires AVX.
7272 // TODO: Splats could be generated for non-AVX CPUs using SSE
7273 // instructions, but there's less potential gain for only 128-bit vectors.
7274 if (!Subtarget.hasAVX())
7275 return SDValue();
7276
7277 MVT VT = BVOp->getSimpleValueType(0);
7278 unsigned NumElts = VT.getVectorNumElements();
7279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7280 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7281 "Unsupported vector type for broadcast.");
7282
7283 // See if the build vector is a repeating sequence of scalars (inc. splat).
7284 SDValue Ld;
7285 BitVector UndefElements;
7286 SmallVector<SDValue, 16> Sequence;
7287 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7288 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7289 if (Sequence.size() == 1)
7290 Ld = Sequence[0];
7291 }
7292
7293 // Attempt to use VBROADCASTM
7294 // From this pattern:
7295 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7296 // b. t1 = (build_vector t0 t0)
7297 //
7298 // Create (VBROADCASTM v2i1 X)
7299 if (!Sequence.empty() && Subtarget.hasCDI()) {
7300 // If not a splat, are the upper sequence values zeroable?
7301 unsigned SeqLen = Sequence.size();
7302 bool UpperZeroOrUndef =
7303 SeqLen == 1 ||
7304 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7305 return !V || V.isUndef() || isNullConstant(V);
7306 });
7307 SDValue Op0 = Sequence[0];
7308 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7309 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7310 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7311 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7312 ? Op0.getOperand(0)
7313 : Op0.getOperand(0).getOperand(0);
7314 MVT MaskVT = BOperand.getSimpleValueType();
7315 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7316 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7317 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7318 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7319 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7320 unsigned Scale = 512 / VT.getSizeInBits();
7321 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7322 }
7323 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7324 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7325 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7326 return DAG.getBitcast(VT, Bcst);
7327 }
7328 }
7329 }
7330
7331 unsigned NumUndefElts = UndefElements.count();
7332 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7333 APInt SplatValue, Undef;
7334 unsigned SplatBitSize;
7335 bool HasUndef;
7336 // Check if this is a repeated constant pattern suitable for broadcasting.
7337 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7338 SplatBitSize > VT.getScalarSizeInBits() &&
7339 SplatBitSize < VT.getSizeInBits()) {
7340 // Avoid replacing with broadcast when it's a use of a shuffle
7341 // instruction to preserve the present custom lowering of shuffles.
7342 if (isFoldableUseOfShuffle(BVOp))
7343 return SDValue();
7344 // replace BUILD_VECTOR with broadcast of the repeated constants.
7345 LLVMContext *Ctx = DAG.getContext();
7346 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7347 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7348 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7349 // Load the constant scalar/subvector and broadcast it.
7350 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7351 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7352 SDValue CP = DAG.getConstantPool(C, PVT);
7353 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7354
7355 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7356 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7357 SDValue Ops[] = {DAG.getEntryNode(), CP};
7358 MachinePointerInfo MPI =
7359 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7360 SDValue Brdcst =
7361 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7362 MPI, Alignment, MachineMemOperand::MOLoad);
7363 return DAG.getBitcast(VT, Brdcst);
7364 }
7365 if (SplatBitSize > 64) {
7366 // Load the vector of constants and broadcast it.
7367 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7368 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7369 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7370 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7371 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7372 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7373 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7374 MachinePointerInfo MPI =
7375 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7376 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7377 Ops, VVT, MPI, Alignment,
7378 MachineMemOperand::MOLoad);
7379 }
7380 }
7381
7382 // If we are moving a scalar into a vector (Ld must be set and all elements
7383 // but 1 are undef) and that operation is not obviously supported by
7384 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7385 // That's better than general shuffling and may eliminate a load to GPR and
7386 // move from scalar to vector register.
7387 if (!Ld || NumElts - NumUndefElts != 1)
7388 return SDValue();
7389 unsigned ScalarSize = Ld.getValueSizeInBits();
7390 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7391 return SDValue();
7392 }
7393
7394 bool ConstSplatVal =
7395 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7396 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7397
7398 // TODO: Handle broadcasts of non-constant sequences.
7399
7400 // Make sure that all of the users of a non-constant load are from the
7401 // BUILD_VECTOR node.
7402 // FIXME: Is the use count needed for non-constant, non-load case?
7403 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7404 return SDValue();
7405
7406 unsigned ScalarSize = Ld.getValueSizeInBits();
7407 bool IsGE256 = (VT.getSizeInBits() >= 256);
7408
7409 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7410 // instruction to save 8 or more bytes of constant pool data.
7411 // TODO: If multiple splats are generated to load the same constant,
7412 // it may be detrimental to overall size. There needs to be a way to detect
7413 // that condition to know if this is truly a size win.
7414 bool OptForSize = DAG.shouldOptForSize();
7415
7416 // Handle broadcasting a single constant scalar from the constant pool
7417 // into a vector.
7418 // On Sandybridge (no AVX2), it is still better to load a constant vector
7419 // from the constant pool and not to broadcast it from a scalar.
7420 // But override that restriction when optimizing for size.
7421 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7422 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7423 EVT CVT = Ld.getValueType();
7424 assert(!CVT.isVector() && "Must not broadcast a vector type");
7425
7426 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7427 // For size optimization, also splat v2f64 and v2i64, and for size opt
7428 // with AVX2, also splat i8 and i16.
7429 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7430 if (ScalarSize == 32 ||
7431 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7432 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7433 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7434 const Constant *C = nullptr;
7435 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7436 C = CI->getConstantIntValue();
7437 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7438 C = CF->getConstantFPValue();
7439
7440 assert(C && "Invalid constant type");
7441
7442 SDValue CP =
7443 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7444 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7445
7446 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7447 SDValue Ops[] = {DAG.getEntryNode(), CP};
7448 MachinePointerInfo MPI =
7449 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7450 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7451 MPI, Alignment, MachineMemOperand::MOLoad);
7452 }
7453 }
7454
7455 // Handle AVX2 in-register broadcasts.
7456 if (!IsLoad && Subtarget.hasInt256() &&
7457 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7458 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7459
7460 // The scalar source must be a normal load.
7461 if (!IsLoad)
7462 return SDValue();
7463
7464 // Make sure the non-chain result is only used by this build vector.
7465 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7466 return SDValue();
7467
7468 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7469 (Subtarget.hasVLX() && ScalarSize == 64)) {
7470 auto *LN = cast<LoadSDNode>(Ld);
7471 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7472 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7473 SDValue BCast =
7474 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7475 LN->getMemoryVT(), LN->getMemOperand());
7476 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7477 return BCast;
7478 }
7479
7480 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
7481 // match double, since there is no vbroadcastsd xmm instruction.
7482 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7483 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7484 auto *LN = cast<LoadSDNode>(Ld);
7485 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7486 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7487 SDValue BCast =
7488 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7489 LN->getMemoryVT(), LN->getMemOperand());
7490 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7491 return BCast;
7492 }
7493
7494 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7495 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7496
7497 // Unsupported broadcast.
7498 return SDValue();
7499}
7500
7501/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7502/// underlying vector and index.
7503///
7504/// Modifies \p ExtractedFromVec to the real vector and returns the real
7505/// index.
7506static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7507 SDValue ExtIdx) {
7508 int Idx = ExtIdx->getAsZExtVal();
7509 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7510 return Idx;
7511
7512 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7513 // lowered this:
7514 // (extract_vector_elt (v8f32 %1), Constant<6>)
7515 // to:
7516 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7517 // (extract_subvector (v8f32 %0), Constant<4>),
7518 // undef)
7519 // Constant<0>)
7520 // In this case the vector is the extract_subvector expression and the index
7521 // is 2, as specified by the shuffle.
7522 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7523 SDValue ShuffleVec = SVOp->getOperand(0);
7524 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7525 assert(ShuffleVecVT.getVectorElementType() ==
7526 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7527
7528 int ShuffleIdx = SVOp->getMaskElt(Idx);
7529 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7530 ExtractedFromVec = ShuffleVec;
7531 return ShuffleIdx;
7532 }
7533 return Idx;
7534}
7535
7536 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7537 SelectionDAG &DAG) {
7538 MVT VT = Op.getSimpleValueType();
7539
7540 // Skip if insert_vec_elt is not supported.
7541 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7542 if (!TLI.isOperationLegal(ISD::INSERT_VECTOR_ELT, VT))
7543 return SDValue();
7544
7545 unsigned NumElems = Op.getNumOperands();
7546 SDValue VecIn1;
7547 SDValue VecIn2;
7548 SmallVector<unsigned, 4> InsertIndices;
7549 SmallVector<int, 8> Mask(NumElems, -1);
7550
7551 for (unsigned i = 0; i != NumElems; ++i) {
7552 unsigned Opc = Op.getOperand(i).getOpcode();
7553
7554 if (Opc == ISD::UNDEF)
7555 continue;
7556
7557 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7558 // Quit if more than one element needs inserting.
7559 if (InsertIndices.size() > 1)
7560 return SDValue();
7561
7562 InsertIndices.push_back(i);
7563 continue;
7564 }
7565
7566 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7567 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7568
7569 // Quit if non-constant index.
7570 if (!isa<ConstantSDNode>(ExtIdx))
7571 return SDValue();
7572 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7573
7574 // Quit if extracted from vector of different type.
7575 if (ExtractedFromVec.getValueType() != VT)
7576 return SDValue();
7577
7578 if (!VecIn1.getNode())
7579 VecIn1 = ExtractedFromVec;
7580 else if (VecIn1 != ExtractedFromVec) {
7581 if (!VecIn2.getNode())
7582 VecIn2 = ExtractedFromVec;
7583 else if (VecIn2 != ExtractedFromVec)
7584 // Quit if more than 2 vectors to shuffle
7585 return SDValue();
7586 }
7587
7588 if (ExtractedFromVec == VecIn1)
7589 Mask[i] = Idx;
7590 else if (ExtractedFromVec == VecIn2)
7591 Mask[i] = Idx + NumElems;
7592 }
7593
7594 if (!VecIn1.getNode())
7595 return SDValue();
7596
7597 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7598 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7599
7600 for (unsigned Idx : InsertIndices)
7601 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7602 DAG.getIntPtrConstant(Idx, DL));
7603
7604 return NV;
7605}
7606
7607// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7608 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7609 const X86Subtarget &Subtarget) {
7610 MVT VT = Op.getSimpleValueType();
7611 MVT IVT =
7612 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7613 SmallVector<SDValue, 32> NewOps;
7614 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7615 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7616 Op.getOperand(I)));
7617 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7618 return DAG.getBitcast(VT, Res);
7619}
7620
7621// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7622 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7623 SelectionDAG &DAG,
7624 const X86Subtarget &Subtarget) {
7625
7626 MVT VT = Op.getSimpleValueType();
7627 assert((VT.getVectorElementType() == MVT::i1) &&
7628 "Unexpected type in LowerBUILD_VECTORvXi1!");
7629 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7630 ISD::isBuildVectorAllOnes(Op.getNode()))
7631 return Op;
7632
7633 uint64_t Immediate = 0;
7634 SmallVector<unsigned, 16> NonConstIdx;
7635 bool IsSplat = true;
7636 bool HasConstElts = false;
7637 int SplatIdx = -1;
7638 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7639 SDValue In = Op.getOperand(idx);
7640 if (In.isUndef())
7641 continue;
7642 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7643 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7644 HasConstElts = true;
7645 } else {
7646 NonConstIdx.push_back(idx);
7647 }
7648 if (SplatIdx < 0)
7649 SplatIdx = idx;
7650 else if (In != Op.getOperand(SplatIdx))
7651 IsSplat = false;
7652 }
7653
7654 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7655 if (IsSplat) {
7656 // The build_vector allows the scalar element to be larger than the vector
7657 // element type. We need to mask it to use as a condition unless we know
7658 // the upper bits are zero.
7659 // FIXME: Use computeKnownBits instead of checking specific opcode?
7660 SDValue Cond = Op.getOperand(SplatIdx);
7661 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7662 if (Cond.getOpcode() != ISD::SETCC)
7663 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7664 DAG.getConstant(1, dl, MVT::i8));
7665
7666 // Perform the select in the scalar domain so we can use cmov.
7667 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7668 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7669 DAG.getAllOnesConstant(dl, MVT::i32),
7670 DAG.getConstant(0, dl, MVT::i32));
7671 Select = DAG.getBitcast(MVT::v32i1, Select);
7672 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7673 } else {
7674 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7675 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7676 DAG.getAllOnesConstant(dl, ImmVT),
7677 DAG.getConstant(0, dl, ImmVT));
7678 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7679 Select = DAG.getBitcast(VecVT, Select);
7680 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7681 DAG.getIntPtrConstant(0, dl));
7682 }
7683 }
7684
7685 // insert elements one by one
7686 SDValue DstVec;
7687 if (HasConstElts) {
7688 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7689 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7690 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7691 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7692 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7693 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7694 } else {
7695 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7696 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7697 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7698 DstVec = DAG.getBitcast(VecVT, Imm);
7699 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7700 DAG.getIntPtrConstant(0, dl));
7701 }
7702 } else
7703 DstVec = DAG.getUNDEF(VT);
7704
7705 for (unsigned InsertIdx : NonConstIdx) {
7706 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7707 Op.getOperand(InsertIdx),
7708 DAG.getIntPtrConstant(InsertIdx, dl));
7709 }
7710 return DstVec;
7711}
7712
7713LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7714 switch (Opcode) {
7715 case X86ISD::PACKSS:
7716 case X86ISD::PACKUS:
7717 case X86ISD::FHADD:
7718 case X86ISD::FHSUB:
7719 case X86ISD::HADD:
7720 case X86ISD::HSUB:
7721 return true;
7722 }
7723 return false;
7724}
7725
7726/// This is a helper function of LowerToHorizontalOp().
7727 /// This function checks that the input build_vector \p N implements a
7728/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7729/// may not match the layout of an x86 256-bit horizontal instruction.
7730/// In other words, if this returns true, then some extraction/insertion will
7731/// be required to produce a valid horizontal instruction.
7732///
7733/// Parameter \p Opcode defines the kind of horizontal operation to match.
7734/// For example, if \p Opcode is equal to ISD::ADD, then this function
7735/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7736/// is equal to ISD::SUB, then this function checks if this is a horizontal
7737/// arithmetic sub.
7738///
7739/// This function only analyzes elements of \p N whose indices are
7740/// in range [BaseIdx, LastIdx).
7741///
7742/// TODO: This function was originally used to match both real and fake partial
7743/// horizontal operations, but the index-matching logic is incorrect for that.
7744/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7745/// code because it is only used for partial h-op matching now?
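/// As an illustrative sketch, with \p Opcode == ISD::ADD and the range
/// [BaseIdx, LastIdx) == [0, 4), a build_vector whose elements look like
///   (add (extractelt A, 0), (extractelt A, 1)),
///   (add (extractelt A, 2), (extractelt A, 3)), ...
/// is the kind of pattern recognized here, with A reported through \p V0/\p V1.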
7746static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7747 const SDLoc &DL, SelectionDAG &DAG,
7748 unsigned BaseIdx, unsigned LastIdx,
7749 SDValue &V0, SDValue &V1) {
7750 EVT VT = N->getValueType(0);
7751 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7752 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7753 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7754 "Invalid Vector in input!");
7755
7756 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7757 bool CanFold = true;
7758 unsigned ExpectedVExtractIdx = BaseIdx;
7759 unsigned NumElts = LastIdx - BaseIdx;
7760 V0 = DAG.getUNDEF(VT);
7761 V1 = DAG.getUNDEF(VT);
7762
7763 // Check if N implements a horizontal binop.
7764 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7765 SDValue Op = N->getOperand(i + BaseIdx);
7766
7767 // Skip UNDEFs.
7768 if (Op->isUndef()) {
7769 // Update the expected vector extract index.
7770 if (i * 2 == NumElts)
7771 ExpectedVExtractIdx = BaseIdx;
7772 ExpectedVExtractIdx += 2;
7773 continue;
7774 }
7775
7776 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7777
7778 if (!CanFold)
7779 break;
7780
7781 SDValue Op0 = Op.getOperand(0);
7782 SDValue Op1 = Op.getOperand(1);
7783
7784 // Try to match the following pattern:
7785 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7786 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7787 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7788 Op0.getOperand(0) == Op1.getOperand(0) &&
7789 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7790 isa<ConstantSDNode>(Op1.getOperand(1)));
7791 if (!CanFold)
7792 break;
7793
7794 unsigned I0 = Op0.getConstantOperandVal(1);
7795 unsigned I1 = Op1.getConstantOperandVal(1);
7796
7797 if (i * 2 < NumElts) {
7798 if (V0.isUndef()) {
7799 V0 = Op0.getOperand(0);
7800 if (V0.getValueType() != VT)
7801 return false;
7802 }
7803 } else {
7804 if (V1.isUndef()) {
7805 V1 = Op0.getOperand(0);
7806 if (V1.getValueType() != VT)
7807 return false;
7808 }
7809 if (i * 2 == NumElts)
7810 ExpectedVExtractIdx = BaseIdx;
7811 }
7812
7813 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7814 if (I0 == ExpectedVExtractIdx)
7815 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7816 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7817 // Try to match the following dag sequence:
7818 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7819 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7820 } else
7821 CanFold = false;
7822
7823 ExpectedVExtractIdx += 2;
7824 }
7825
7826 return CanFold;
7827}
7828
7829/// Emit a sequence of two 128-bit horizontal add/sub followed by
7830/// a concat_vector.
7831///
7832/// This is a helper function of LowerToHorizontalOp().
7833/// This function expects two 256-bit vectors called V0 and V1.
7834/// At first, each vector is split into two separate 128-bit vectors.
7835/// Then, the resulting 128-bit vectors are used to implement two
7836/// horizontal binary operations.
7837///
7838/// The kind of horizontal binary operation is defined by \p X86Opcode.
7839///
7840 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7841 /// the two new horizontal binops.
7842/// When Mode is set, the first horizontal binop dag node would take as input
7843/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7844/// horizontal binop dag node would take as input the lower 128-bit of V1
7845/// and the upper 128-bit of V1.
7846/// Example:
7847/// HADD V0_LO, V0_HI
7848/// HADD V1_LO, V1_HI
7849///
7850/// Otherwise, the first horizontal binop dag node takes as input the lower
7851/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7852/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7853/// Example:
7854/// HADD V0_LO, V1_LO
7855/// HADD V0_HI, V1_HI
7856///
7857/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7858/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7859/// the upper 128-bits of the result.
7860static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7861 const SDLoc &DL, SelectionDAG &DAG,
7862 unsigned X86Opcode, bool Mode,
7863 bool isUndefLO, bool isUndefHI) {
7864 MVT VT = V0.getSimpleValueType();
7865 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7866 "Invalid nodes in input!");
7867
7868 unsigned NumElts = VT.getVectorNumElements();
7869 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7870 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7871 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7872 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7873 MVT NewVT = V0_LO.getSimpleValueType();
7874
7875 SDValue LO = DAG.getUNDEF(NewVT);
7876 SDValue HI = DAG.getUNDEF(NewVT);
7877
7878 if (Mode) {
7879 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7880 if (!isUndefLO && !V0->isUndef())
7881 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7882 if (!isUndefHI && !V1->isUndef())
7883 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7884 } else {
7885 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7886 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7887 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7888
7889 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7890 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7891 }
7892
7893 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7894}
7895
7896/// Returns true iff \p BV builds a vector with the result equivalent to
7897/// the result of ADDSUB/SUBADD operation.
7898/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7899/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7900/// \p Opnd0 and \p Opnd1.
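/// For example (illustrative), a v4f32 build_vector of
///   (fsub (extractelt A, 0), (extractelt B, 0)),
///   (fadd (extractelt A, 1), (extractelt B, 1)),
///   (fsub (extractelt A, 2), (extractelt B, 2)),
///   (fadd (extractelt A, 3), (extractelt B, 3))
/// matches ADDSUB with \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd == false.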
7901 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7902 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7903 SDValue &Opnd0, SDValue &Opnd1,
7904 unsigned &NumExtracts,
7905 bool &IsSubAdd) {
7906
7907 MVT VT = BV->getSimpleValueType(0);
7908 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7909 return false;
7910
7911 unsigned NumElts = VT.getVectorNumElements();
7912 SDValue InVec0 = DAG.getUNDEF(VT);
7913 SDValue InVec1 = DAG.getUNDEF(VT);
7914
7915 NumExtracts = 0;
7916
7917 // Odd-numbered elements in the input build vector are obtained from
7918 // adding/subtracting two integer/float elements.
7919 // Even-numbered elements in the input build vector are obtained from
7920 // subtracting/adding two integer/float elements.
7921 unsigned Opc[2] = {0, 0};
7922 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7923 SDValue Op = BV->getOperand(i);
7924
7925 // Skip 'undef' values.
7926 unsigned Opcode = Op.getOpcode();
7927 if (Opcode == ISD::UNDEF)
7928 continue;
7929
7930 // Early exit if we found an unexpected opcode.
7931 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7932 return false;
7933
7934 SDValue Op0 = Op.getOperand(0);
7935 SDValue Op1 = Op.getOperand(1);
7936
7937 // Try to match the following pattern:
7938 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7939 // Early exit if we cannot match that sequence.
7940 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7941 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7942 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7943 Op0.getOperand(1) != Op1.getOperand(1))
7944 return false;
7945
7946 unsigned I0 = Op0.getConstantOperandVal(1);
7947 if (I0 != i)
7948 return false;
7949
7950 // We found a valid add/sub node, make sure its the same opcode as previous
7951 // elements for this parity.
7952 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7953 return false;
7954 Opc[i % 2] = Opcode;
7955
7956 // Update InVec0 and InVec1.
7957 if (InVec0.isUndef()) {
7958 InVec0 = Op0.getOperand(0);
7959 if (InVec0.getSimpleValueType() != VT)
7960 return false;
7961 }
7962 if (InVec1.isUndef()) {
7963 InVec1 = Op1.getOperand(0);
7964 if (InVec1.getSimpleValueType() != VT)
7965 return false;
7966 }
7967
7968 // Make sure that the input operands of each add/sub node always
7969 // come from the same pair of vectors.
7970 if (InVec0 != Op0.getOperand(0)) {
7971 if (Opcode == ISD::FSUB)
7972 return false;
7973
7974 // FADD is commutable. Try to commute the operands
7975 // and then test again.
7976 std::swap(Op0, Op1);
7977 if (InVec0 != Op0.getOperand(0))
7978 return false;
7979 }
7980
7981 if (InVec1 != Op1.getOperand(0))
7982 return false;
7983
7984 // Increment the number of extractions done.
7985 ++NumExtracts;
7986 }
7987
7988 // Ensure we have found an opcode for both parities and that they are
7989 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7990 // inputs are undef.
7991 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7992 InVec0.isUndef() || InVec1.isUndef())
7993 return false;
7994
7995 IsSubAdd = Opc[0] == ISD::FADD;
7996
7997 Opnd0 = InVec0;
7998 Opnd1 = InVec1;
7999 return true;
8000}
8001
8002 /// Returns true if it is possible to fold MUL and an idiom that has already been
8003/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8004/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8005/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8006///
8007/// Prior to calling this function it should be known that there is some
8008/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8009/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8010/// before replacement of such SDNode with ADDSUB operation. Thus the number
8011/// of \p Opnd0 uses is expected to be equal to 2.
8012/// For example, this function may be called for the following IR:
8013/// %AB = fmul fast <2 x double> %A, %B
8014/// %Sub = fsub fast <2 x double> %AB, %C
8015/// %Add = fadd fast <2 x double> %AB, %C
8016/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8017/// <2 x i32> <i32 0, i32 3>
8018/// There is a def for %Addsub here, which potentially can be replaced by
8019/// X86ISD::ADDSUB operation:
8020/// %Addsub = X86ISD::ADDSUB %AB, %C
8021/// and such ADDSUB can further be replaced with FMADDSUB:
8022/// %Addsub = FMADDSUB %A, %B, %C.
8023///
8024/// The main reason why this method is called before the replacement of the
8025/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8026/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8027/// FMADDSUB is.
8028static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8029 SelectionDAG &DAG,
8030 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8031 unsigned ExpectedUses) {
8032 if (Opnd0.getOpcode() != ISD::FMUL ||
8033 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8034 return false;
8035
8036 // FIXME: These checks must match the similar ones in
8037 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8038 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8039 // or MUL + ADDSUB to FMADDSUB.
8040 const TargetOptions &Options = DAG.getTarget().Options;
8041 bool AllowFusion =
8042 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8043 if (!AllowFusion)
8044 return false;
8045
8046 Opnd2 = Opnd1;
8047 Opnd1 = Opnd0.getOperand(1);
8048 Opnd0 = Opnd0.getOperand(0);
8049
8050 return true;
8051}
8052
8053/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8054/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8055/// X86ISD::FMSUBADD node.
8056 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8057 const SDLoc &DL,
8058 const X86Subtarget &Subtarget,
8059 SelectionDAG &DAG) {
8060 SDValue Opnd0, Opnd1;
8061 unsigned NumExtracts;
8062 bool IsSubAdd;
8063 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8064 IsSubAdd))
8065 return SDValue();
8066
8067 MVT VT = BV->getSimpleValueType(0);
8068
8069 // Try to generate X86ISD::FMADDSUB node here.
8070 SDValue Opnd2;
8071 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8072 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8073 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8074 }
8075
8076 // We only support ADDSUB.
8077 if (IsSubAdd)
8078 return SDValue();
8079
8080 // There are no known X86 targets with 512-bit ADDSUB instructions!
8081 // Convert to blend(fsub,fadd).
8082 if (VT.is512BitVector()) {
8083 SmallVector<int> Mask;
8084 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8085 Mask.push_back(I);
8086 Mask.push_back(I + E + 1);
8087 }
8088 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8089 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8090 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8091 }
8092
8093 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8094}
8095
8096 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8097 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8098 // Initialize outputs to known values.
8099 MVT VT = BV->getSimpleValueType(0);
8100 HOpcode = ISD::DELETED_NODE;
8101 V0 = DAG.getUNDEF(VT);
8102 V1 = DAG.getUNDEF(VT);
8103
8104 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8105 // half of the result is calculated independently from the 128-bit halves of
8106 // the inputs, so that makes the index-checking logic below more complicated.
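  // For instance (a sketch), for v8i32 the element at 128-bit chunk i, lane j
  // is expected to be
  //   op (extractelt Src, i*4 + (j%2)*2), (extractelt Src, i*4 + (j%2)*2 + 1)
  // where Src is V0 for the low 64 bits of the chunk and V1 for the high
  // 64 bits; that is what the ExpectedIndex computation below checks.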
8107 unsigned NumElts = VT.getVectorNumElements();
8108 unsigned GenericOpcode = ISD::DELETED_NODE;
8109 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8110 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8111 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8112 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8113 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8114 // Ignore undef elements.
8115 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8116 if (Op.isUndef())
8117 continue;
8118
8119 // If there's an opcode mismatch, we're done.
8120 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8121 return false;
8122
8123 // Initialize horizontal opcode.
8124 if (HOpcode == ISD::DELETED_NODE) {
8125 GenericOpcode = Op.getOpcode();
8126 switch (GenericOpcode) {
8127 // clang-format off
8128 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8129 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8130 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8131 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8132 default: return false;
8133 // clang-format on
8134 }
8135 }
8136
8137 SDValue Op0 = Op.getOperand(0);
8138 SDValue Op1 = Op.getOperand(1);
8139 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8140 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8141 Op0.getOperand(0) != Op1.getOperand(0) ||
8142 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8143 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8144 return false;
8145
8146 // The source vector is chosen based on which 64-bit half of the
8147 // destination vector is being calculated.
8148 if (j < NumEltsIn64Bits) {
8149 if (V0.isUndef())
8150 V0 = Op0.getOperand(0);
8151 } else {
8152 if (V1.isUndef())
8153 V1 = Op0.getOperand(0);
8154 }
8155
8156 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8157 if (SourceVec != Op0.getOperand(0))
8158 return false;
8159
8160 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8161 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8162 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8163 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8164 (j % NumEltsIn64Bits) * 2;
8165 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8166 continue;
8167
8168 // If this is not a commutative op, this does not match.
8169 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8170 return false;
8171
8172 // Addition is commutative, so try swapping the extract indexes.
8173 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8174 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8175 continue;
8176
8177 // Extract indexes do not match horizontal requirement.
8178 return false;
8179 }
8180 }
8181 // We matched. Opcode and operands are returned by reference as arguments.
8182 return true;
8183}
8184
8185 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8186 const SDLoc &DL, SelectionDAG &DAG,
8187 unsigned HOpcode, SDValue V0, SDValue V1) {
8188 // If either input vector is not the same size as the build vector,
8189 // extract/insert the low bits to the correct size.
8190 // This is free (examples: zmm --> xmm, xmm --> ymm).
8191 MVT VT = BV->getSimpleValueType(0);
8192 unsigned Width = VT.getSizeInBits();
8193 if (V0.getValueSizeInBits() > Width)
8194 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8195 else if (V0.getValueSizeInBits() < Width)
8196 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8197
8198 if (V1.getValueSizeInBits() > Width)
8199 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8200 else if (V1.getValueSizeInBits() < Width)
8201 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8202
8203 unsigned NumElts = VT.getVectorNumElements();
8204 APInt DemandedElts = APInt::getAllOnes(NumElts);
8205 for (unsigned i = 0; i != NumElts; ++i)
8206 if (BV->getOperand(i).isUndef())
8207 DemandedElts.clearBit(i);
8208
8209 // If we don't need the upper xmm, then perform as a xmm hop.
8210 unsigned HalfNumElts = NumElts / 2;
8211 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8212 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8213 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8214 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8215 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8216 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8217 }
8218
8219 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8220}
8221
8222/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8223 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8224 const X86Subtarget &Subtarget,
8225 SelectionDAG &DAG) {
8226 // We need at least 2 non-undef elements to make this worthwhile by default.
8227 unsigned NumNonUndefs =
8228 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8229 if (NumNonUndefs < 2)
8230 return SDValue();
8231
8232 // There are 4 sets of horizontal math operations distinguished by type:
8233 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8234 // subtarget feature. Try to match those "native" patterns first.
8235 MVT VT = BV->getSimpleValueType(0);
8236 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8237 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8238 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8239 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8240 unsigned HOpcode;
8241 SDValue V0, V1;
8242 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8243 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8244 }
8245
8246 // Try harder to match 256-bit ops by using extract/concat.
8247 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8248 return SDValue();
8249
8250 // Count the number of UNDEF operands in the input build_vector.
8251 unsigned NumElts = VT.getVectorNumElements();
8252 unsigned Half = NumElts / 2;
8253 unsigned NumUndefsLO = 0;
8254 unsigned NumUndefsHI = 0;
8255 for (unsigned i = 0, e = Half; i != e; ++i)
8256 if (BV->getOperand(i)->isUndef())
8257 NumUndefsLO++;
8258
8259 for (unsigned i = Half, e = NumElts; i != e; ++i)
8260 if (BV->getOperand(i)->isUndef())
8261 NumUndefsHI++;
8262
8263 SDValue InVec0, InVec1;
8264 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8265 SDValue InVec2, InVec3;
8266 unsigned X86Opcode;
8267 bool CanFold = true;
8268
8269 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8270 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8271 InVec3) &&
8272 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8273 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8274 X86Opcode = X86ISD::HADD;
8275 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8276 InVec1) &&
8277 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8278 InVec3) &&
8279 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8280 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8281 X86Opcode = X86ISD::HSUB;
8282 else
8283 CanFold = false;
8284
8285 if (CanFold) {
8286 // Do not try to expand this build_vector into a pair of horizontal
8287 // add/sub if we can emit a pair of scalar add/sub.
8288 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8289 return SDValue();
8290
8291 // Convert this build_vector into a pair of horizontal binops followed by
8292 // a concat vector. We must adjust the outputs from the partial horizontal
8293 // matching calls above to account for undefined vector halves.
8294 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8295 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8296 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8297 bool isUndefLO = NumUndefsLO == Half;
8298 bool isUndefHI = NumUndefsHI == Half;
8299 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8300 isUndefHI);
8301 }
8302 }
8303
8304 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8305 VT == MVT::v16i16) {
8306 unsigned X86Opcode;
8307 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8308 InVec1))
8309 X86Opcode = X86ISD::HADD;
8310 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8311 InVec1))
8312 X86Opcode = X86ISD::HSUB;
8313 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8314 InVec1))
8315 X86Opcode = X86ISD::FHADD;
8316 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8317 InVec1))
8318 X86Opcode = X86ISD::FHSUB;
8319 else
8320 return SDValue();
8321
8322 // Don't try to expand this build_vector into a pair of horizontal add/sub
8323 // if we can simply emit a pair of scalar add/sub.
8324 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8325 return SDValue();
8326
8327 // Convert this build_vector into two horizontal add/sub followed by
8328 // a concat vector.
8329 bool isUndefLO = NumUndefsLO == Half;
8330 bool isUndefHI = NumUndefsHI == Half;
8331 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8332 isUndefLO, isUndefHI);
8333 }
8334
8335 return SDValue();
8336}
8337
8338static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8339 SelectionDAG &DAG);
8340
8341/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8342/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8343/// just apply the bit to the vectors.
8344 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8345 /// from this, but enough scalar bit operations are created by the later
8346 /// legalization + scalarization stages to need basic support.
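/// A minimal sketch of the intent: a build_vector such as
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))
/// so that only one vector bit operation remains.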
8347 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8348 const X86Subtarget &Subtarget,
8349 SelectionDAG &DAG) {
8350 MVT VT = Op->getSimpleValueType(0);
8351 unsigned NumElems = VT.getVectorNumElements();
8352 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8353
8354 // Check that all elements have the same opcode.
8355 // TODO: Should we allow UNDEFS and if so how many?
8356 unsigned Opcode = Op->getOperand(0).getOpcode();
8357 for (unsigned i = 1; i < NumElems; ++i)
8358 if (Opcode != Op->getOperand(i).getOpcode())
8359 return SDValue();
8360
8361 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8362 bool IsShift = false;
8363 switch (Opcode) {
8364 default:
8365 return SDValue();
8366 case ISD::SHL:
8367 case ISD::SRL:
8368 case ISD::SRA:
8369 IsShift = true;
8370 break;
8371 case ISD::AND:
8372 case ISD::XOR:
8373 case ISD::OR:
8374 // Don't do this if the buildvector is a splat - we'd replace one
8375 // constant with an entire vector.
8376 if (Op->getSplatValue())
8377 return SDValue();
8378 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8379 return SDValue();
8380 break;
8381 }
8382
8383 SmallVector<SDValue, 4> LHSElts, RHSElts;
8384 for (SDValue Elt : Op->ops()) {
8385 SDValue LHS = Elt.getOperand(0);
8386 SDValue RHS = Elt.getOperand(1);
8387
8388 // We expect the canonicalized RHS operand to be the constant.
8389 if (!isa<ConstantSDNode>(RHS))
8390 return SDValue();
8391
8392 // Extend shift amounts.
8393 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8394 if (!IsShift)
8395 return SDValue();
8396 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8397 }
8398
8399 LHSElts.push_back(LHS);
8400 RHSElts.push_back(RHS);
8401 }
8402
8403 // Limit to shifts by uniform immediates.
8404 // TODO: Only accept vXi8/vXi64 special cases?
8405 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8406 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8407 return SDValue();
8408
8409 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8410 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8411 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8412
8413 if (!IsShift)
8414 return Res;
8415
8416 // Immediately lower the shift to ensure the constant build vector doesn't
8417 // get converted to a constant pool before the shift is lowered.
8418 return LowerShift(Res, Subtarget, DAG);
8419}
8420
8421/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8422/// functionality to do this, so it's all zeros, all ones, or some derivation
8423/// that is cheap to calculate.
8424 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8425 SelectionDAG &DAG,
8426 const X86Subtarget &Subtarget) {
8427 MVT VT = Op.getSimpleValueType();
8428
8429 // Vectors containing all zeros can be matched by pxor and xorps.
8430 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8431 return Op;
8432
8433 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8434 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8435 // vpcmpeqd on 256-bit vectors.
8436 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8437 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8438 return Op;
8439
8440 return getOnesVector(VT, DAG, DL);
8441 }
8442
8443 return SDValue();
8444}
8445
8446/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8447/// from a vector of source values and a vector of extraction indices.
8448/// The vectors might be manipulated to match the type of the permute op.
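/// For example (illustrative), with a v4i32 SrcVec = <a,b,c,d> and
/// IndicesVec = <2,0,3,1>, the lowered variable permute (e.g. VPERMILPS on
/// AVX) should produce <c,a,d,b>.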
8449static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8450 const SDLoc &DL, SelectionDAG &DAG,
8451 const X86Subtarget &Subtarget) {
8452 MVT ShuffleVT = VT;
8453 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8454 unsigned NumElts = VT.getVectorNumElements();
8455 unsigned SizeInBits = VT.getSizeInBits();
8456
8457 // Adjust IndicesVec to match VT size.
8458 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8459 "Illegal variable permute mask size");
8460 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8461 // Narrow/widen the indices vector to the correct size.
8462 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8463 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8464 NumElts * VT.getScalarSizeInBits());
8465 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8466 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8467 SDLoc(IndicesVec), SizeInBits);
8468 // Zero-extend the index elements within the vector.
8469 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8470 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8471 IndicesVT, IndicesVec);
8472 }
8473 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8474
8475 // Handle a SrcVec whose size doesn't match VT.
8476 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8477 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8478 // Handle larger SrcVec by treating it as a larger permute.
8479 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8480 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8481 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8482 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8483 Subtarget, DAG, SDLoc(IndicesVec));
8484 SDValue NewSrcVec =
8485 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8486 if (NewSrcVec)
8487 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8488 return SDValue();
8489 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8490 // Widen smaller SrcVec to match VT.
8491 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8492 } else
8493 return SDValue();
8494 }
8495
8496 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8497 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8498 EVT SrcVT = Idx.getValueType();
8499 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8500 uint64_t IndexScale = 0;
8501 uint64_t IndexOffset = 0;
8502
8503 // If we're scaling a smaller permute op, then we need to repeat the
8504 // indices, scaling and offsetting them as well.
8505 // e.g. v4i32 -> v16i8 (Scale = 4)
8506 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8507 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8508 for (uint64_t i = 0; i != Scale; ++i) {
8509 IndexScale |= Scale << (i * NumDstBits);
8510 IndexOffset |= i << (i * NumDstBits);
8511 }
8512
8513 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8514 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8515 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8516 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8517 return Idx;
8518 };
8519
8520 unsigned Opcode = 0;
8521 switch (VT.SimpleTy) {
8522 default:
8523 break;
8524 case MVT::v16i8:
8525 if (Subtarget.hasSSSE3())
8526 Opcode = X86ISD::PSHUFB;
8527 break;
8528 case MVT::v8i16:
8529 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8530 Opcode = X86ISD::VPERMV;
8531 else if (Subtarget.hasSSSE3()) {
8532 Opcode = X86ISD::PSHUFB;
8533 ShuffleVT = MVT::v16i8;
8534 }
8535 break;
8536 case MVT::v4f32:
8537 case MVT::v4i32:
8538 if (Subtarget.hasAVX()) {
8539 Opcode = X86ISD::VPERMILPV;
8540 ShuffleVT = MVT::v4f32;
8541 } else if (Subtarget.hasSSSE3()) {
8542 Opcode = X86ISD::PSHUFB;
8543 ShuffleVT = MVT::v16i8;
8544 }
8545 break;
8546 case MVT::v2f64:
8547 case MVT::v2i64:
8548 if (Subtarget.hasAVX()) {
8549 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8550 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8551 Opcode = X86ISD::VPERMILPV;
8552 ShuffleVT = MVT::v2f64;
8553 } else if (Subtarget.hasSSE41()) {
8554 // SSE41 can compare v2i64 - select between indices 0 and 1.
8555 return DAG.getSelectCC(
8556 DL, IndicesVec,
8557 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8558 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8559 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8560 ISD::CondCode::SETEQ);
8561 }
8562 break;
8563 case MVT::v32i8:
8564 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8565 Opcode = X86ISD::VPERMV;
8566 else if (Subtarget.hasXOP()) {
8567 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8568 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8569 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8570 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8571 return DAG.getNode(
8572 ISD::CONCAT_VECTORS, DL, VT,
8573 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8574 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8575 } else if (Subtarget.hasAVX()) {
8576 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8577 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8578 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8579 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8580 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8581 ArrayRef<SDValue> Ops) {
8582 // Permute Lo and Hi and then select based on index range.
8583 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8584 // care about bit[7], as it's just an index vector.
8585 SDValue Idx = Ops[2];
8586 EVT VT = Idx.getValueType();
8587 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8588 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8589 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8590 ISD::CondCode::SETGT);
8591 };
8592 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8593 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8594 PSHUFBBuilder);
8595 }
8596 break;
8597 case MVT::v16i16:
8598 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8599 Opcode = X86ISD::VPERMV;
8600 else if (Subtarget.hasAVX()) {
8601 // Scale to v32i8 and perform as v32i8.
8602 IndicesVec = ScaleIndices(IndicesVec, 2);
8603 return DAG.getBitcast(
8604 VT, createVariablePermute(
8605 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8606 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8607 }
8608 break;
8609 case MVT::v8f32:
8610 case MVT::v8i32:
8611 if (Subtarget.hasAVX2())
8612 Opcode = X86ISD::VPERMV;
8613 else if (Subtarget.hasAVX()) {
8614 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8615 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8616 {0, 1, 2, 3, 0, 1, 2, 3});
8617 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8618 {4, 5, 6, 7, 4, 5, 6, 7});
8619 if (Subtarget.hasXOP())
8620 return DAG.getBitcast(
8621 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8622 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8623 // Permute Lo and Hi and then select based on index range.
8624 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8625 SDValue Res = DAG.getSelectCC(
8626 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8627 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8628 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8629 ISD::CondCode::SETGT);
8630 return DAG.getBitcast(VT, Res);
8631 }
8632 break;
8633 case MVT::v4i64:
8634 case MVT::v4f64:
8635 if (Subtarget.hasAVX512()) {
8636 if (!Subtarget.hasVLX()) {
8637 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8638 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8639 SDLoc(SrcVec));
8640 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8641 DAG, SDLoc(IndicesVec));
8642 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8643 DAG, Subtarget);
8644 return extract256BitVector(Res, 0, DAG, DL);
8645 }
8646 Opcode = X86ISD::VPERMV;
8647 } else if (Subtarget.hasAVX()) {
8648 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8649 SDValue LoLo =
8650 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8651 SDValue HiHi =
8652 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8653 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8654 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8655 if (Subtarget.hasXOP())
8656 return DAG.getBitcast(
8657 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8658 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8659 // Permute Lo and Hi and then select based on index range.
8660 // This works as VPERMILPD only uses index bit[1] to permute elements.
8661 SDValue Res = DAG.getSelectCC(
8662 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8663 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8664 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8665 ISD::CondCode::SETGT);
8666 return DAG.getBitcast(VT, Res);
8667 }
8668 break;
8669 case MVT::v64i8:
8670 if (Subtarget.hasVBMI())
8671 Opcode = X86ISD::VPERMV;
8672 break;
8673 case MVT::v32i16:
8674 if (Subtarget.hasBWI())
8675 Opcode = X86ISD::VPERMV;
8676 break;
8677 case MVT::v16f32:
8678 case MVT::v16i32:
8679 case MVT::v8f64:
8680 case MVT::v8i64:
8681 if (Subtarget.hasAVX512())
8682 Opcode = X86ISD::VPERMV;
8683 break;
8684 }
8685 if (!Opcode)
8686 return SDValue();
8687
8688 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8689 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8690 "Illegal variable permute shuffle type");
8691
8692 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8693 if (Scale > 1)
8694 IndicesVec = ScaleIndices(IndicesVec, Scale);
8695
8696 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8697 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8698
8699 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8700 SDValue Res = Opcode == X86ISD::VPERMV
8701 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8702 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8703 return DAG.getBitcast(VT, Res);
8704}
8705
8706// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8707// reasoned to be a permutation of a vector by indices in a non-constant vector.
8708// (build_vector (extract_elt V, (extract_elt I, 0)),
8709// (extract_elt V, (extract_elt I, 1)),
8710// ...
8711// ->
8712// (vpermv I, V)
8713//
8714// TODO: Handle undefs
8715// TODO: Utilize pshufb and zero mask blending to support more efficient
8716// construction of vectors with constant-0 elements.
8717static SDValue
8718 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8719 SelectionDAG &DAG,
8720 const X86Subtarget &Subtarget) {
8721 SDValue SrcVec, IndicesVec;
8722 // Check for a match of the permute source vector and permute index elements.
8723 // This is done by checking that the i-th build_vector operand is of the form:
8724 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8725 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8726 SDValue Op = V.getOperand(Idx);
8727 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8728 return SDValue();
8729
8730 // If this is the first extract encountered in V, set the source vector,
8731 // otherwise verify the extract is from the previously defined source
8732 // vector.
8733 if (!SrcVec)
8734 SrcVec = Op.getOperand(0);
8735 else if (SrcVec != Op.getOperand(0))
8736 return SDValue();
8737 SDValue ExtractedIndex = Op->getOperand(1);
8738 // Peek through extends.
8739 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8740 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8741 ExtractedIndex = ExtractedIndex.getOperand(0);
8742 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8743 return SDValue();
8744
8745 // If this is the first extract from the index vector candidate, set the
8746 // indices vector, otherwise verify the extract is from the previously
8747 // defined indices vector.
8748 if (!IndicesVec)
8749 IndicesVec = ExtractedIndex.getOperand(0);
8750 else if (IndicesVec != ExtractedIndex.getOperand(0))
8751 return SDValue();
8752
8753 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8754 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8755 return SDValue();
8756 }
8757
8758 MVT VT = V.getSimpleValueType();
8759 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8760}
8761
8762SDValue
8763X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8764 SDLoc dl(Op);
8765
8766 MVT VT = Op.getSimpleValueType();
8767 MVT EltVT = VT.getVectorElementType();
8768 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8769 unsigned NumElems = Op.getNumOperands();
8770
8771 // Generate vectors for predicate vectors.
8772 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8773 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8774
8775 if (VT.getVectorElementType() == MVT::bf16 &&
8776 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8777 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8778
8779 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8780 return VectorCst;
8781
8782 unsigned EVTBits = EltVT.getSizeInBits();
8783 APInt UndefMask = APInt::getZero(NumElems);
8784 APInt FrozenUndefMask = APInt::getZero(NumElems);
8785 APInt ZeroMask = APInt::getZero(NumElems);
8786 APInt NonZeroMask = APInt::getZero(NumElems);
8787 bool IsAllConstants = true;
8788 bool OneUseFrozenUndefs = true;
8789 SmallSet<SDValue, 8> Values;
8790 unsigned NumConstants = NumElems;
8791 for (unsigned i = 0; i < NumElems; ++i) {
8792 SDValue Elt = Op.getOperand(i);
8793 if (Elt.isUndef()) {
8794 UndefMask.setBit(i);
8795 continue;
8796 }
8797 if (ISD::isFreezeUndef(Elt.getNode())) {
8798 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8799 FrozenUndefMask.setBit(i);
8800 continue;
8801 }
8802 Values.insert(Elt);
8803 if (!isIntOrFPConstant(Elt)) {
8804 IsAllConstants = false;
8805 NumConstants--;
8806 }
8807 if (X86::isZeroNode(Elt)) {
8808 ZeroMask.setBit(i);
8809 } else {
8810 NonZeroMask.setBit(i);
8811 }
8812 }
8813
8814 // All undef vector. Return an UNDEF.
8815 if (UndefMask.isAllOnes())
8816 return DAG.getUNDEF(VT);
8817
8818 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8819 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8820 return DAG.getFreeze(DAG.getUNDEF(VT));
8821
8822 // All undef/freeze(undef)/zero vector. Return a zero vector.
8823 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8824 return getZeroVector(VT, Subtarget, DAG, dl);
8825
8826 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8827 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8828 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8829 // and blend the FREEZE-UNDEF operands back in.
8830 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
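// Illustrative sketch (assumed example, not from the source): for
// v4i32 (a, freeze(undef), b, freeze(undef)) this path builds
// EltsBV = (a, undef, b, undef) and FrozenUndefBV = splat(freeze(undef)),
// then blends them with the shuffle mask <0, 5, 2, 7> so that lanes 1 and 3
// are taken from the frozen-undef splat.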
8831 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8832 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8833 SmallVector<int, 16> BlendMask(NumElems, -1);
8834 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8835 for (unsigned i = 0; i < NumElems; ++i) {
8836 if (UndefMask[i]) {
8837 BlendMask[i] = -1;
8838 continue;
8839 }
8840 BlendMask[i] = i;
8841 if (!FrozenUndefMask[i])
8842 Elts[i] = Op.getOperand(i);
8843 else
8844 BlendMask[i] += NumElems;
8845 }
8846 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8847 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8848 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8849 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8850 }
8851
8852 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8853
8854 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8855 // be better off lowering to a smaller build vector and padding with
8856 // undef/zero.
8857 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8858 !isFoldableUseOfShuffle(BV)) {
8859 unsigned UpperElems = NumElems / 2;
8860 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8861 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8862 if (NumUpperUndefsOrZeros >= UpperElems) {
8863 if (VT.is512BitVector() &&
8864 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8865 UpperElems = NumElems - (NumElems / 4);
8866 // If freeze(undef) is in any upper elements, force to zero.
8867 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8868 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8869 SDValue NewBV =
8870 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8871 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8872 }
8873 }
8874
8875 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8876 return AddSub;
8877 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8878 return HorizontalOp;
8879 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8880 return Broadcast;
8881 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8882 return BitOp;
8883
8884 unsigned NumZero = ZeroMask.popcount();
8885 unsigned NumNonZero = NonZeroMask.popcount();
8886
8887 // If we are inserting one variable into a vector of non-zero constants, try
8888 // to avoid loading each constant element as a scalar. Load the constants as a
8889 // vector and then insert the variable scalar element. If insertion is not
8890 // supported, fall back to a shuffle to get the scalar blended with the
8891 // constants. Insertion into a zero vector is handled as a special-case
8892 // somewhere below here.
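// Illustrative sketch (assumed example, not from the source): for
// v4i32 (x, 1, 2, 3) with a single variable element x, the code below
// materializes the constant-pool vector <undef, 1, 2, 3>, loads it, and then
// either emits INSERT_VECTOR_ELT(Ld, x, 0) or, for a >128-bit vector with a
// high insertion index, blends SCALAR_TO_VECTOR(x) in via a shuffle.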
8893 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8894 FrozenUndefMask.isZero() &&
8895 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8896 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8897 // Create an all-constant vector. The variable element in the old
8898 // build vector is replaced by undef in the constant vector. Save the
8899 // variable scalar element and its index for use in the insertelement.
8900 LLVMContext &Context = *DAG.getContext();
8901 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8902 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8903 SDValue VarElt;
8904 SDValue InsIndex;
8905 for (unsigned i = 0; i != NumElems; ++i) {
8906 SDValue Elt = Op.getOperand(i);
8907 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8908 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8909 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8910 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8911 else if (!Elt.isUndef()) {
8912 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8913 "Expected one variable element in this vector");
8914 VarElt = Elt;
8915 InsIndex = DAG.getVectorIdxConstant(i, dl);
8916 }
8917 }
8918 Constant *CV = ConstantVector::get(ConstVecOps);
8919 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8920
8921 // The constants we just created may not be legal (eg, floating point). We
8922 // must lower the vector right here because we can not guarantee that we'll
8923 // legalize it before loading it. This is also why we could not just create
8924 // a new build vector here. If the build vector contains illegal constants,
8925 // it could get split back up into a series of insert elements.
8926 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8927 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8928 MachinePointerInfo MPI =
8929 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
8930 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8931 unsigned InsertC = InsIndex->getAsZExtVal();
8932 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8933 if (InsertC < NumEltsInLow128Bits)
8934 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8935
8936 // There's no good way to insert into the high elements of a >128-bit
8937 // vector, so use shuffles to avoid an extract/insert sequence.
8938 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8939 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8940 SmallVector<int, 8> ShuffleMask;
8941 unsigned NumElts = VT.getVectorNumElements();
8942 for (unsigned i = 0; i != NumElts; ++i)
8943 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8944 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8945 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8946 }
8947
8948 // Special case for single non-zero, non-undef, element.
8949 if (NumNonZero == 1) {
8950 unsigned Idx = NonZeroMask.countr_zero();
8951 SDValue Item = Op.getOperand(Idx);
8952
8953 // If we have a constant or non-constant insertion into the low element of
8954 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8955 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8956 // depending on what the source datatype is.
8957 if (Idx == 0) {
8958 if (NumZero == 0)
8959 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8960
8961 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8962 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8963 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8964 assert((VT.is128BitVector() || VT.is256BitVector() ||
8965 VT.is512BitVector()) &&
8966 "Expected an SSE value type!");
8967 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8968 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8969 // zero vector.
8970 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8971 }
8972
8973 // We can't directly insert an i8 or i16 into a vector, so zero extend
8974 // it to i32 first.
8975 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8976 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8977 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8978 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8979 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8980 return DAG.getBitcast(VT, Item);
8981 }
8982 }
8983
8984 // Is it a vector logical left shift?
8985 if (NumElems == 2 && Idx == 1 &&
8986 X86::isZeroNode(Op.getOperand(0)) &&
8987 !X86::isZeroNode(Op.getOperand(1))) {
8988 unsigned NumBits = VT.getSizeInBits();
8989 return getVShift(true, VT,
8990 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8991 VT, Op.getOperand(1)),
8992 NumBits/2, DAG, *this, dl);
8993 }
8994
8995 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8996 return SDValue();
8997
8998 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8999 // is a non-constant being inserted into an element other than the low one,
9000 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9001 // movd/movss) to move this into the low element, then shuffle it into
9002 // place.
9003 if (EVTBits == 32) {
9004 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9005 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9006 }
9007 }
9008
9009 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9010 if (Values.size() == 1) {
9011 if (EVTBits == 32) {
9012 // Instead of a shuffle like this:
9013 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9014 // Check if it's possible to issue this instead.
9015 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9016 unsigned Idx = NonZeroMask.countr_zero();
9017 SDValue Item = Op.getOperand(Idx);
9018 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9019 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9020 }
9021 return SDValue();
9022 }
9023
9024 // A vector full of immediates; various special cases are already
9025 // handled, so this is best done with a single constant-pool load.
9026 if (IsAllConstants)
9027 return SDValue();
9028
9029 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9030 return V;
9031
9032 // See if we can use a vector load to get all of the elements.
9033 {
9034 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9035 if (SDValue LD =
9036 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9037 return LD;
9038 }
9039
9040 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9041 // build_vector and broadcast it.
9042 // TODO: We could probably generalize this more.
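// Illustrative sketch (assumed example, not from the source): for
// v8i32 (a, b, a, b, a, b, a, b) this builds v4i32 (a, b, undef, undef),
// bitcasts it to v2i64, broadcasts it to v4i64 with X86ISD::VBROADCAST and
// bitcasts back to v8i32, so the pair (a, b) is splatted as one 64-bit unit.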
9043 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9044 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9045 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9046 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9047 // Make sure all the even/odd operands match.
9048 for (unsigned i = 2; i != NumElems; ++i)
9049 if (Ops[i % 2] != Op.getOperand(i))
9050 return false;
9051 return true;
9052 };
9053 if (CanSplat(Op, NumElems, Ops)) {
9054 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9055 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9056 // Create a new build vector and cast to v2i64/v2f64.
9057 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9058 DAG.getBuildVector(NarrowVT, dl, Ops));
9059 // Broadcast from v2i64/v2f64 and cast to final VT.
9060 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9061 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9062 NewBV));
9063 }
9064 }
9065
9066 // For AVX-length vectors, build the individual 128-bit pieces and use
9067 // shuffles to put them in place.
9068 if (VT.getSizeInBits() > 128) {
9069 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9070
9071 // Build both the lower and upper subvector.
9072 SDValue Lower =
9073 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9074 SDValue Upper = DAG.getBuildVector(
9075 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9076
9077 // Recreate the wider vector with the lower and upper part.
9078 return concatSubVectors(Lower, Upper, DAG, dl);
9079 }
9080
9081 // Let legalizer expand 2-wide build_vectors.
9082 if (EVTBits == 64) {
9083 if (NumNonZero == 1) {
9084 // One half is zero or undef.
9085 unsigned Idx = NonZeroMask.countr_zero();
9086 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9087 Op.getOperand(Idx));
9088 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9089 }
9090 return SDValue();
9091 }
9092
9093 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9094 if (EVTBits == 8 && NumElems == 16)
9095 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9096 NumZero, DAG, Subtarget))
9097 return V;
9098
9099 if (EltVT == MVT::i16 && NumElems == 8)
9100 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9101 NumZero, DAG, Subtarget))
9102 return V;
9103
9104 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9105 if (EVTBits == 32 && NumElems == 4)
9106 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9107 return V;
9108
9109 // If element VT is == 32 bits, turn it into a number of shuffles.
9110 if (NumElems == 4 && NumZero > 0) {
9111 SmallVector<SDValue, 8> Ops(NumElems);
9112 for (unsigned i = 0; i < 4; ++i) {
9113 bool isZero = !NonZeroMask[i];
9114 if (isZero)
9115 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9116 else
9117 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9118 }
9119
9120 for (unsigned i = 0; i < 2; ++i) {
9121 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9122 default: llvm_unreachable("Unexpected NonZero count");
9123 case 0:
9124 Ops[i] = Ops[i*2]; // Must be a zero vector.
9125 break;
9126 case 1:
9127 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9128 break;
9129 case 2:
9130 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9131 break;
9132 case 3:
9133 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9134 break;
9135 }
9136 }
9137
9138 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9139 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9140 int MaskVec[] = {
9141 Reverse1 ? 1 : 0,
9142 Reverse1 ? 0 : 1,
9143 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9144 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9145 };
9146 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9147 }
9148
9149 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9150
9151 // Check for a build vector formed mostly from a shuffle plus a few insertions.
9152 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9153 return Sh;
9154
9155 // For SSE 4.1, use insertps to put the high elements into the low element.
9156 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9157 SDValue Result;
9158 if (!Op.getOperand(0).isUndef())
9159 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9160 else
9161 Result = DAG.getUNDEF(VT);
9162
9163 for (unsigned i = 1; i < NumElems; ++i) {
9164 if (Op.getOperand(i).isUndef()) continue;
9165 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9166 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9167 }
9168 return Result;
9169 }
9170
9171 // Otherwise, expand into a number of unpckl*, start by extending each of
9172 // our (non-undef) elements to the full vector width with the element in the
9173 // bottom slot of the vector (which generates no code for SSE).
9174 SmallVector<SDValue, 8> Ops(NumElems);
9175 for (unsigned i = 0; i < NumElems; ++i) {
9176 if (!Op.getOperand(i).isUndef())
9177 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9178 else
9179 Ops[i] = DAG.getUNDEF(VT);
9180 }
9181
9182 // Next, we iteratively mix elements, e.g. for v4f32:
9183 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9184 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9185 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9186 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9187 // Generate scaled UNPCKL shuffle mask.
9188 SmallVector<int, 16> Mask;
9189 for(unsigned i = 0; i != Scale; ++i)
9190 Mask.push_back(i);
9191 for (unsigned i = 0; i != Scale; ++i)
9192 Mask.push_back(NumElems+i);
9193 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9194
9195 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9196 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9197 }
9198 return Ops[0];
9199}
9200
9201// 256-bit AVX can use the vinsertf128 instruction
9202// to create 256-bit vectors from two other 128-bit ones.
9203// TODO: Detect subvector broadcast here instead of DAG combine?
9204 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9205 const X86Subtarget &Subtarget) {
9206 SDLoc dl(Op);
9207 MVT ResVT = Op.getSimpleValueType();
9208
9209 assert((ResVT.is256BitVector() ||
9210 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9211
9212 unsigned NumOperands = Op.getNumOperands();
9213 unsigned NumFreezeUndef = 0;
9214 unsigned NumZero = 0;
9215 unsigned NumNonZero = 0;
9216 unsigned NonZeros = 0;
9217 for (unsigned i = 0; i != NumOperands; ++i) {
9218 SDValue SubVec = Op.getOperand(i);
9219 if (SubVec.isUndef())
9220 continue;
9221 if (ISD::isFreezeUndef(SubVec.getNode())) {
9222 // If the freeze(undef) has multiple uses then we must fold to zero.
9223 if (SubVec.hasOneUse())
9224 ++NumFreezeUndef;
9225 else
9226 ++NumZero;
9227 }
9228 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9229 ++NumZero;
9230 else {
9231 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9232 NonZeros |= 1 << i;
9233 ++NumNonZero;
9234 }
9235 }
9236
9237 // If we have more than 2 non-zeros, build each half separately.
9238 if (NumNonZero > 2) {
9239 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9240 ArrayRef<SDUse> Ops = Op->ops();
9241 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9242 Ops.slice(0, NumOperands/2));
9243 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9244 Ops.slice(NumOperands/2));
9245 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9246 }
9247
9248 // Otherwise, build it up through insert_subvectors.
9249 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9250 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9251 : DAG.getUNDEF(ResVT));
9252
9253 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9254 unsigned NumSubElems = SubVT.getVectorNumElements();
9255 for (unsigned i = 0; i != NumOperands; ++i) {
9256 if ((NonZeros & (1 << i)) == 0)
9257 continue;
9258
9259 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9260 Op.getOperand(i),
9261 DAG.getIntPtrConstant(i * NumSubElems, dl));
9262 }
9263
9264 return Vec;
9265}
9266
9267// Returns true if the given node is a type promotion (by concatenating i1
9268// zeros) of the result of a node that already zeros all upper bits of
9269// k-register.
9270// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9271 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9272 const X86Subtarget &Subtarget,
9273 SelectionDAG & DAG) {
9274 SDLoc dl(Op);
9275 MVT ResVT = Op.getSimpleValueType();
9276 unsigned NumOperands = Op.getNumOperands();
9277
9278 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9279 "Unexpected number of operands in CONCAT_VECTORS");
9280
9281 uint64_t Zeros = 0;
9282 uint64_t NonZeros = 0;
9283 for (unsigned i = 0; i != NumOperands; ++i) {
9284 SDValue SubVec = Op.getOperand(i);
9285 if (SubVec.isUndef())
9286 continue;
9287 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9288 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9289 Zeros |= (uint64_t)1 << i;
9290 else
9291 NonZeros |= (uint64_t)1 << i;
9292 }
9293
9294 unsigned NumElems = ResVT.getVectorNumElements();
9295
9296 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9297 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9298 // insert_subvector will give us two kshifts.
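// Illustrative sketch (assumed example, not from the source): for
// concat_vectors(v2i1 zero, v2i1 X, v2i1 undef, v2i1 undef) the single
// non-zero operand X sits above the zero operand, so X is widened into a
// wider mask type, shifted left by 2 with KSHIFTL to land at bit offset 2,
// and the v8i1 result is extracted from element 0.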
9299 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9300 Log2_64(NonZeros) != NumOperands - 1) {
9301 unsigned Idx = Log2_64(NonZeros);
9302 SDValue SubVec = Op.getOperand(Idx);
9303 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9304 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9305 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9306 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9307 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9308 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9309 DAG.getIntPtrConstant(0, dl));
9310 }
9311
9312 // If there are zero or one non-zeros we can handle this very simply.
9313 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9314 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9315 if (!NonZeros)
9316 return Vec;
9317 unsigned Idx = Log2_64(NonZeros);
9318 SDValue SubVec = Op.getOperand(Idx);
9319 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9320 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9321 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9322 }
9323
9324 if (NumOperands > 2) {
9325 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9326 ArrayRef<SDUse> Ops = Op->ops();
9327 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9328 Ops.slice(0, NumOperands/2));
9329 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9330 Ops.slice(NumOperands/2));
9331 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9332 }
9333
9334 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9335
9336 if (ResVT.getVectorNumElements() >= 16)
9337 return Op; // The operation is legal with KUNPCK
9338
9339 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9340 DAG.getUNDEF(ResVT), Op.getOperand(0),
9341 DAG.getIntPtrConstant(0, dl));
9342 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9343 DAG.getIntPtrConstant(NumElems/2, dl));
9344}
9345
9346 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9347 const X86Subtarget &Subtarget,
9348 SelectionDAG &DAG) {
9349 MVT VT = Op.getSimpleValueType();
9350 if (VT.getVectorElementType() == MVT::i1)
9351 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9352
9353 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9354 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9355 Op.getNumOperands() == 4)));
9356
9357 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9358 // from two other 128-bit ones.
9359
9360 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9361 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9362}
9363
9364//===----------------------------------------------------------------------===//
9365// Vector shuffle lowering
9366//
9367// This is an experimental code path for lowering vector shuffles on x86. It is
9368// designed to handle arbitrary vector shuffles and blends, gracefully
9369// degrading performance as necessary. It works hard to recognize idiomatic
9370// shuffles and lower them to optimal instruction patterns without leaving
9371// a framework that allows reasonably efficient handling of all vector shuffle
9372// patterns.
9373//===----------------------------------------------------------------------===//
9374
9375/// Tiny helper function to identify a no-op mask.
9376///
9377/// This is a somewhat boring predicate function. It checks whether the mask
9378/// array input, which is assumed to be a single-input shuffle mask of the kind
9379/// used by the X86 shuffle instructions (not a fully general
9380/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9381/// in-place shuffle are 'no-op's.
9382 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9383 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9384 assert(Mask[i] >= -1 && "Out of bound mask element!");
9385 if (Mask[i] >= 0 && Mask[i] != i)
9386 return false;
9387 }
9388 return true;
9389}
9390
9391/// Test whether there are elements crossing LaneSizeInBits lanes in this
9392/// shuffle mask.
9393///
9394/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9395/// and we routinely test for these.
9396static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9397 unsigned ScalarSizeInBits,
9398 ArrayRef<int> Mask) {
9399 assert(LaneSizeInBits && ScalarSizeInBits &&
9400 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9401 "Illegal shuffle lane size");
9402 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9403 int Size = Mask.size();
9404 for (int i = 0; i < Size; ++i)
9405 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9406 return true;
9407 return false;
9408}
9409
9410/// Test whether there are elements crossing 128-bit lanes in this
9411/// shuffle mask.
9412 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9413 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9414}
9415
9416/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9417/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9418/// better support 'repeated mask + lane permute' style shuffles.
9419static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9420 unsigned ScalarSizeInBits,
9421 ArrayRef<int> Mask) {
9422 assert(LaneSizeInBits && ScalarSizeInBits &&
9423 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9424 "Illegal shuffle lane size");
9425 int NumElts = Mask.size();
9426 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9427 int NumLanes = NumElts / NumEltsPerLane;
9428 if (NumLanes > 1) {
9429 for (int i = 0; i != NumLanes; ++i) {
9430 int SrcLane = -1;
9431 for (int j = 0; j != NumEltsPerLane; ++j) {
9432 int M = Mask[(i * NumEltsPerLane) + j];
9433 if (M < 0)
9434 continue;
9435 int Lane = (M % NumElts) / NumEltsPerLane;
9436 if (SrcLane >= 0 && SrcLane != Lane)
9437 return true;
9438 SrcLane = Lane;
9439 }
9440 }
9441 }
9442 return false;
9443}
9444
9445/// Test whether a shuffle mask is equivalent within each sub-lane.
9446///
9447/// This checks a shuffle mask to see if it is performing the same
9448/// lane-relative shuffle in each sub-lane. This trivially implies
9449/// that it is also not lane-crossing. It may however involve a blend from the
9450/// same lane of a second vector.
9451///
9452/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9453/// non-trivial to compute in the face of undef lanes. The representation is
9454/// suitable for use with existing 128-bit shuffles as entries from the second
9455/// vector have been remapped to [LaneSize, 2*LaneSize).
9456static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9457 ArrayRef<int> Mask,
9458 SmallVectorImpl<int> &RepeatedMask) {
9459 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9460 RepeatedMask.assign(LaneSize, -1);
9461 int Size = Mask.size();
9462 for (int i = 0; i < Size; ++i) {
9463 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9464 if (Mask[i] < 0)
9465 continue;
9466 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9467 // This entry crosses lanes, so there is no way to model this shuffle.
9468 return false;
9469
9470 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9471 // Adjust second vector indices to start at LaneSize instead of Size.
9472 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9473 : Mask[i] % LaneSize + LaneSize;
9474 if (RepeatedMask[i % LaneSize] < 0)
9475 // This is the first non-undef entry in this slot of a 128-bit lane.
9476 RepeatedMask[i % LaneSize] = LocalM;
9477 else if (RepeatedMask[i % LaneSize] != LocalM)
9478 // Found a mismatch with the repeated mask.
9479 return false;
9480 }
9481 return true;
9482}
9483
9484/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9485static bool
9486 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9487 SmallVectorImpl<int> &RepeatedMask) {
9488 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9489}
9490
9491static bool
9492 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9493 SmallVector<int, 32> RepeatedMask;
9494 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9495}
9496
9497/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9498static bool
9499 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9500 SmallVectorImpl<int> &RepeatedMask) {
9501 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9502}
9503
9504/// Test whether a target shuffle mask is equivalent within each sub-lane.
9505/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9506static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9507 unsigned EltSizeInBits,
9508 ArrayRef<int> Mask,
9509 SmallVectorImpl<int> &RepeatedMask) {
9510 int LaneSize = LaneSizeInBits / EltSizeInBits;
9511 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9512 int Size = Mask.size();
9513 for (int i = 0; i < Size; ++i) {
9514 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9515 if (Mask[i] == SM_SentinelUndef)
9516 continue;
9517 if (Mask[i] == SM_SentinelZero) {
9518 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9519 return false;
9520 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9521 continue;
9522 }
9523 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9524 // This entry crosses lanes, so there is no way to model this shuffle.
9525 return false;
9526
9527 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9528 // later vector indices to start at multiples of LaneSize instead of Size.
9529 int LaneM = Mask[i] / Size;
9530 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9531 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9532 // This is the first non-undef entry in this slot of a 128-bit lane.
9533 RepeatedMask[i % LaneSize] = LocalM;
9534 else if (RepeatedMask[i % LaneSize] != LocalM)
9535 // Found a mismatch with the repeated mask.
9536 return false;
9537 }
9538 return true;
9539}
9540
9541/// Test whether a target shuffle mask is equivalent within each sub-lane.
9542/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9543static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9544 ArrayRef<int> Mask,
9545 SmallVectorImpl<int> &RepeatedMask) {
9546 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9547 Mask, RepeatedMask);
9548}
9549
9550/// Checks whether the vector elements referenced by two shuffle masks are
9551/// equivalent.
9552static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9553 int Idx, int ExpectedIdx) {
9554 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9555 ExpectedIdx < MaskSize && "Out of range element index");
9556 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9557 return false;
9558
9559 switch (Op.getOpcode()) {
9560 case ISD::BUILD_VECTOR:
9561 // If the values are build vectors, we can look through them to find
9562 // equivalent inputs that make the shuffles equivalent.
9563 // TODO: Handle MaskSize != Op.getNumOperands()?
9564 if (MaskSize == (int)Op.getNumOperands() &&
9565 MaskSize == (int)ExpectedOp.getNumOperands())
9566 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9567 break;
9568 case X86ISD::VBROADCAST:
9569 case X86ISD::VBROADCAST_LOAD:
9570 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9571 return (Op == ExpectedOp &&
9572 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9573 case X86ISD::HADD:
9574 case X86ISD::HSUB:
9575 case X86ISD::FHADD:
9576 case X86ISD::FHSUB:
9577 case X86ISD::PACKSS:
9578 case X86ISD::PACKUS:
9579 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9580 // TODO: Handle MaskSize != NumElts?
9581 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9582 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9583 MVT VT = Op.getSimpleValueType();
9584 int NumElts = VT.getVectorNumElements();
9585 if (MaskSize == NumElts) {
9586 int NumLanes = VT.getSizeInBits() / 128;
9587 int NumEltsPerLane = NumElts / NumLanes;
9588 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9589 bool SameLane =
9590 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9591 bool SameElt =
9592 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9593 return SameLane && SameElt;
9594 }
9595 }
9596 break;
9597 }
9598
9599 return false;
9600}
9601
9602/// Checks whether a shuffle mask is equivalent to an explicit list of
9603/// arguments.
9604///
9605/// This is a fast way to test a shuffle mask against a fixed pattern:
9606///
9607 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9608///
9609/// It returns true if the mask is exactly as wide as the argument list, and
9610/// each element of the mask is either -1 (signifying undef) or the value given
9611/// in the argument.
9612static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9613 SDValue V1 = SDValue(),
9614 SDValue V2 = SDValue()) {
9615 int Size = Mask.size();
9616 if (Size != (int)ExpectedMask.size())
9617 return false;
9618
9619 for (int i = 0; i < Size; ++i) {
9620 assert(Mask[i] >= -1 && "Out of bound mask element!");
9621 int MaskIdx = Mask[i];
9622 int ExpectedIdx = ExpectedMask[i];
9623 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9624 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9625 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9626 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9627 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9628 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9629 return false;
9630 }
9631 }
9632 return true;
9633}
9634
9635/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9636///
9637/// The masks must be exactly the same width.
9638///
9639/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9640/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9641///
9642/// SM_SentinelZero is accepted as a valid negative index but must match in
9643/// both, or via a known bits test.
9644 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9645 ArrayRef<int> ExpectedMask,
9646 const SelectionDAG &DAG,
9647 SDValue V1 = SDValue(),
9648 SDValue V2 = SDValue()) {
9649 int Size = Mask.size();
9650 if (Size != (int)ExpectedMask.size())
9651 return false;
9652 assert(llvm::all_of(ExpectedMask,
9653 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9654 "Illegal target shuffle mask");
9655
9656 // Check for out-of-range target shuffle mask indices.
9657 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9658 return false;
9659
9660 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9661 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9662 !V1.getValueType().isVector()))
9663 V1 = SDValue();
9664 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9665 !V2.getValueType().isVector()))
9666 V2 = SDValue();
9667
9668 APInt ZeroV1 = APInt::getZero(Size);
9669 APInt ZeroV2 = APInt::getZero(Size);
9670
9671 for (int i = 0; i < Size; ++i) {
9672 int MaskIdx = Mask[i];
9673 int ExpectedIdx = ExpectedMask[i];
9674 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9675 continue;
9676 if (MaskIdx == SM_SentinelZero) {
9677 // If we need this expected index to be a zero element, then update the
9678 // relevant zero mask and perform the known bits at the end to minimize
9679 // repeated computes.
9680 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9681 if (ExpectedV &&
9682 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9683 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9684 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9685 ZeroMask.setBit(BitIdx);
9686 continue;
9687 }
9688 }
9689 if (MaskIdx >= 0) {
9690 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9691 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9692 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9693 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9694 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9695 continue;
9696 }
9697 return false;
9698 }
9699 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9700 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9701}
9702
9703// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9704// instructions.
9705 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9706 const SelectionDAG &DAG) {
9707 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9708 return false;
9709
9710 SmallVector<int, 8> Unpcklwd;
9711 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9712 /* Unary = */ false);
9713 SmallVector<int, 8> Unpckhwd;
9714 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9715 /* Unary = */ false);
9716 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9717 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9718 return IsUnpackwdMask;
9719}
9720
9721 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9722 const SelectionDAG &DAG) {
9723 // Create 128-bit vector type based on mask size.
9724 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9725 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9726
9727 // We can't assume a canonical shuffle mask, so try the commuted version too.
9728 SmallVector<int, 4> CommutedMask(Mask);
9729 ShuffleVectorSDNode::commuteMask(CommutedMask);
9730
9731 // Match any of unary/binary or low/high.
9732 for (unsigned i = 0; i != 4; ++i) {
9733 SmallVector<int, 16> UnpackMask;
9734 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9735 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9736 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9737 return true;
9738 }
9739 return false;
9740}
9741
9742/// Return true if a shuffle mask chooses elements identically in its top and
9743/// bottom halves. For example, any splat mask has the same top and bottom
9744/// halves. If an element is undefined in only one half of the mask, the halves
9745/// are not considered identical.
9746 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9747 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9748 unsigned HalfSize = Mask.size() / 2;
9749 for (unsigned i = 0; i != HalfSize; ++i) {
9750 if (Mask[i] != Mask[i + HalfSize])
9751 return false;
9752 }
9753 return true;
9754}
9755
9756/// Get a 4-lane 8-bit shuffle immediate for a mask.
9757///
9758/// This helper function produces an 8-bit shuffle immediate corresponding to
9759/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9760/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9761/// example.
9762///
9763/// NB: We rely heavily on "undef" masks preserving the input lane.
9764static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9765 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9766 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9767 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9768 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9769 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9770
9771 // If the mask only uses one non-undef element, then fully 'splat' it to
9772 // improve later broadcast matching.
9773 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9774 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9775
9776 int FirstElt = Mask[FirstIndex];
9777 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9778 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9779
9780 unsigned Imm = 0;
9781 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9782 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9783 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9784 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9785 return Imm;
9786}
9787
9788 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9789 SelectionDAG &DAG) {
9790 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9791}
9792
9793 // The shuffle result is as follows:
9794 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
9795 // Each element of Zeroable corresponds to a particular element of Mask,
9796 // as described in the computeZeroableShuffleElements function.
9797 //
9798 // The function looks for a sub-mask whose non-zero elements are in
9799 // increasing order. If such a sub-mask exists, the function returns true.
9800static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9801 ArrayRef<int> Mask, const EVT &VectorType,
9802 bool &IsZeroSideLeft) {
9803 int NextElement = -1;
9804 // Check if the Mask's nonzero elements are in increasing order.
9805 for (int i = 0, e = Mask.size(); i < e; i++) {
9806 // Checks if the mask's zeros elements are built from only zeros.
9807 assert(Mask[i] >= -1 && "Out of bound mask element!");
9808 if (Mask[i] < 0)
9809 return false;
9810 if (Zeroable[i])
9811 continue;
9812 // Find the lowest non zero element
9813 if (NextElement < 0) {
9814 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9815 IsZeroSideLeft = NextElement != 0;
9816 }
9817 // Exit if the mask's non zero elements are not in increasing order.
9818 if (NextElement != Mask[i])
9819 return false;
9820 NextElement++;
9821 }
9822 return true;
9823}
9824
9825/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9826 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9827 ArrayRef<int> Mask, SDValue V1,
9828 SDValue V2, const APInt &Zeroable,
9829 const X86Subtarget &Subtarget,
9830 SelectionDAG &DAG) {
9831 int Size = Mask.size();
9832 int LaneSize = 128 / VT.getScalarSizeInBits();
9833 const int NumBytes = VT.getSizeInBits() / 8;
9834 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9835
9836 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9837 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9838 (Subtarget.hasBWI() && VT.is512BitVector()));
9839
9840 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9841 // Sign bit set in i8 mask means zero element.
9842 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9843
9844 SDValue V;
9845 for (int i = 0; i < NumBytes; ++i) {
9846 int M = Mask[i / NumEltBytes];
9847 if (M < 0) {
9848 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9849 continue;
9850 }
9851 if (Zeroable[i / NumEltBytes]) {
9852 PSHUFBMask[i] = ZeroMask;
9853 continue;
9854 }
9855
9856 // We can only use a single input of V1 or V2.
9857 SDValue SrcV = (M >= Size ? V2 : V1);
9858 if (V && V != SrcV)
9859 return SDValue();
9860 V = SrcV;
9861 M %= Size;
9862
9863 // PSHUFB can't cross lanes, ensure this doesn't happen.
9864 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9865 return SDValue();
9866
9867 M = M % LaneSize;
9868 M = M * NumEltBytes + (i % NumEltBytes);
9869 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9870 }
9871 assert(V && "Failed to find a source input");
9872
9873 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9874 return DAG.getBitcast(
9875 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9876 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9877}
9878
9879static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9880 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9881 const SDLoc &dl);
9882
9883// X86 has dedicated shuffle that can be lowered to VEXPAND
9884 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9885 const APInt &Zeroable,
9886 ArrayRef<int> Mask, SDValue &V1,
9887 SDValue &V2, SelectionDAG &DAG,
9888 const X86Subtarget &Subtarget) {
9889 bool IsLeftZeroSide = true;
9890 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9891 IsLeftZeroSide))
9892 return SDValue();
9893 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9894 MVT IntegerType =
9895 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9896 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9897 unsigned NumElts = VT.getVectorNumElements();
9898 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9899 "Unexpected number of vector elements");
9900 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9901 Subtarget, DAG, DL);
9902 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9903 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9904 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9905}
9906
9907static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9908 unsigned &UnpackOpcode, bool IsUnary,
9909 ArrayRef<int> TargetMask, const SDLoc &DL,
9910 SelectionDAG &DAG,
9911 const X86Subtarget &Subtarget) {
9912 int NumElts = VT.getVectorNumElements();
9913
9914 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9915 for (int i = 0; i != NumElts; i += 2) {
9916 int M1 = TargetMask[i + 0];
9917 int M2 = TargetMask[i + 1];
9918 Undef1 &= (SM_SentinelUndef == M1);
9919 Undef2 &= (SM_SentinelUndef == M2);
9920 Zero1 &= isUndefOrZero(M1);
9921 Zero2 &= isUndefOrZero(M2);
9922 }
9923 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9924 "Zeroable shuffle detected");
9925
9926 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9927 SmallVector<int, 64> Unpckl, Unpckh;
9928 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9929 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9930 (IsUnary ? V1 : V2))) {
9931 UnpackOpcode = X86ISD::UNPCKL;
9932 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9933 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9934 return true;
9935 }
9936
9937 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9938 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9939 (IsUnary ? V1 : V2))) {
9940 UnpackOpcode = X86ISD::UNPCKH;
9941 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9942 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9943 return true;
9944 }
9945
9946 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9947 if (IsUnary && (Zero1 || Zero2)) {
9948 // Don't bother if we can blend instead.
9949 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9950 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9951 return false;
9952
9953 bool MatchLo = true, MatchHi = true;
9954 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9955 int M = TargetMask[i];
9956
9957 // Ignore if the input is known to be zero or the index is undef.
9958 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9959 (M == SM_SentinelUndef))
9960 continue;
9961
9962 MatchLo &= (M == Unpckl[i]);
9963 MatchHi &= (M == Unpckh[i]);
9964 }
9965
9966 if (MatchLo || MatchHi) {
9967 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9968 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9969 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9970 return true;
9971 }
9972 }
9973
9974 // If a binary shuffle, commute and try again.
9975 if (!IsUnary) {
9976 ShuffleVectorSDNode::commuteMask(Unpckl);
9977 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9978 UnpackOpcode = X86ISD::UNPCKL;
9979 std::swap(V1, V2);
9980 return true;
9981 }
9982
9983 ShuffleVectorSDNode::commuteMask(Unpckh);
9984 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9985 UnpackOpcode = X86ISD::UNPCKH;
9986 std::swap(V1, V2);
9987 return true;
9988 }
9989 }
9990
9991 return false;
9992}
9993
9994// X86 has dedicated unpack instructions that can handle specific blend
9995// operations: UNPCKH and UNPCKL.
9996 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9997 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9998 SelectionDAG &DAG) {
9999 SmallVector<int, 8> Unpckl;
10000 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10001 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10002 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10003
10004 SmallVector<int, 8> Unpckh;
10005 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10006 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10007 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10008
10009 // Commute and try again.
10010 ShuffleVectorSDNode::commuteMask(Unpckl);
10011 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10012 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10013
10014 ShuffleVectorSDNode::commuteMask(Unpckh);
10015 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10016 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10017
10018 return SDValue();
10019}
10020
10021/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10022/// followed by unpack 256-bit.
10023 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10024 ArrayRef<int> Mask, SDValue V1,
10025 SDValue V2, SelectionDAG &DAG) {
10026 SmallVector<int, 32> Unpckl, Unpckh;
10027 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10028 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10029
10030 unsigned UnpackOpcode;
10031 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10032 UnpackOpcode = X86ISD::UNPCKL;
10033 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10034 UnpackOpcode = X86ISD::UNPCKH;
10035 else
10036 return SDValue();
10037
10038 // This is a "natural" unpack operation (rather than the 128-bit sectored
10039 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10040 // input in order to use the x86 instruction.
10041 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10042 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10043 V1 = DAG.getBitcast(VT, V1);
10044 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10045}
10046
10047// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10048// source into the lower elements and zeroing the upper elements.
10049static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10050 ArrayRef<int> Mask, const APInt &Zeroable,
10051 const X86Subtarget &Subtarget) {
10052 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10053 return false;
10054
10055 unsigned NumElts = Mask.size();
10056 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10057 unsigned MaxScale = 64 / EltSizeInBits;
10058
10059 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10060 unsigned SrcEltBits = EltSizeInBits * Scale;
10061 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10062 continue;
10063 unsigned NumSrcElts = NumElts / Scale;
10064 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10065 continue;
10066 unsigned UpperElts = NumElts - NumSrcElts;
10067 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10068 continue;
10069 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10070 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10071 DstVT = MVT::getIntegerVT(EltSizeInBits);
10072 if ((NumSrcElts * EltSizeInBits) >= 128) {
10073 // ISD::TRUNCATE
10074 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10075 } else {
10076 // X86ISD::VTRUNC
10077 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10078 }
10079 return true;
10080 }
10081
10082 return false;
10083}
10084
10085// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10086// element padding to the final DstVT.
10087static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10088 const X86Subtarget &Subtarget,
10089 SelectionDAG &DAG, bool ZeroUppers) {
10090 MVT SrcVT = Src.getSimpleValueType();
10091 MVT DstSVT = DstVT.getScalarType();
10092 unsigned NumDstElts = DstVT.getVectorNumElements();
10093 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10094 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10095
10096 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10097 return SDValue();
10098
10099 // Perform a direct ISD::TRUNCATE if possible.
10100 if (NumSrcElts == NumDstElts)
10101 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10102
10103 if (NumSrcElts > NumDstElts) {
10104 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10105 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10106 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10107 }
10108
10109 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10110 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10111 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10112 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10113 DstVT.getSizeInBits());
10114 }
10115
10116 // Non-VLX targets must truncate from a 512-bit type, so we need to
10117 // widen, truncate and then possibly extract the original subvector.
10118 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10119 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10120 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10121 }
10122
10123 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10124 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10125 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10126 if (DstVT != TruncVT)
10127 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10128 DstVT.getSizeInBits());
10129 return Trunc;
10130}
10131
10132// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10133//
10134// An example is the following:
10135//
10136// t0: ch = EntryToken
10137// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10138// t25: v4i32 = truncate t2
10139// t41: v8i16 = bitcast t25
10140// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10141// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10142// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10143// t18: v2i64 = bitcast t51
10144//
10145 // One can just use a single vpmovdw instruction; without avx512vl we need to
10146 // use the zmm variant and extract the lower subvector, padding with zeroes.
10147// TODO: Merge with lowerShuffleAsVTRUNC.
10148 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10149 SDValue V2, ArrayRef<int> Mask,
10150 const APInt &Zeroable,
10151 const X86Subtarget &Subtarget,
10152 SelectionDAG &DAG) {
10153 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10154 if (!Subtarget.hasAVX512())
10155 return SDValue();
10156
10157 unsigned NumElts = VT.getVectorNumElements();
10158 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10159 unsigned MaxScale = 64 / EltSizeInBits;
10160 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10161 unsigned SrcEltBits = EltSizeInBits * Scale;
10162 unsigned NumSrcElts = NumElts / Scale;
10163 unsigned UpperElts = NumElts - NumSrcElts;
10164 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10165 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10166 continue;
10167
10168    // Attempt to find a matching source truncation, but as a fallback, VLX
10169    // cases can use the VPMOV directly.
10170 SDValue Src = peekThroughBitcasts(V1);
10171 if (Src.getOpcode() == ISD::TRUNCATE &&
10172 Src.getScalarValueSizeInBits() == SrcEltBits) {
10173 Src = Src.getOperand(0);
10174 } else if (Subtarget.hasVLX()) {
10175 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10176 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10177 Src = DAG.getBitcast(SrcVT, Src);
10178 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10179 if (Scale == 2 &&
10180 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10181 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10182 return SDValue();
10183 } else
10184 return SDValue();
10185
10186 // VPMOVWB is only available with avx512bw.
10187 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10188 return SDValue();
10189
10190 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10191 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10192 }
10193
10194 return SDValue();
10195}
10196
10197// Attempt to match binary shuffle patterns as a truncate.
10198static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10199                                    SDValue V2, ArrayRef<int> Mask,
10200 const APInt &Zeroable,
10201 const X86Subtarget &Subtarget,
10202 SelectionDAG &DAG) {
10203 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10204 "Unexpected VTRUNC type");
10205 if (!Subtarget.hasAVX512())
10206 return SDValue();
10207
10208 unsigned NumElts = VT.getVectorNumElements();
10209 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10210 unsigned MaxScale = 64 / EltSizeInBits;
10211 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10212 // TODO: Support non-BWI VPMOVWB truncations?
10213 unsigned SrcEltBits = EltSizeInBits * Scale;
10214 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10215 continue;
10216
10217 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10218 // Bail if the V2 elements are undef.
10219 unsigned NumHalfSrcElts = NumElts / Scale;
10220 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10221 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10222 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10223 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10224 continue;
10225
10226 // The elements beyond the truncation must be undef/zero.
10227 unsigned UpperElts = NumElts - NumSrcElts;
10228 if (UpperElts > 0 &&
10229 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10230 continue;
10231 bool UndefUppers =
10232 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10233
10234 // For offset truncations, ensure that the concat is cheap.
10235 if (Offset) {
10236 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10237 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10238 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10239 return Lo.getOperand(0) == Hi.getOperand(0);
10240 if (ISD::isNormalLoad(Lo.getNode()) &&
10241 ISD::isNormalLoad(Hi.getNode())) {
10242 auto *LDLo = cast<LoadSDNode>(Lo);
10243 auto *LDHi = cast<LoadSDNode>(Hi);
10244          return DAG.areNonVolatileConsecutiveLoads(
10245              LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10246 }
10247 return false;
10248 };
10249 if (!IsCheapConcat(V1, V2))
10250 continue;
10251 }
10252
10253      // As we're using both sources, we need to concat them together
10254      // and truncate from the double-sized src.
10255 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10256 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10257
10258 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10259 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10260 Src = DAG.getBitcast(SrcVT, Src);
10261
10262 // Shift the offset'd elements into place for the truncation.
10263 // TODO: Use getTargetVShiftByConstNode.
10264 if (Offset)
10265 Src = DAG.getNode(
10266 X86ISD::VSRLI, DL, SrcVT, Src,
10267 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10268
10269 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10270 }
10271 }
10272
10273 return SDValue();
10274}
10275
10276/// Check whether a compaction lowering can be done by dropping even/odd
10277/// elements and compute how many times even/odd elements must be dropped.
10278///
10279/// This handles shuffles which take every Nth element where N is a power of
10280/// two. Example shuffle masks:
10281///
10282/// (even)
10283/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10284/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10285/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10286/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10287/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10288/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10289///
10290/// (odd)
10291/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10292/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10293///
10294/// Any of these lanes can of course be undef.
10295///
10296/// This routine only supports N <= 3.
10297/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10298/// for larger N.
10299///
10300/// \returns N above, or the number of times even/odd elements must be dropped
10301/// if there is such a number. Otherwise returns zero.
10302static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10303 bool IsSingleInput) {
10304 // The modulus for the shuffle vector entries is based on whether this is
10305 // a single input or not.
10306 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10307 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10308 "We should only be called with masks with a power-of-2 size!");
10309
10310 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10311 int Offset = MatchEven ? 0 : 1;
10312
10313 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10314 // and 2^3 simultaneously. This is because we may have ambiguity with
10315 // partially undef inputs.
10316 bool ViableForN[3] = {true, true, true};
10317
10318 for (int i = 0, e = Mask.size(); i < e; ++i) {
10319 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10320 // want.
10321 if (Mask[i] < 0)
10322 continue;
10323
10324 bool IsAnyViable = false;
10325 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10326 if (ViableForN[j]) {
10327 uint64_t N = j + 1;
10328
10329 // The shuffle mask must be equal to (i * 2^N) % M.
10330 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10331 IsAnyViable = true;
10332 else
10333 ViableForN[j] = false;
10334 }
10335 // Early exit if we exhaust the possible powers of two.
10336 if (!IsAnyViable)
10337 break;
10338 }
10339
10340 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10341 if (ViableForN[j])
10342 return j + 1;
10343
10344 // Return 0 as there is no viable power of two.
10345 return 0;
10346}
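// Illustrative example: for the single-input v16i8 mask
// <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14>, ShuffleModulus is 16, so every
// defined element i must equal (i << N) & 15 (plus the even/odd offset).
// That only holds for N = 1, so the compaction drops the odd elements once.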
10347
10348// X86 has dedicated pack instructions that can handle specific truncation
10349// operations: PACKSS and PACKUS.
10350// Checks for compaction shuffle masks if MaxStages > 1.
10351// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10352static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10353 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10354 const SelectionDAG &DAG,
10355 const X86Subtarget &Subtarget,
10356 unsigned MaxStages = 1) {
10357 unsigned NumElts = VT.getVectorNumElements();
10358 unsigned BitSize = VT.getScalarSizeInBits();
10359 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10360 "Illegal maximum compaction");
10361
10362 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10363 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10364 unsigned NumPackedBits = NumSrcBits - BitSize;
10365 N1 = peekThroughBitcasts(N1);
10366 N2 = peekThroughBitcasts(N2);
10367 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10368 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10369 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10370 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10371 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10372 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10373 return false;
10374 if (Subtarget.hasSSE41() || BitSize == 8) {
10375 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10376 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10377 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10378 V1 = N1;
10379 V2 = N2;
10380 SrcVT = PackVT;
10381 PackOpcode = X86ISD::PACKUS;
10382 return true;
10383 }
10384 }
10385 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10386 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10387 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10388 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10389 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10390 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10391 V1 = N1;
10392 V2 = N2;
10393 SrcVT = PackVT;
10394 PackOpcode = X86ISD::PACKSS;
10395 return true;
10396 }
10397 return false;
10398 };
10399
10400 // Attempt to match against wider and wider compaction patterns.
10401 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10402 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10403 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10404
10405 // Try binary shuffle.
10406 SmallVector<int, 32> BinaryMask;
10407 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10408 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10409 if (MatchPACK(V1, V2, PackVT))
10410 return true;
10411
10412 // Try unary shuffle.
10413 SmallVector<int, 32> UnaryMask;
10414 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10415 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10416 if (MatchPACK(V1, V1, PackVT))
10417 return true;
10418 }
10419
10420 return false;
10421}
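// Illustrative example: for a v16i8 compaction of two v8i16 inputs (a single
// PACK stage), MatchPACK accepts PACKUS when the upper 8 bits of every source
// word are known zero, and PACKSS when each source word has more than 8 sign
// bits - mirroring the MaskedValueIsZero and ComputeNumSignBits checks above.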
10422
10423static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10424                                    SDValue V1, SDValue V2, SelectionDAG &DAG,
10425 const X86Subtarget &Subtarget) {
10426 MVT PackVT;
10427 unsigned PackOpcode;
10428 unsigned SizeBits = VT.getSizeInBits();
10429 unsigned EltBits = VT.getScalarSizeInBits();
10430 unsigned MaxStages = Log2_32(64 / EltBits);
10431 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10432 Subtarget, MaxStages))
10433 return SDValue();
10434
10435 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10436 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10437
10438 // Don't lower multi-stage packs on AVX512, truncation is better.
10439 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10440 return SDValue();
10441
10442 // Pack to the largest type possible:
10443 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10444 unsigned MaxPackBits = 16;
10445 if (CurrentEltBits > 16 &&
10446 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10447 MaxPackBits = 32;
10448
10449 // Repeatedly pack down to the target size.
10450 SDValue Res;
10451 for (unsigned i = 0; i != NumStages; ++i) {
10452 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10453 unsigned NumSrcElts = SizeBits / SrcEltBits;
10454 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10455 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10456 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10457 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10458 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10459 DAG.getBitcast(SrcVT, V2));
10460 V1 = V2 = Res;
10461 CurrentEltBits /= 2;
10462 }
10463 assert(Res && Res.getValueType() == VT &&
10464 "Failed to lower compaction shuffle");
10465 return Res;
10466}
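// Illustrative example: with PACKSS, compacting v4i32 source elements down to
// a v16i8 result takes NumStages = 2, so the loop above packs dword->word
// (PACKSSDW to v8i16) and then word->byte (PACKSSWB to v16i8).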
10467
10468/// Try to emit a bitmask instruction for a shuffle.
10469///
10470/// This handles cases where we can model a blend exactly as a bitmask due to
10471/// one of the inputs being zeroable.
10472static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10473                                     SDValue V2, ArrayRef<int> Mask,
10474 const APInt &Zeroable,
10475 const X86Subtarget &Subtarget,
10476 SelectionDAG &DAG) {
10477 MVT MaskVT = VT;
10478 MVT EltVT = VT.getVectorElementType();
10479 SDValue Zero, AllOnes;
10480 // Use f64 if i64 isn't legal.
10481 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10482 EltVT = MVT::f64;
10483 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10484 }
10485
10486 MVT LogicVT = VT;
10487 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10488 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10489 APFloat AllOnesValue =
10490        APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10491    AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10492 LogicVT =
10493 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10494 } else {
10495 Zero = DAG.getConstant(0, DL, EltVT);
10496 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10497 }
10498
10499 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10500 SDValue V;
10501 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10502 if (Zeroable[i])
10503 continue;
10504 if (Mask[i] % Size != i)
10505 return SDValue(); // Not a blend.
10506 if (!V)
10507 V = Mask[i] < Size ? V1 : V2;
10508 else if (V != (Mask[i] < Size ? V1 : V2))
10509 return SDValue(); // Can only let one input through the mask.
10510
10511 VMaskOps[i] = AllOnes;
10512 }
10513 if (!V)
10514 return SDValue(); // No non-zeroable elements!
10515
10516 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10517 VMask = DAG.getBitcast(LogicVT, VMask);
10518 V = DAG.getBitcast(LogicVT, V);
10519 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10520 return DAG.getBitcast(VT, And);
10521}
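// Illustrative example: a v4i32 shuffle whose elements 1 and 3 are zeroable
// and whose elements 0 and 2 come from V1 in place lowers to an AND of V1
// with the constant mask <-1, 0, -1, 0>.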
10522
10523/// Try to emit a blend instruction for a shuffle using bit math.
10524///
10525/// This is used as a fallback approach when first class blend instructions are
10526/// unavailable. Currently it is only suitable for integer vectors, but could
10527/// be generalized for floating point vectors if desirable.
10528static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10529                                      SDValue V2, ArrayRef<int> Mask,
10530 SelectionDAG &DAG) {
10531 assert(VT.isInteger() && "Only supports integer vector types!");
10532 MVT EltVT = VT.getVectorElementType();
10533 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10534 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10535  SmallVector<SDValue, 16> MaskOps;
10536  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10537 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10538 return SDValue(); // Shuffled input!
10539 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10540 }
10541
10542 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10543 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10544}
10545
10546static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10547                                    SDValue PreservedSrc,
10548 const X86Subtarget &Subtarget,
10549 SelectionDAG &DAG);
10550
10551static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10552                                MutableArrayRef<int> Mask,
10553                                const APInt &Zeroable, bool &ForceV1Zero,
10554 bool &ForceV2Zero, uint64_t &BlendMask) {
10555 bool V1IsZeroOrUndef =
10556      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10557  bool V2IsZeroOrUndef =
10558 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10559
10560 BlendMask = 0;
10561 ForceV1Zero = false, ForceV2Zero = false;
10562 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10563
10564 int NumElts = Mask.size();
10565 int NumLanes = VT.getSizeInBits() / 128;
10566 int NumEltsPerLane = NumElts / NumLanes;
10567 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10568
10569 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10570 // then ensure the blend mask part for that lane just references that input.
10571 bool ForceWholeLaneMasks =
10572 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10573
10574 // Attempt to generate the binary blend mask. If an input is zero then
10575 // we can use any lane.
10576 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10577 // Keep track of the inputs used per lane.
10578 bool LaneV1InUse = false;
10579 bool LaneV2InUse = false;
10580 uint64_t LaneBlendMask = 0;
10581 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10582 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10583 int M = Mask[Elt];
10584 if (M == SM_SentinelUndef)
10585 continue;
10586 if (M == Elt || (0 <= M && M < NumElts &&
10587 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10588 Mask[Elt] = Elt;
10589 LaneV1InUse = true;
10590 continue;
10591 }
10592 if (M == (Elt + NumElts) ||
10593 (NumElts <= M &&
10594 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10595 LaneBlendMask |= 1ull << LaneElt;
10596 Mask[Elt] = Elt + NumElts;
10597 LaneV2InUse = true;
10598 continue;
10599 }
10600 if (Zeroable[Elt]) {
10601 if (V1IsZeroOrUndef) {
10602 ForceV1Zero = true;
10603 Mask[Elt] = Elt;
10604 LaneV1InUse = true;
10605 continue;
10606 }
10607 if (V2IsZeroOrUndef) {
10608 ForceV2Zero = true;
10609 LaneBlendMask |= 1ull << LaneElt;
10610 Mask[Elt] = Elt + NumElts;
10611 LaneV2InUse = true;
10612 continue;
10613 }
10614 }
10615 return false;
10616 }
10617
10618 // If we only used V2 then splat the lane blend mask to avoid any demanded
10619 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10620 // blend mask bit).
10621 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10622 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10623
10624 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10625 }
10626 return true;
10627}
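// Illustrative example: the v4i32 mask <0,5,2,7> takes elements 1 and 3 from
// V2 and the rest from V1 in place, so matchShuffleAsBlend succeeds with
// BlendMask = 0b1010.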
10628
10629/// Try to emit a blend instruction for a shuffle.
10630///
10631/// This doesn't do any checks for the availability of instructions for blending
10632/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10633/// be matched in the backend with the type given. What it does check for is
10634/// that the shuffle mask is a blend, or convertible into a blend with zero.
10635static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10636                                   SDValue V2, ArrayRef<int> Original,
10637 const APInt &Zeroable,
10638 const X86Subtarget &Subtarget,
10639 SelectionDAG &DAG) {
10640 uint64_t BlendMask = 0;
10641 bool ForceV1Zero = false, ForceV2Zero = false;
10642 SmallVector<int, 64> Mask(Original);
10643 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10644 BlendMask))
10645 return SDValue();
10646
10647 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10648 if (ForceV1Zero)
10649 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10650 if (ForceV2Zero)
10651 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10652
10653 unsigned NumElts = VT.getVectorNumElements();
10654
10655 switch (VT.SimpleTy) {
10656 case MVT::v4i64:
10657 case MVT::v8i32:
10658 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10659 [[fallthrough]];
10660 case MVT::v4f64:
10661 case MVT::v8f32:
10662 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10663 [[fallthrough]];
10664 case MVT::v2f64:
10665 case MVT::v2i64:
10666 case MVT::v4f32:
10667 case MVT::v4i32:
10668 case MVT::v8i16:
10669 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10670 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10671 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10672 case MVT::v16i16: {
10673 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10674 SmallVector<int, 8> RepeatedMask;
10675 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10676 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10677 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10678 BlendMask = 0;
10679 for (int i = 0; i < 8; ++i)
10680 if (RepeatedMask[i] >= 8)
10681 BlendMask |= 1ull << i;
10682 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10683 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10684 }
10685 // Use PBLENDW for lower/upper lanes and then blend lanes.
10686 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10687 // merge to VSELECT where useful.
10688 uint64_t LoMask = BlendMask & 0xFF;
10689 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10690 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10691 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10692 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10693 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10694 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10695 return DAG.getVectorShuffle(
10696 MVT::v16i16, DL, Lo, Hi,
10697 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10698 }
10699 [[fallthrough]];
10700 }
10701 case MVT::v32i8:
10702 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10703 [[fallthrough]];
10704 case MVT::v16i8: {
10705 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10706
10707 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10708 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10709 Subtarget, DAG))
10710 return Masked;
10711
10712 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10713 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10714 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10715 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10716 }
10717
10718 // If we have VPTERNLOG, we can use that as a bit blend.
10719 if (Subtarget.hasVLX())
10720 if (SDValue BitBlend =
10721 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10722 return BitBlend;
10723
10724 // Scale the blend by the number of bytes per element.
10725 int Scale = VT.getScalarSizeInBits() / 8;
10726
10727 // This form of blend is always done on bytes. Compute the byte vector
10728 // type.
10729 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10730
10731 // x86 allows load folding with blendvb from the 2nd source operand. But
10732 // we are still using LLVM select here (see comment below), so that's V1.
10733 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10734 // allow that load-folding possibility.
10735 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10736      ShuffleVectorSDNode::commuteMask(Mask);
10737      std::swap(V1, V2);
10738 }
10739
10740 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10741 // mix of LLVM's code generator and the x86 backend. We tell the code
10742 // generator that boolean values in the elements of an x86 vector register
10743 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10744 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10745 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10746 // of the element (the remaining are ignored) and 0 in that high bit would
10747 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10748 // the LLVM model for boolean values in vector elements gets the relevant
10749    // bit set, it is set backwards and overconstrained relative to x86's
10750 // actual model.
10751 SmallVector<SDValue, 32> VSELECTMask;
10752 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10753 for (int j = 0; j < Scale; ++j)
10754 VSELECTMask.push_back(
10755 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10756 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10757 MVT::i8));
10758
10759 V1 = DAG.getBitcast(BlendVT, V1);
10760 V2 = DAG.getBitcast(BlendVT, V2);
10761 return DAG.getBitcast(
10762 VT,
10763 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10764 V1, V2));
10765 }
10766 case MVT::v16f32:
10767 case MVT::v8f64:
10768 case MVT::v8i64:
10769 case MVT::v16i32:
10770 case MVT::v32i16:
10771 case MVT::v64i8: {
10772 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10773 bool OptForSize = DAG.shouldOptForSize();
10774 if (!OptForSize) {
10775 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10776 Subtarget, DAG))
10777 return Masked;
10778 }
10779
10780 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10781 // masked move.
10782 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10783 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10784 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10785 }
10786 default:
10787 llvm_unreachable("Not a supported integer vector type!");
10788 }
10789}
10790
10791/// Try to lower as a blend of elements from two inputs followed by
10792/// a single-input permutation.
10793///
10794/// This matches the pattern where we can blend elements from two inputs and
10795/// then reduce the shuffle to a single-input permutation.
10796static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10797                                             SDValue V1, SDValue V2,
10798 ArrayRef<int> Mask,
10799 SelectionDAG &DAG,
10800 bool ImmBlends = false) {
10801 // We build up the blend mask while checking whether a blend is a viable way
10802 // to reduce the shuffle.
10803 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10804 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10805
10806 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10807 if (Mask[i] < 0)
10808 continue;
10809
10810 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10811
10812 if (BlendMask[Mask[i] % Size] < 0)
10813 BlendMask[Mask[i] % Size] = Mask[i];
10814 else if (BlendMask[Mask[i] % Size] != Mask[i])
10815 return SDValue(); // Can't blend in the needed input!
10816
10817 PermuteMask[i] = Mask[i] % Size;
10818 }
10819
10820 // If only immediate blends, then bail if the blend mask can't be widened to
10821 // i16.
10822 unsigned EltSize = VT.getScalarSizeInBits();
10823 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10824 return SDValue();
10825
10826 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10827 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10828}
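// Illustrative example: the v4i32 mask <6,0,3,5> is lowered here as the
// immediate blend <0,5,6,3> followed by the single-input permute <2,0,3,1>.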
10829
10830/// Try to lower as an unpack of elements from two inputs followed by
10831/// a single-input permutation.
10832///
10833/// This matches the pattern where we can unpack elements from two inputs and
10834/// then reduce the shuffle to a single-input (wider) permutation.
10835static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10836                                             SDValue V1, SDValue V2,
10837 ArrayRef<int> Mask,
10838 SelectionDAG &DAG) {
10839 int NumElts = Mask.size();
10840 int NumLanes = VT.getSizeInBits() / 128;
10841 int NumLaneElts = NumElts / NumLanes;
10842 int NumHalfLaneElts = NumLaneElts / 2;
10843
10844 bool MatchLo = true, MatchHi = true;
10845 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10846
10847 // Determine UNPCKL/UNPCKH type and operand order.
10848 for (int Elt = 0; Elt != NumElts; ++Elt) {
10849 int M = Mask[Elt];
10850 if (M < 0)
10851 continue;
10852
10853 // Normalize the mask value depending on whether it's V1 or V2.
10854 int NormM = M;
10855 SDValue &Op = Ops[Elt & 1];
10856 if (M < NumElts && (Op.isUndef() || Op == V1))
10857 Op = V1;
10858 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10859 Op = V2;
10860 NormM -= NumElts;
10861 } else
10862 return SDValue();
10863
10864 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10865 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10866 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10867 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10868 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10869 if (MatchLoAnyLane || MatchHiAnyLane) {
10870 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10871 "Failed to match UNPCKLO/UNPCKHI");
10872 break;
10873 }
10874 }
10875 MatchLo &= MatchLoAnyLane;
10876 MatchHi &= MatchHiAnyLane;
10877 if (!MatchLo && !MatchHi)
10878 return SDValue();
10879 }
10880 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10881
10882 // Element indices have changed after unpacking. Calculate permute mask
10883 // so that they will be put back to the position as dictated by the
10884 // original shuffle mask indices.
10885 SmallVector<int, 32> PermuteMask(NumElts, -1);
10886 for (int Elt = 0; Elt != NumElts; ++Elt) {
10887 int M = Mask[Elt];
10888 if (M < 0)
10889 continue;
10890 int NormM = M;
10891 if (NumElts <= M)
10892 NormM -= NumElts;
10893 bool IsFirstOp = M < NumElts;
10894 int BaseMaskElt =
10895 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10896 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10897 PermuteMask[Elt] = BaseMaskElt;
10898 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10899 PermuteMask[Elt] = BaseMaskElt + 1;
10900 assert(PermuteMask[Elt] != -1 &&
10901 "Input mask element is defined but failed to assign permute mask");
10902 }
10903
10904 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10905 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10906 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10907}
10908
10909/// Try to lower a shuffle as a permute of the inputs followed by an
10910/// UNPCK instruction.
10911///
10912/// This specifically targets cases where we end up alternating between
10913/// the two inputs, and so can permute them into something that feeds a single
10914/// UNPCK instruction. Note that this routine only targets integer vectors
10915/// because for floating point vectors we have a generalized SHUFPS lowering
10916/// strategy that handles everything that doesn't *exactly* match an unpack,
10917/// making this clever lowering unnecessary.
10918static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10919                                              SDValue V1, SDValue V2,
10920 ArrayRef<int> Mask,
10921 const X86Subtarget &Subtarget,
10922 SelectionDAG &DAG) {
10923 int Size = Mask.size();
10924 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10925
10926 // This routine only supports 128-bit integer dual input vectors.
10927 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10928 return SDValue();
10929
10930 int NumLoInputs =
10931 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10932 int NumHiInputs =
10933 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10934
10935 bool UnpackLo = NumLoInputs >= NumHiInputs;
10936
10937 auto TryUnpack = [&](int ScalarSize, int Scale) {
10938 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10939 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10940
10941 for (int i = 0; i < Size; ++i) {
10942 if (Mask[i] < 0)
10943 continue;
10944
10945 // Each element of the unpack contains Scale elements from this mask.
10946 int UnpackIdx = i / Scale;
10947
10948 // We only handle the case where V1 feeds the first slots of the unpack.
10949 // We rely on canonicalization to ensure this is the case.
10950 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10951 return SDValue();
10952
10953 // Setup the mask for this input. The indexing is tricky as we have to
10954 // handle the unpack stride.
10955 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10956 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10957 Mask[i] % Size;
10958 }
10959
10960 // If we will have to shuffle both inputs to use the unpack, check whether
10961 // we can just unpack first and shuffle the result. If so, skip this unpack.
10962 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10963 !isNoopShuffleMask(V2Mask))
10964 return SDValue();
10965
10966 // Shuffle the inputs into place.
10967 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10968 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10969
10970 // Cast the inputs to the type we will use to unpack them.
10971 MVT UnpackVT =
10972 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10973 V1 = DAG.getBitcast(UnpackVT, V1);
10974 V2 = DAG.getBitcast(UnpackVT, V2);
10975
10976 // Unpack the inputs and cast the result back to the desired type.
10977 return DAG.getBitcast(
10978 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10979 UnpackVT, V1, V2));
10980 };
10981
10982 // We try each unpack from the largest to the smallest to try and find one
10983 // that fits this mask.
10984 int OrigScalarSize = VT.getScalarSizeInBits();
10985 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10986 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10987 return Unpack;
10988
10989 // If we're shuffling with a zero vector then we're better off not doing
10990 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10991  if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10992      ISD::isBuildVectorAllZeros(V2.getNode()))
10993 return SDValue();
10994
10995 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10996 // initial unpack.
10997 if (NumLoInputs == 0 || NumHiInputs == 0) {
10998 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10999 "We have to have *some* inputs!");
11000 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11001
11002 // FIXME: We could consider the total complexity of the permute of each
11003 // possible unpacking. Or at the least we should consider how many
11004 // half-crossings are created.
11005 // FIXME: We could consider commuting the unpacks.
11006
11007 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11008 for (int i = 0; i < Size; ++i) {
11009 if (Mask[i] < 0)
11010 continue;
11011
11012 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11013
11014 PermMask[i] =
11015 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11016 }
11017 return DAG.getVectorShuffle(
11018 VT, DL,
11019 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11020 V1, V2),
11021 DAG.getUNDEF(VT), PermMask);
11022 }
11023
11024 return SDValue();
11025}
11026
11027/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11028/// permuting the elements of the result in place.
11029static SDValue lowerShuffleAsByteRotateAndPermute(
11030    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11031 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11032 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11033 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11034 (VT.is512BitVector() && !Subtarget.hasBWI()))
11035 return SDValue();
11036
11037 // We don't currently support lane crossing permutes.
11038 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11039 return SDValue();
11040
11041 int Scale = VT.getScalarSizeInBits() / 8;
11042 int NumLanes = VT.getSizeInBits() / 128;
11043 int NumElts = VT.getVectorNumElements();
11044 int NumEltsPerLane = NumElts / NumLanes;
11045
11046 // Determine range of mask elts.
11047 bool Blend1 = true;
11048 bool Blend2 = true;
11049 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11050 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11051 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11052 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11053 int M = Mask[Lane + Elt];
11054 if (M < 0)
11055 continue;
11056 if (M < NumElts) {
11057 Blend1 &= (M == (Lane + Elt));
11058 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11059 M = M % NumEltsPerLane;
11060 Range1.first = std::min(Range1.first, M);
11061 Range1.second = std::max(Range1.second, M);
11062 } else {
11063 M -= NumElts;
11064 Blend2 &= (M == (Lane + Elt));
11065 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11066 M = M % NumEltsPerLane;
11067 Range2.first = std::min(Range2.first, M);
11068 Range2.second = std::max(Range2.second, M);
11069 }
11070 }
11071 }
11072
11073 // Bail if we don't need both elements.
11074 // TODO - it might be worth doing this for unary shuffles if the permute
11075 // can be widened.
11076 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11077 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11078 return SDValue();
11079
11080 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11081 return SDValue();
11082
11083 // Rotate the 2 ops so we can access both ranges, then permute the result.
11084 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11085 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11086 SDValue Rotate = DAG.getBitcast(
11087 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11088 DAG.getBitcast(ByteVT, Lo),
11089 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11090 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11091 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11092 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11093 int M = Mask[Lane + Elt];
11094 if (M < 0)
11095 continue;
11096 if (M < NumElts)
11097 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11098 else
11099 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11100 }
11101 }
11102 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11103 };
11104
11105 // Check if the ranges are small enough to rotate from either direction.
11106 if (Range2.second < Range1.first)
11107 return RotateAndPermute(V1, V2, Range1.first, 0);
11108 if (Range1.second < Range2.first)
11109 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11110 return SDValue();
11111}
11112
11113static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11114  return isUndefOrEqual(Mask, 0);
11115}
11116
11117static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11118  return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11119}
11120
11121/// Check if the Mask consists of the same element repeated multiple times.
11122static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11123  size_t NumUndefs = 0;
11124 std::optional<int> UniqueElt;
11125 for (int Elt : Mask) {
11126 if (Elt == SM_SentinelUndef) {
11127 NumUndefs++;
11128 continue;
11129 }
11130 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11131 return false;
11132 UniqueElt = Elt;
11133 }
11134 // Make sure the element is repeated enough times by checking the number of
11135 // undefs is small.
11136 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11137}
11138
11139/// Generic routine to decompose a shuffle and blend into independent
11140/// blends and permutes.
11141///
11142/// This matches the extremely common pattern for handling combined
11143/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11144/// operations. It will try to pick the best arrangement of shuffles and
11145/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11146static SDValue lowerShuffleAsDecomposedShuffleMerge(
11147    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11148 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11149 int NumElts = Mask.size();
11150 int NumLanes = VT.getSizeInBits() / 128;
11151 int NumEltsPerLane = NumElts / NumLanes;
11152
11153 // Shuffle the input elements into the desired positions in V1 and V2 and
11154 // unpack/blend them together.
11155 bool IsAlternating = true;
11156 SmallVector<int, 32> V1Mask(NumElts, -1);
11157 SmallVector<int, 32> V2Mask(NumElts, -1);
11158 SmallVector<int, 32> FinalMask(NumElts, -1);
11159 for (int i = 0; i < NumElts; ++i) {
11160 int M = Mask[i];
11161 if (M >= 0 && M < NumElts) {
11162 V1Mask[i] = M;
11163 FinalMask[i] = i;
11164 IsAlternating &= (i & 1) == 0;
11165 } else if (M >= NumElts) {
11166 V2Mask[i] = M - NumElts;
11167 FinalMask[i] = i + NumElts;
11168 IsAlternating &= (i & 1) == 1;
11169 }
11170 }
11171
11172  // If we effectively only demand the 0'th element of \p Input (though not
11173  // necessarily only in the 0'th position), then broadcast said input
11174  // and change \p InputMask to be a no-op (identity) mask.
11175 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11176 &DAG](SDValue &Input,
11177 MutableArrayRef<int> InputMask) {
11178 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11179 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11180 !X86::mayFoldLoad(Input, Subtarget)))
11181 return;
11182 if (isNoopShuffleMask(InputMask))
11183 return;
11184 assert(isBroadcastShuffleMask(InputMask) &&
11185 "Expected to demand only the 0'th element.");
11186 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11187 for (auto I : enumerate(InputMask)) {
11188 int &InputMaskElt = I.value();
11189 if (InputMaskElt >= 0)
11190 InputMaskElt = I.index();
11191 }
11192 };
11193
11194 // Currently, we may need to produce one shuffle per input, and blend results.
11195 // It is possible that the shuffle for one of the inputs is already a no-op.
11196 // See if we can simplify non-no-op shuffles into broadcasts,
11197 // which we consider to be strictly better than an arbitrary shuffle.
11198 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11199      isNoopOrBroadcastShuffleMask(V2Mask)) {
11200    canonicalizeBroadcastableInput(V1, V1Mask);
11201 canonicalizeBroadcastableInput(V2, V2Mask);
11202 }
11203
11204 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11205 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11206 // the shuffle may be able to fold with a load or other benefit. However, when
11207 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11208 // pre-shuffle first is a better strategy.
11209 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11210 // Only prefer immediate blends to unpack/rotate.
11211 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11212 DAG, true))
11213 return BlendPerm;
11214 // If either input vector provides only a single element which is repeated
11215 // multiple times, unpacking from both input vectors would generate worse
11216 // code. e.g. for
11217 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11218 // it is better to process t4 first to create a vector of t4[0], then unpack
11219 // that vector with t2.
11220 if (!isSingleElementRepeatedMask(V1Mask) &&
11221        !isSingleElementRepeatedMask(V2Mask))
11222      if (SDValue UnpackPerm =
11223 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11224 return UnpackPerm;
11225    if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11226            DL, VT, V1, V2, Mask, Subtarget, DAG))
11227 return RotatePerm;
11228 // Unpack/rotate failed - try again with variable blends.
11229 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11230 DAG))
11231 return BlendPerm;
11232 if (VT.getScalarSizeInBits() >= 32)
11233 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11234 DL, VT, V1, V2, Mask, Subtarget, DAG))
11235 return PermUnpack;
11236 }
11237
11238 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11239 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11240 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11241 // than half the elements coming from each source.
11242 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11243 V1Mask.assign(NumElts, -1);
11244 V2Mask.assign(NumElts, -1);
11245 FinalMask.assign(NumElts, -1);
11246 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11247 for (int j = 0; j != NumEltsPerLane; ++j) {
11248 int M = Mask[i + j];
11249 if (M >= 0 && M < NumElts) {
11250 V1Mask[i + (j / 2)] = M;
11251 FinalMask[i + j] = i + (j / 2);
11252 } else if (M >= NumElts) {
11253 V2Mask[i + (j / 2)] = M - NumElts;
11254 FinalMask[i + j] = i + (j / 2) + NumElts;
11255 }
11256 }
11257 }
11258
11259 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11260 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11261 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11262}
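// Illustrative example: the v4i32 mask <5,1,6,2> decomposes into
// V1Mask <-1,1,-1,2>, V2Mask <1,-1,2,-1> and FinalMask <4,1,6,3>, i.e. two
// single-input shuffles followed by a blend (unless one of the earlier
// blend/unpack/rotate strategies matched first).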
11263
11264static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11265 const X86Subtarget &Subtarget,
11266 ArrayRef<int> Mask) {
11267 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11268 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11269
11270 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11271 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11272 int MaxSubElts = 64 / EltSizeInBits;
11273 unsigned RotateAmt, NumSubElts;
11274 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11275 MaxSubElts, NumSubElts, RotateAmt))
11276 return -1;
11277 unsigned NumElts = Mask.size();
11278 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11279 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11280 return RotateAmt;
11281}
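// Illustrative example: the v16i8 mask <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>
// rotates the bytes within each 32-bit group, so it matches with
// RotateVT = v4i32 and RotateAmt = 8 (a rotate-left by one byte).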
11282
11283/// Lower shuffle using X86ISD::VROTLI rotations.
11284static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11285                                       ArrayRef<int> Mask,
11286 const X86Subtarget &Subtarget,
11287 SelectionDAG &DAG) {
11288 // Only XOP + AVX512 targets have bit rotation instructions.
11289 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11290 bool IsLegal =
11291 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11292 if (!IsLegal && Subtarget.hasSSE3())
11293 return SDValue();
11294
11295 MVT RotateVT;
11296 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11297 Subtarget, Mask);
11298 if (RotateAmt < 0)
11299 return SDValue();
11300
11301  // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11302  // expanded to OR(SRL,SHL), will be more efficient, but if they can
11303  // widen to vXi16 or more then the existing lowering will be better.
11304 if (!IsLegal) {
11305 if ((RotateAmt % 16) == 0)
11306 return SDValue();
11307 // TODO: Use getTargetVShiftByConstNode.
11308 unsigned ShlAmt = RotateAmt;
11309 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11310 V1 = DAG.getBitcast(RotateVT, V1);
11311 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11312 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11313 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11314 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11315 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11316 return DAG.getBitcast(VT, Rot);
11317 }
11318
11319 SDValue Rot =
11320 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11321 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11322 return DAG.getBitcast(VT, Rot);
11323}
11324
11325/// Try to match a vector shuffle as an element rotation.
11326///
11327/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11328static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11329 ArrayRef<int> Mask) {
11330 int NumElts = Mask.size();
11331
11332 // We need to detect various ways of spelling a rotation:
11333 // [11, 12, 13, 14, 15, 0, 1, 2]
11334 // [-1, 12, 13, 14, -1, -1, 1, -1]
11335 // [-1, -1, -1, -1, -1, -1, 1, 2]
11336 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11337 // [-1, 4, 5, 6, -1, -1, 9, -1]
11338 // [-1, 4, 5, 6, -1, -1, -1, -1]
11339 int Rotation = 0;
11340 SDValue Lo, Hi;
11341 for (int i = 0; i < NumElts; ++i) {
11342 int M = Mask[i];
11343 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11344 "Unexpected mask index.");
11345 if (M < 0)
11346 continue;
11347
11348 // Determine where a rotated vector would have started.
11349 int StartIdx = i - (M % NumElts);
11350 if (StartIdx == 0)
11351 // The identity rotation isn't interesting, stop.
11352 return -1;
11353
11354 // If we found the tail of a vector the rotation must be the missing
11355 // front. If we found the head of a vector, it must be how much of the
11356 // head.
11357 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11358
11359 if (Rotation == 0)
11360 Rotation = CandidateRotation;
11361 else if (Rotation != CandidateRotation)
11362 // The rotations don't match, so we can't match this mask.
11363 return -1;
11364
11365 // Compute which value this mask is pointing at.
11366 SDValue MaskV = M < NumElts ? V1 : V2;
11367
11368 // Compute which of the two target values this index should be assigned
11369 // to. This reflects whether the high elements are remaining or the low
11370 // elements are remaining.
11371 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11372
11373 // Either set up this value if we've not encountered it before, or check
11374 // that it remains consistent.
11375 if (!TargetV)
11376 TargetV = MaskV;
11377 else if (TargetV != MaskV)
11378 // This may be a rotation, but it pulls from the inputs in some
11379 // unsupported interleaving.
11380 return -1;
11381 }
11382
11383 // Check that we successfully analyzed the mask, and normalize the results.
11384 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11385 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11386 if (!Lo)
11387 Lo = Hi;
11388 else if (!Hi)
11389 Hi = Lo;
11390
11391 V1 = Lo;
11392 V2 = Hi;
11393
11394 return Rotation;
11395}
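// Illustrative example: for the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2],
// element 0 wants source element 11, giving StartIdx = -3 and a candidate
// rotation of 3; element 5 wants source element 0, giving StartIdx = 5 and a
// candidate rotation of 8 - 5 = 3. All defined elements agree, so the routine
// returns 3, leaving V1 as the low input and V2 as the high input.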
11396
11397/// Try to lower a vector shuffle as a byte rotation.
11398///
11399/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11400/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11401/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11402/// try to generically lower a vector shuffle through such a pattern. It
11403/// does not check for the profitability of lowering either as PALIGNR or
11404/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11405/// This matches shuffle vectors that look like:
11406///
11407/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11408///
11409/// Essentially it concatenates V1 and V2, shifts right by some number of
11410/// elements, and takes the low elements as the result. Note that while this is
11411/// specified as a *right shift* because x86 is little-endian, it is a *left
11412/// rotate* of the vector lanes.
11413static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11414                                    ArrayRef<int> Mask) {
11415 // Don't accept any shuffles with zero elements.
11416 if (isAnyZero(Mask))
11417 return -1;
11418
11419 // PALIGNR works on 128-bit lanes.
11420 SmallVector<int, 16> RepeatedMask;
11421 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11422 return -1;
11423
11424 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11425 if (Rotation <= 0)
11426 return -1;
11427
11428 // PALIGNR rotates bytes, so we need to scale the
11429 // rotation based on how many bytes are in the vector lane.
11430 int NumElts = RepeatedMask.size();
11431 int Scale = 16 / NumElts;
11432 return Rotation * Scale;
11433}
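// Illustrative example: for a v8i16 shuffle the repeated 128-bit lane mask has
// 8 elements, so Scale = 2 and an element rotation of 3 becomes a PALIGNR
// byte rotation of 6.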
11434
11435static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11436                                        SDValue V2, ArrayRef<int> Mask,
11437 const X86Subtarget &Subtarget,
11438 SelectionDAG &DAG) {
11439 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11440
11441 SDValue Lo = V1, Hi = V2;
11442 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11443 if (ByteRotation <= 0)
11444 return SDValue();
11445
11446 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11447 // PSLLDQ/PSRLDQ.
11448 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11449 Lo = DAG.getBitcast(ByteVT, Lo);
11450 Hi = DAG.getBitcast(ByteVT, Hi);
11451
11452 // SSSE3 targets can use the palignr instruction.
11453 if (Subtarget.hasSSSE3()) {
11454 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11455 "512-bit PALIGNR requires BWI instructions");
11456 return DAG.getBitcast(
11457 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11458 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11459 }
11460
11461 assert(VT.is128BitVector() &&
11462 "Rotate-based lowering only supports 128-bit lowering!");
11463 assert(Mask.size() <= 16 &&
11464 "Can shuffle at most 16 bytes in a 128-bit vector!");
11465 assert(ByteVT == MVT::v16i8 &&
11466 "SSE2 rotate lowering only needed for v16i8!");
11467
11468 // Default SSE2 implementation
11469 int LoByteShift = 16 - ByteRotation;
11470 int HiByteShift = ByteRotation;
11471
11472 SDValue LoShift =
11473 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11474 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11475 SDValue HiShift =
11476 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11477 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11478 return DAG.getBitcast(VT,
11479 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11480}
11481
11482/// Try to lower a vector shuffle as a dword/qword rotation.
11483///
11484/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11485/// rotation of the concatenation of two vectors; This routine will
11486/// try to generically lower a vector shuffle through such a pattern.
11487///
11488/// Essentially it concatenates V1 and V2, shifts right by some number of
11489/// elements, and takes the low elements as the result. Note that while this is
11490/// specified as a *right shift* because x86 is little-endian, it is a *left
11491/// rotate* of the vector lanes.
11492static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11493                                    SDValue V2, ArrayRef<int> Mask,
11494 const APInt &Zeroable,
11495 const X86Subtarget &Subtarget,
11496 SelectionDAG &DAG) {
11497 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11498 "Only 32-bit and 64-bit elements are supported!");
11499
11500 // 128/256-bit vectors are only supported with VLX.
11501 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11502 && "VLX required for 128/256-bit vectors");
11503
11504 SDValue Lo = V1, Hi = V2;
11505 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11506 if (0 < Rotation)
11507 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11508 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11509
11510 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11511 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11512 // TODO: We can probably make this more aggressive and use shift-pairs like
11513 // lowerShuffleAsByteShiftMask.
11514 unsigned NumElts = Mask.size();
11515 unsigned ZeroLo = Zeroable.countr_one();
11516 unsigned ZeroHi = Zeroable.countl_one();
11517 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11518 if (!ZeroLo && !ZeroHi)
11519 return SDValue();
11520
11521 if (ZeroLo) {
11522 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11523 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11524 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11525 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11526 getZeroVector(VT, Subtarget, DAG, DL),
11527 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11528 }
11529
11530 if (ZeroHi) {
11531 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11532 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11533 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11534 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11535 getZeroVector(VT, Subtarget, DAG, DL), Src,
11536 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11537 }
11538
11539 return SDValue();
11540}
11541
11542/// Try to lower a vector shuffle as a byte shift sequence.
11543static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11544                                           SDValue V2, ArrayRef<int> Mask,
11545 const APInt &Zeroable,
11546 const X86Subtarget &Subtarget,
11547 SelectionDAG &DAG) {
11548 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11549 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11550
11551 // We need a shuffle that has zeros at one/both ends and a sequential
11552 // shuffle from one source within.
11553 unsigned ZeroLo = Zeroable.countr_one();
11554 unsigned ZeroHi = Zeroable.countl_one();
11555 if (!ZeroLo && !ZeroHi)
11556 return SDValue();
11557
11558 unsigned NumElts = Mask.size();
11559 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11560 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11561 return SDValue();
11562
11563 unsigned Scale = VT.getScalarSizeInBits() / 8;
11564 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11565 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11566 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11567 return SDValue();
11568
11569 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11570 Res = DAG.getBitcast(MVT::v16i8, Res);
11571
11572 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11573 // inner sequential set of elements, possibly offset:
11574 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11575 // 01234567 --> 4567zzzz --> zzzzz456
11576 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11577 if (ZeroLo == 0) {
11578 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11579 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11580 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11581 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11582 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11583 } else if (ZeroHi == 0) {
11584 unsigned Shift = Mask[ZeroLo] % NumElts;
11585 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11586 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11587 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11588 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11589 } else if (!Subtarget.hasSSSE3()) {
11590    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11591 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11592 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11593 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11594 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11595 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11596 Shift += Mask[ZeroLo] % NumElts;
11597 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11598 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11599 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11600 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11601 } else
11602 return SDValue();
11603
11604 return DAG.getBitcast(VT, Res);
11605}
11606
11607/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11608///
11609/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11610/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11611/// matches elements from one of the input vectors shuffled to the left or
11612/// right with zeroable elements 'shifted in'. It handles both the strictly
11613/// bit-wise element shifts and the byte shift across an entire 128-bit double
11614/// quad word lane.
11615///
11616/// PSLL : (little-endian) left bit shift.
11617/// [ zz, 0, zz, 2 ]
11618/// [ -1, 4, zz, -1 ]
11619/// PSRL : (little-endian) right bit shift.
11620/// [ 1, zz, 3, zz]
11621/// [ -1, -1, 7, zz]
11622/// PSLLDQ : (little-endian) left byte shift
11623/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11624/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11625/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11626/// PSRLDQ : (little-endian) right byte shift
11627/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11628/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11629/// [ 1, 2, -1, -1, -1, -1, zz, zz]
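/// As an informal example with 32-bit elements, the first PSLL mask above,
/// [ zz, 0, zz, 2 ], is matched as a v2i64 VSHLI by 32 bits (Scale == 2,
/// Shift == 1 in the search below).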
11630static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11631 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11632 int MaskOffset, const APInt &Zeroable,
11633 const X86Subtarget &Subtarget) {
11634 int Size = Mask.size();
11635 unsigned SizeInBits = Size * ScalarSizeInBits;
11636
11637 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11638 for (int i = 0; i < Size; i += Scale)
11639 for (int j = 0; j < Shift; ++j)
11640 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11641 return false;
11642
11643 return true;
11644 };
11645
11646 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11647 for (int i = 0; i != Size; i += Scale) {
11648 unsigned Pos = Left ? i + Shift : i;
11649 unsigned Low = Left ? i : i + Shift;
11650 unsigned Len = Scale - Shift;
11651 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11652 return -1;
11653 }
11654
11655 int ShiftEltBits = ScalarSizeInBits * Scale;
11656 bool ByteShift = ShiftEltBits > 64;
11657 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11658 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11659 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11660
11661 // Normalize the scale for byte shifts to still produce an i64 element
11662 // type.
11663 Scale = ByteShift ? Scale / 2 : Scale;
11664
11665 // We need to round trip through the appropriate type for the shift.
11666 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11667 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11668 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11669 return (int)ShiftAmt;
11670 };
11671
11672 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11673 // keep doubling the size of the integer elements up to that. We can
11674 // then shift the elements of the integer vector by whole multiples of
11675 // their width within the elements of the larger integer vector. Test each
11676 // multiple to see if we can find a match with the moved element indices
11677 // and that the shifted in elements are all zeroable.
11678 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11679 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11680 for (int Shift = 1; Shift != Scale; ++Shift)
11681 for (bool Left : {true, false})
11682 if (CheckZeros(Shift, Scale, Left)) {
11683 int ShiftAmt = MatchShift(Shift, Scale, Left);
11684 if (0 < ShiftAmt)
11685 return ShiftAmt;
11686 }
11687
11688 // no match
11689 return -1;
11690}
11691
11692static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11693 SDValue V2, ArrayRef<int> Mask,
11694 const APInt &Zeroable,
11695 const X86Subtarget &Subtarget,
11696 SelectionDAG &DAG, bool BitwiseOnly) {
11697 int Size = Mask.size();
11698 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11699
11700 MVT ShiftVT;
11701 SDValue V = V1;
11702 unsigned Opcode;
11703
11704 // Try to match shuffle against V1 shift.
11705 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11706 Mask, 0, Zeroable, Subtarget);
11707
11708 // If V1 failed, try to match shuffle against V2 shift.
11709 if (ShiftAmt < 0) {
11710 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11711 Mask, Size, Zeroable, Subtarget);
11712 V = V2;
11713 }
11714
11715 if (ShiftAmt < 0)
11716 return SDValue();
11717
11718 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11719 return SDValue();
11720
11721 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11722 "Illegal integer vector type");
11723 V = DAG.getBitcast(ShiftVT, V);
11724 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11725 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11726 return DAG.getBitcast(VT, V);
11727}
11728
11729// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11730// Remainder of lower half result is zero and upper half is all undef.
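// E.g. (informally) a v8i16 mask [1,2,3,zz,u,u,u,u] matches with Len == 3
// and Idx == 1, giving BitLen == 48 and BitIdx == 16.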
11731static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11732 ArrayRef<int> Mask, uint64_t &BitLen,
11733 uint64_t &BitIdx, const APInt &Zeroable) {
11734 int Size = Mask.size();
11735 int HalfSize = Size / 2;
11736 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11737 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11738
11739 // Upper half must be undefined.
11740 if (!isUndefUpperHalf(Mask))
11741 return false;
11742
11743 // Determine the extraction length from the part of the
11744 // lower half that isn't zeroable.
11745 int Len = HalfSize;
11746 for (; Len > 0; --Len)
11747 if (!Zeroable[Len - 1])
11748 break;
11749 assert(Len > 0 && "Zeroable shuffle mask");
11750
11751 // Attempt to match first Len sequential elements from the lower half.
11752 SDValue Src;
11753 int Idx = -1;
11754 for (int i = 0; i != Len; ++i) {
11755 int M = Mask[i];
11756 if (M == SM_SentinelUndef)
11757 continue;
11758 SDValue &V = (M < Size ? V1 : V2);
11759 M = M % Size;
11760
11761 // The extracted elements must start at a valid index and all mask
11762 // elements must be in the lower half.
11763 if (i > M || M >= HalfSize)
11764 return false;
11765
11766 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11767 Src = V;
11768 Idx = M - i;
11769 continue;
11770 }
11771 return false;
11772 }
11773
11774 if (!Src || Idx < 0)
11775 return false;
11776
11777 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11778 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11779 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11780 V1 = Src;
11781 return true;
11782}
11783
11784// INSERTQ: Extract lowest Len elements from lower half of second source and
11785// insert over first source, starting at Idx.
11786// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
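// E.g. (informally) a v8i16 mask [0,8,9,3,u,u,u,u] matches with Idx == 1 and
// Len == 2 (insert B[0..1] over A at element 1), giving BitLen == 32 and
// BitIdx == 16.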
11787static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11788 ArrayRef<int> Mask, uint64_t &BitLen,
11789 uint64_t &BitIdx) {
11790 int Size = Mask.size();
11791 int HalfSize = Size / 2;
11792 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11793
11794 // Upper half must be undefined.
11795 if (!isUndefUpperHalf(Mask))
11796 return false;
11797
11798 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11799 SDValue Base;
11800
11801 // Attempt to match first source from mask before insertion point.
11802 if (isUndefInRange(Mask, 0, Idx)) {
11803 /* EMPTY */
11804 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11805 Base = V1;
11806 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11807 Base = V2;
11808 } else {
11809 continue;
11810 }
11811
11812 // Extend the extraction length looking to match both the insertion of
11813 // the second source and the remaining elements of the first.
11814 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11815 SDValue Insert;
11816 int Len = Hi - Idx;
11817
11818 // Match insertion.
11819 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11820 Insert = V1;
11821 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11822 Insert = V2;
11823 } else {
11824 continue;
11825 }
11826
11827 // Match the remaining elements of the lower half.
11828 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11829 /* EMPTY */
11830 } else if ((!Base || (Base == V1)) &&
11831 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11832 Base = V1;
11833 } else if ((!Base || (Base == V2)) &&
11834 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11835 Size + Hi)) {
11836 Base = V2;
11837 } else {
11838 continue;
11839 }
11840
11841 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11842 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11843 V1 = Base;
11844 V2 = Insert;
11845 return true;
11846 }
11847 }
11848
11849 return false;
11850}
11851
11852/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11853static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11854 SDValue V2, ArrayRef<int> Mask,
11855 const APInt &Zeroable, SelectionDAG &DAG) {
11856 uint64_t BitLen, BitIdx;
11857 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11858 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11859 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11860 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11861
11862 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11863 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11864 V2 ? V2 : DAG.getUNDEF(VT),
11865 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11866 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11867
11868 return SDValue();
11869}
11870
11871/// Lower a vector shuffle as a zero or any extension.
11872///
11873/// Given a specific number of elements, element bit width, and extension
11874/// stride, produce either a zero or any extension based on the available
11875/// features of the subtarget. The extended elements are consecutive and
11876/// begin at a possibly offset element index in the input; to
11877/// avoid excess shuffling the offset must either be in the bottom lane
11878/// or at the start of a higher lane. All extended elements must be from
11879/// the same lane.
11880static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11881 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11882 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11883 assert(Scale > 1 && "Need a scale to extend.");
11884 int EltBits = VT.getScalarSizeInBits();
11885 int NumElements = VT.getVectorNumElements();
11886 int NumEltsPerLane = 128 / EltBits;
11887 int OffsetLane = Offset / NumEltsPerLane;
11888 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11889 "Only 8, 16, and 32 bit elements can be extended.");
11890 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11891 assert(0 <= Offset && "Extension offset must be positive.");
11892 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11893 "Extension offset must be in the first lane or start an upper lane.");
11894
11895 // Check that an index is in same lane as the base offset.
11896 auto SafeOffset = [&](int Idx) {
11897 return OffsetLane == (Idx / NumEltsPerLane);
11898 };
11899
11900 // Shift along an input so that the offset base moves to the first element.
11901 auto ShuffleOffset = [&](SDValue V) {
11902 if (!Offset)
11903 return V;
11904
11905 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11906 for (int i = 0; i * Scale < NumElements; ++i) {
11907 int SrcIdx = i + Offset;
11908 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11909 }
11910 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11911 };
11912
11913 // Found a valid a/zext mask! Try various lowering strategies based on the
11914 // input type and available ISA extensions.
11915 if (Subtarget.hasSSE41()) {
11916 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11917 // PUNPCK will catch this in a later shuffle match.
11918 if (Offset && Scale == 2 && VT.is128BitVector())
11919 return SDValue();
11920 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11921 NumElements / Scale);
11922 InputV = DAG.getBitcast(VT, InputV);
11923 InputV = ShuffleOffset(InputV);
11924 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11925 DL, ExtVT, InputV, DAG);
11926 return DAG.getBitcast(VT, InputV);
11927 }
11928
11929 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11930 InputV = DAG.getBitcast(VT, InputV);
11931
11932 // For any extends we can cheat for larger element sizes and use shuffle
11933 // instructions that can fold with a load and/or copy.
11934 if (AnyExt && EltBits == 32) {
11935 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11936 -1};
11937 return DAG.getBitcast(
11938 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11939 DAG.getBitcast(MVT::v4i32, InputV),
11940 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11941 }
11942 if (AnyExt && EltBits == 16 && Scale > 2) {
11943 int PSHUFDMask[4] = {Offset / 2, -1,
11944 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11945 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11946 DAG.getBitcast(MVT::v4i32, InputV),
11947 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11948 int PSHUFWMask[4] = {1, -1, -1, -1};
11949 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11950 return DAG.getBitcast(
11951 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11952 DAG.getBitcast(MVT::v8i16, InputV),
11953 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11954 }
11955
11956 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11957 // to 64-bits.
11958 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11959 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11960 assert(VT.is128BitVector() && "Unexpected vector width!");
11961
11962 int LoIdx = Offset * EltBits;
11963 SDValue Lo = DAG.getBitcast(
11964 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11965 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11966 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11967
11968 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11969 return DAG.getBitcast(VT, Lo);
11970
11971 int HiIdx = (Offset + 1) * EltBits;
11972 SDValue Hi = DAG.getBitcast(
11973 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11974 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11975 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11976 return DAG.getBitcast(VT,
11977 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11978 }
11979
11980 // If this would require more than 2 unpack instructions to expand, use
11981 // pshufb when available. We can only use more than 2 unpack instructions
11982 // when zero extending i8 elements which also makes it easier to use pshufb.
11983 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11984 assert(NumElements == 16 && "Unexpected byte vector width!");
11985 SDValue PSHUFBMask[16];
11986 for (int i = 0; i < 16; ++i) {
11987 int Idx = Offset + (i / Scale);
11988 if ((i % Scale == 0 && SafeOffset(Idx))) {
11989 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11990 continue;
11991 }
11992 PSHUFBMask[i] =
11993 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11994 }
11995 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11996 return DAG.getBitcast(
11997 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11998 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11999 }
12000
12001 // If we are extending from an offset, ensure we start on a boundary that
12002 // we can unpack from.
12003 int AlignToUnpack = Offset % (NumElements / Scale);
12004 if (AlignToUnpack) {
12005 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12006 for (int i = AlignToUnpack; i < NumElements; ++i)
12007 ShMask[i - AlignToUnpack] = i;
12008 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12009 Offset -= AlignToUnpack;
12010 }
12011
12012 // Otherwise emit a sequence of unpacks.
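// E.g. (roughly) zero-extending i8 elements with Scale == 4 takes two
// rounds of this loop: unpack with zero up to i16 elements, then again up
// to i32 elements.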
12013 do {
12014 unsigned UnpackLoHi = X86ISD::UNPCKL;
12015 if (Offset >= (NumElements / 2)) {
12016 UnpackLoHi = X86ISD::UNPCKH;
12017 Offset -= (NumElements / 2);
12018 }
12019
12020 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12021 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12022 : getZeroVector(InputVT, Subtarget, DAG, DL);
12023 InputV = DAG.getBitcast(InputVT, InputV);
12024 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12025 Scale /= 2;
12026 EltBits *= 2;
12027 NumElements /= 2;
12028 } while (Scale > 1);
12029 return DAG.getBitcast(VT, InputV);
12030}
12031
12032/// Try to lower a vector shuffle as a zero extension on any microarch.
12033///
12034/// This routine will try to do everything in its power to cleverly lower
12035/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12036/// check for the profitability of this lowering; it tries to aggressively
12037/// match this pattern. It will use all of the micro-architectural details it
12038/// can to emit an efficient lowering. It handles both blends with all-zero
12039/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12040/// masking out later).
12041///
12042/// The reason we have dedicated lowering for zext-style shuffles is that they
12043/// are both incredibly common and often quite performance sensitive.
12044static SDValue lowerShuffleAsZeroOrAnyExtend(
12045 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12046 const APInt &Zeroable, const X86Subtarget &Subtarget,
12047 SelectionDAG &DAG) {
12048 int Bits = VT.getSizeInBits();
12049 int NumLanes = Bits / 128;
12050 int NumElements = VT.getVectorNumElements();
12051 int NumEltsPerLane = NumElements / NumLanes;
12052 assert(VT.getScalarSizeInBits() <= 32 &&
12053 "Exceeds 32-bit integer zero extension limit");
12054 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12055
12056 // Define a helper function to check a particular ext-scale and lower to it if
12057 // valid.
12058 auto Lower = [&](int Scale) -> SDValue {
12059 SDValue InputV;
12060 bool AnyExt = true;
12061 int Offset = 0;
12062 int Matches = 0;
12063 for (int i = 0; i < NumElements; ++i) {
12064 int M = Mask[i];
12065 if (M < 0)
12066 continue; // Valid anywhere but doesn't tell us anything.
12067 if (i % Scale != 0) {
12068 // Each of the extended elements needs to be zeroable.
12069 if (!Zeroable[i])
12070 return SDValue();
12071
12072 // We are no longer in the anyext case.
12073 AnyExt = false;
12074 continue;
12075 }
12076
12077 // Each of the base elements needs to be consecutive indices into the
12078 // same input vector.
12079 SDValue V = M < NumElements ? V1 : V2;
12080 M = M % NumElements;
12081 if (!InputV) {
12082 InputV = V;
12083 Offset = M - (i / Scale);
12084 } else if (InputV != V)
12085 return SDValue(); // Flip-flopping inputs.
12086
12087 // Offset must start in the lowest 128-bit lane or at the start of an
12088 // upper lane.
12089 // FIXME: Is it ever worth allowing a negative base offset?
12090 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12091 (Offset % NumEltsPerLane) == 0))
12092 return SDValue();
12093
12094 // If we are offsetting, all referenced entries must come from the same
12095 // lane.
12096 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12097 return SDValue();
12098
12099 if ((M % NumElements) != (Offset + (i / Scale)))
12100 return SDValue(); // Non-consecutive strided elements.
12101 Matches++;
12102 }
12103
12104 // If we fail to find an input, we have a zero-shuffle which should always
12105 // have already been handled.
12106 // FIXME: Maybe handle this here in case during blending we end up with one?
12107 if (!InputV)
12108 return SDValue();
12109
12110 // If we are offsetting, don't extend if we only match a single input, we
12111 // can always do better by using a basic PSHUF or PUNPCK.
12112 if (Offset != 0 && Matches < 2)
12113 return SDValue();
12114
12115 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12116 InputV, Mask, Subtarget, DAG);
12117 };
12118
12119 // The widest scale possible for extending is to a 64-bit integer.
12120 assert(Bits % 64 == 0 &&
12121 "The number of bits in a vector must be divisible by 64 on x86!");
12122 int NumExtElements = Bits / 64;
12123
12124 // Each iteration, try extending the elements half as much, but into twice as
12125 // many elements.
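// E.g. for a 128-bit vector the first attempt extends to 64-bit elements
// (Scale == NumElements / 2), the last to twice the original width
// (Scale == 2).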
12126 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12127 assert(NumElements % NumExtElements == 0 &&
12128 "The input vector size must be divisible by the extended size.");
12129 if (SDValue V = Lower(NumElements / NumExtElements))
12130 return V;
12131 }
12132
12133 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12134 if (Bits != 128)
12135 return SDValue();
12136
12137 // Returns one of the source operands if the shuffle can be reduced to a
12138 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12139 auto CanZExtLowHalf = [&]() {
12140 for (int i = NumElements / 2; i != NumElements; ++i)
12141 if (!Zeroable[i])
12142 return SDValue();
12143 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12144 return V1;
12145 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12146 return V2;
12147 return SDValue();
12148 };
12149
12150 if (SDValue V = CanZExtLowHalf()) {
12151 V = DAG.getBitcast(MVT::v2i64, V);
12152 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12153 return DAG.getBitcast(VT, V);
12154 }
12155
12156 // No viable ext lowering found.
12157 return SDValue();
12158}
12159
12160/// Try to get a scalar value for a specific element of a vector.
12161///
12162/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12163static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12164 SelectionDAG &DAG) {
12165 MVT VT = V.getSimpleValueType();
12166 MVT EltVT = VT.getVectorElementType();
12167 V = peekThroughBitcasts(V);
12168
12169 // If the bitcasts shift the element size, we can't extract an equivalent
12170 // element from it.
12171 MVT NewVT = V.getSimpleValueType();
12172 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12173 return SDValue();
12174
12175 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12176 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12177 // Ensure the scalar operand is the same size as the destination.
12178 // FIXME: Add support for scalar truncation where possible.
12179 SDValue S = V.getOperand(Idx);
12180 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12181 return DAG.getBitcast(EltVT, S);
12182 }
12183
12184 return SDValue();
12185}
12186
12187/// Helper to test for a load that can be folded with x86 shuffles.
12188///
12189/// This is particularly important because the set of instructions varies
12190/// significantly based on whether the operand is a load or not.
12191static bool isShuffleFoldableLoad(SDValue V) {
12192 return V->hasOneUse() &&
12193 ISD::isNormalLoad(V.getNode());
12194}
12195
12196template<typename T>
12197static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12198 T EltVT = VT.getScalarType();
12199 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12200}
12201
12202/// Try to lower insertion of a single element into a zero vector.
12203///
12204/// This is a common pattern for which we have especially efficient lowering
12205/// patterns across all subtarget feature sets.
12206static SDValue lowerShuffleAsElementInsertion(
12207 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12208 const APInt &Zeroable, const X86Subtarget &Subtarget,
12209 SelectionDAG &DAG) {
12210 MVT ExtVT = VT;
12211 MVT EltVT = VT.getVectorElementType();
12212 unsigned NumElts = VT.getVectorNumElements();
12213 unsigned EltBits = VT.getScalarSizeInBits();
12214
12215 if (isSoftF16(EltVT, Subtarget))
12216 return SDValue();
12217
12218 int V2Index =
12219 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12220 Mask.begin();
12221 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12222 bool IsV1Zeroable = true;
12223 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12224 if (i != V2Index && !Zeroable[i]) {
12225 IsV1Zeroable = false;
12226 break;
12227 }
12228
12229 // Bail if a non-zero V1 isn't used in place.
12230 if (!IsV1Zeroable) {
12231 SmallVector<int, 8> V1Mask(Mask);
12232 V1Mask[V2Index] = -1;
12233 if (!isNoopShuffleMask(V1Mask))
12234 return SDValue();
12235 }
12236
12237 // Check for a single input from a SCALAR_TO_VECTOR node.
12238 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12239 // all the smarts here sunk into that routine. However, the current
12240 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12241 // vector shuffle lowering is dead.
12242 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12243 DAG);
12244 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12245 // We need to zext the scalar if it is smaller than an i32.
12246 V2S = DAG.getBitcast(EltVT, V2S);
12247 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12248 // Using zext to expand a narrow element won't work for non-zero
12249 // insertions. But we can use a masked constant vector if we're
12250 // inserting V2 into the bottom of V1.
12251 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12252 return SDValue();
12253
12254 // Zero-extend directly to i32.
12255 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12256 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12257
12258 // If we're inserting into a constant, mask off the inserted index
12259 // and OR with the zero-extended scalar.
12260 if (!IsV1Zeroable) {
12261 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12262 Bits[V2Index] = APInt::getZero(EltBits);
12263 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12264 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12265 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12266 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12267 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12268 }
12269 }
12270 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12271 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12272 EltVT == MVT::i16) {
12273 // Either not inserting from the low element of the input or the input
12274 // element size is too small to use VZEXT_MOVL to clear the high bits.
12275 return SDValue();
12276 }
12277
12278 if (!IsV1Zeroable) {
12279 // If V1 can't be treated as a zero vector we have fewer options to lower
12280 // this. We can't support integer vectors or non-zero targets cheaply.
12281 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12282 if (!VT.isFloatingPoint() || V2Index != 0)
12283 return SDValue();
12284 if (!VT.is128BitVector())
12285 return SDValue();
12286
12287 // Otherwise, use MOVSD, MOVSS or MOVSH.
12288 unsigned MovOpc = 0;
12289 if (EltVT == MVT::f16)
12290 MovOpc = X86ISD::MOVSH;
12291 else if (EltVT == MVT::f32)
12292 MovOpc = X86ISD::MOVSS;
12293 else if (EltVT == MVT::f64)
12294 MovOpc = X86ISD::MOVSD;
12295 else
12296 llvm_unreachable("Unsupported floating point element type to handle!");
12297 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12298 }
12299
12300 // This lowering only works for the low element with floating point vectors.
12301 if (VT.isFloatingPoint() && V2Index != 0)
12302 return SDValue();
12303
12304 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12305 if (ExtVT != VT)
12306 V2 = DAG.getBitcast(VT, V2);
12307
12308 if (V2Index != 0) {
12309 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12310 // the desired position. Otherwise it is more efficient to do a vector
12311 // shift left. We know that we can do a vector shift left because all
12312 // the inputs are zero.
12313 if (VT.isFloatingPoint() || NumElts <= 4) {
12314 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12315 V2Shuffle[V2Index] = 0;
12316 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12317 } else {
12318 V2 = DAG.getBitcast(MVT::v16i8, V2);
12319 V2 = DAG.getNode(
12320 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12321 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12322 V2 = DAG.getBitcast(VT, V2);
12323 }
12324 }
12325 return V2;
12326}
12327
12328/// Try to lower broadcast of a single - truncated - integer element,
12329/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12330///
12331/// This assumes we have AVX2.
12332static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12333 int BroadcastIdx,
12334 const X86Subtarget &Subtarget,
12335 SelectionDAG &DAG) {
12336 assert(Subtarget.hasAVX2() &&
12337 "We can only lower integer broadcasts with AVX2!");
12338
12339 MVT EltVT = VT.getVectorElementType();
12340 MVT V0VT = V0.getSimpleValueType();
12341
12342 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12343 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12344
12345 MVT V0EltVT = V0VT.getVectorElementType();
12346 if (!V0EltVT.isInteger())
12347 return SDValue();
12348
12349 const unsigned EltSize = EltVT.getSizeInBits();
12350 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12351
12352 // This is only a truncation if the original element type is larger.
12353 if (V0EltSize <= EltSize)
12354 return SDValue();
12355
12356 assert(((V0EltSize % EltSize) == 0) &&
12357 "Scalar type sizes must all be powers of 2 on x86!");
12358
12359 const unsigned V0Opc = V0.getOpcode();
12360 const unsigned Scale = V0EltSize / EltSize;
12361 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12362
12363 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12364 V0Opc != ISD::BUILD_VECTOR)
12365 return SDValue();
12366
12367 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12368
12369 // If we're extracting non-least-significant bits, shift so we can truncate.
12370 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12371 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12372 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12373 if (const int OffsetIdx = BroadcastIdx % Scale)
12374 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12375 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12376
12377 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12378 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12379}
12380
12381/// Test whether this can be lowered with a single SHUFPS instruction.
12382///
12383/// This is used to disable more specialized lowerings when the shufps lowering
12384/// will happen to be efficient.
12385static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12386 // This routine only handles 128-bit shufps.
12387 assert(Mask.size() == 4 && "Unsupported mask size!");
12388 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12389 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12390 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12391 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12392
12393 // To lower with a single SHUFPS we need to have the low half and high half
12394 // each requiring a single input.
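// E.g. [0,1,6,7] can be a single SHUFPS, while [0,5,2,7] cannot because each
// half would need elements from both inputs.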
12395 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12396 return false;
12397 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12398 return false;
12399
12400 return true;
12401}
12402
12403/// Test whether the specified input (0 or 1) is in-place blended by the
12404/// given mask.
12405///
12406/// This returns true if the elements from a particular input are already in the
12407/// slots required by the given mask and require no permutation.
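/// E.g. for Input == 1 and the mask [0,5,2,7], elements 5 and 7 already sit
/// in slots 1 and 3 respectively, so input 1 is in place.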
12408static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12409 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12410 int Size = Mask.size();
12411 for (int i = 0; i < Size; ++i)
12412 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12413 return false;
12414
12415 return true;
12416}
12417
12418/// If we are extracting two 128-bit halves of a vector and shuffling the
12419/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12420/// multi-shuffle lowering.
12421static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12422 SDValue N1, ArrayRef<int> Mask,
12423 SelectionDAG &DAG) {
12424 MVT VT = N0.getSimpleValueType();
12425 assert((VT.is128BitVector() &&
12426 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12427 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12428
12429 // Check that both sources are extracts of the same source vector.
12430 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12431 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12432 N0.getOperand(0) != N1.getOperand(0) ||
12433 !N0.hasOneUse() || !N1.hasOneUse())
12434 return SDValue();
12435
12436 SDValue WideVec = N0.getOperand(0);
12437 MVT WideVT = WideVec.getSimpleValueType();
12438 if (!WideVT.is256BitVector())
12439 return SDValue();
12440
12441 // Match extracts of each half of the wide source vector. Commute the shuffle
12442 // if the extract of the low half is N1.
12443 unsigned NumElts = VT.getVectorNumElements();
12444 SmallVector<int, 4> NewMask(Mask);
12445 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12446 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12447 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12448 ShuffleVectorSDNode::commuteMask(NewMask);
12449 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12450 return SDValue();
12451
12452 // Final bailout: if the mask is simple, we are better off using an extract
12453 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12454 // because that avoids a constant load from memory.
12455 if (NumElts == 4 &&
12456 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12457 return SDValue();
12458
12459 // Extend the shuffle mask with undef elements.
12460 NewMask.append(NumElts, -1);
12461
12462 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12463 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12464 NewMask);
12465 // This is free: ymm -> xmm.
12466 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12467 DAG.getIntPtrConstant(0, DL));
12468}
12469
12470/// Try to lower broadcast of a single element.
12471///
12472/// For convenience, this code also bundles all of the subtarget feature set
12473/// filtering. While a little annoying to re-dispatch on type here, there isn't
12474/// a convenient way to factor it out.
12475static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12476 SDValue V2, ArrayRef<int> Mask,
12477 const X86Subtarget &Subtarget,
12478 SelectionDAG &DAG) {
12479 MVT EltVT = VT.getVectorElementType();
12480 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12481 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12482 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12483 return SDValue();
12484
12485 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12486 // we can only broadcast from a register with AVX2.
12487 unsigned NumEltBits = VT.getScalarSizeInBits();
12488 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12489 ? X86ISD::MOVDDUP
12490 : X86ISD::VBROADCAST;
12491 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12492
12493 // Check that the mask is a broadcast.
12494 int BroadcastIdx = getSplatIndex(Mask);
12495 if (BroadcastIdx < 0)
12496 return SDValue();
12497 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12498 "a sorted mask where the broadcast "
12499 "comes from V1.");
12500
12501 // Go up the chain of (vector) values to find a scalar load that we can
12502 // combine with the broadcast.
12503 // TODO: Combine this logic with findEltLoadSrc() used by
12504 // EltsFromConsecutiveLoads().
12505 int BitOffset = BroadcastIdx * NumEltBits;
12506 SDValue V = V1;
12507 for (;;) {
12508 switch (V.getOpcode()) {
12509 case ISD::BITCAST: {
12510 V = V.getOperand(0);
12511 continue;
12512 }
12513 case ISD::CONCAT_VECTORS: {
12514 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12515 int OpIdx = BitOffset / OpBitWidth;
12516 V = V.getOperand(OpIdx);
12517 BitOffset %= OpBitWidth;
12518 continue;
12519 }
12520 case ISD::EXTRACT_SUBVECTOR: {
12521 // The extraction index adds to the existing offset.
12522 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12523 unsigned Idx = V.getConstantOperandVal(1);
12524 unsigned BeginOffset = Idx * EltBitWidth;
12525 BitOffset += BeginOffset;
12526 V = V.getOperand(0);
12527 continue;
12528 }
12529 case ISD::INSERT_SUBVECTOR: {
12530 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12531 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12532 int Idx = (int)V.getConstantOperandVal(2);
12533 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12534 int BeginOffset = Idx * EltBitWidth;
12535 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12536 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12537 BitOffset -= BeginOffset;
12538 V = VInner;
12539 } else {
12540 V = VOuter;
12541 }
12542 continue;
12543 }
12544 }
12545 break;
12546 }
12547 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12548 BroadcastIdx = BitOffset / NumEltBits;
12549
12550 // Do we need to bitcast the source to retrieve the original broadcast index?
12551 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12552
12553 // Check if this is a broadcast of a scalar. We special case lowering
12554 // for scalars so that we can more effectively fold with loads.
12555 // If the original value has a larger element type than the shuffle, the
12556 // broadcast element is in essence truncated. Make that explicit to ease
12557 // folding.
12558 if (BitCastSrc && VT.isInteger())
12559 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12560 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12561 return TruncBroadcast;
12562
12563 // Also check the simpler case, where we can directly reuse the scalar.
12564 if (!BitCastSrc &&
12565 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12566 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12567 V = V.getOperand(BroadcastIdx);
12568
12569 // If we can't broadcast from a register, check that the input is a load.
12570 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12571 return SDValue();
12572 } else if (ISD::isNormalLoad(V.getNode()) &&
12573 cast<LoadSDNode>(V)->isSimple()) {
12574 // We do not check for one-use of the vector load because a broadcast load
12575 // is expected to be a win for code size, register pressure, and possibly
12576 // uops even if the original vector load is not eliminated.
12577
12578 // Reduce the vector load and shuffle to a broadcasted scalar load.
12579 LoadSDNode *Ld = cast<LoadSDNode>(V);
12580 SDValue BaseAddr = Ld->getOperand(1);
12581 MVT SVT = VT.getScalarType();
12582 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12583 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12584 SDValue NewAddr =
12585 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12586
12587 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12588 // than MOVDDUP.
12589 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12590 if (Opcode == X86ISD::VBROADCAST) {
12591 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12592 SDValue Ops[] = {Ld->getChain(), NewAddr};
12593 V = DAG.getMemIntrinsicNode(
12594 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12595 DAG.getMachineFunction().getMachineMemOperand(
12596 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12597 DAG.makeEquivalentMemoryOrdering(Ld, V);
12598 return DAG.getBitcast(VT, V);
12599 }
12600 assert(SVT == MVT::f64 && "Unexpected VT!");
12601 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12602 DAG.getMachineFunction().getMachineMemOperand(
12603 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12604 DAG.makeEquivalentMemoryOrdering(Ld, V);
12605 } else if (!BroadcastFromReg) {
12606 // We can't broadcast from a vector register.
12607 return SDValue();
12608 } else if (BitOffset != 0) {
12609 // We can only broadcast from the zero-element of a vector register,
12610 // but it can be advantageous to broadcast from the zero-element of a
12611 // subvector.
12612 if (!VT.is256BitVector() && !VT.is512BitVector())
12613 return SDValue();
12614
12615 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12616 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12617 return SDValue();
12618
12619 // Only broadcast the zero-element of a 128-bit subvector.
12620 if ((BitOffset % 128) != 0)
12621 return SDValue();
12622
12623 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12624 "Unexpected bit-offset");
12625 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12626 "Unexpected vector size");
12627 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12628 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12629 }
12630
12631 // On AVX we can use VBROADCAST directly for scalar sources.
12632 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12633 V = DAG.getBitcast(MVT::f64, V);
12634 if (Subtarget.hasAVX()) {
12635 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12636 return DAG.getBitcast(VT, V);
12637 }
12638 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12639 }
12640
12641 // If this is a scalar, do the broadcast on this type and bitcast.
12642 if (!V.getValueType().isVector()) {
12643 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12644 "Unexpected scalar size");
12645 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12646 VT.getVectorNumElements());
12647 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12648 }
12649
12650 // We only support broadcasting from 128-bit vectors to minimize the
12651 // number of patterns we need to deal with in isel. So extract down to
12652 // 128-bits, removing as many bitcasts as possible.
12653 if (V.getValueSizeInBits() > 128)
12654 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12655
12656 // Otherwise cast V to a vector with the same element type as VT, but
12657 // possibly narrower than VT. Then perform the broadcast.
12658 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12659 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12660 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12661}
12662
12663// Check for whether we can use INSERTPS to perform the shuffle. We only use
12664// INSERTPS when the V1 elements are already in the correct locations
12665// because otherwise we can just always use two SHUFPS instructions which
12666// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12667// perform INSERTPS if a single V1 element is out of place and all V2
12668// elements are zeroable.
12669static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12670 unsigned &InsertPSMask,
12671 const APInt &Zeroable,
12672 ArrayRef<int> Mask, SelectionDAG &DAG) {
12673 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12674 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12675 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12676
12677 // Attempt to match INSERTPS with one element from VA or VB being
12678 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12679 // are updated.
12680 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12681 ArrayRef<int> CandidateMask) {
12682 unsigned ZMask = 0;
12683 int VADstIndex = -1;
12684 int VBDstIndex = -1;
12685 bool VAUsedInPlace = false;
12686
12687 for (int i = 0; i < 4; ++i) {
12688 // Synthesize a zero mask from the zeroable elements (includes undefs).
12689 if (Zeroable[i]) {
12690 ZMask |= 1 << i;
12691 continue;
12692 }
12693
12694 // Flag if we use any VA inputs in place.
12695 if (i == CandidateMask[i]) {
12696 VAUsedInPlace = true;
12697 continue;
12698 }
12699
12700 // We can only insert a single non-zeroable element.
12701 if (VADstIndex >= 0 || VBDstIndex >= 0)
12702 return false;
12703
12704 if (CandidateMask[i] < 4) {
12705 // VA input out of place for insertion.
12706 VADstIndex = i;
12707 } else {
12708 // VB input for insertion.
12709 VBDstIndex = i;
12710 }
12711 }
12712
12713 // Don't bother if we have no (non-zeroable) element for insertion.
12714 if (VADstIndex < 0 && VBDstIndex < 0)
12715 return false;
12716
12717 // Determine element insertion src/dst indices. The src index is from the
12718 // start of the inserted vector, not the start of the concatenated vector.
12719 unsigned VBSrcIndex = 0;
12720 if (VADstIndex >= 0) {
12721 // If we have a VA input out of place, we use VA as the V2 element
12722 // insertion and don't use the original V2 at all.
12723 VBSrcIndex = CandidateMask[VADstIndex];
12724 VBDstIndex = VADstIndex;
12725 VB = VA;
12726 } else {
12727 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12728 }
12729
12730 // If no V1 inputs are used in place, then the result is created only from
12731 // the zero mask and the V2 insertion - so remove V1 dependency.
12732 if (!VAUsedInPlace)
12733 VA = DAG.getUNDEF(MVT::v4f32);
12734
12735 // Update V1, V2 and InsertPSMask accordingly.
12736 V1 = VA;
12737 V2 = VB;
12738
12739 // Insert the V2 element into the desired position.
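// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0].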
12740 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12741 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12742 return true;
12743 };
12744
12745 if (matchAsInsertPS(V1, V2, Mask))
12746 return true;
12747
12748 // Commute and try again.
12749 SmallVector<int, 4> CommutedMask(Mask);
12750 ShuffleVectorSDNode::commuteMask(CommutedMask);
12751 if (matchAsInsertPS(V2, V1, CommutedMask))
12752 return true;
12753
12754 return false;
12755}
12756
12757static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12758 ArrayRef<int> Mask, const APInt &Zeroable,
12759 SelectionDAG &DAG) {
12760 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12761 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12762
12763 // Attempt to match the insertps pattern.
12764 unsigned InsertPSMask = 0;
12765 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12766 return SDValue();
12767
12768 // Insert the V2 element into the desired position.
12769 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12770 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12771}
12772
12773/// Handle lowering of 2-lane 64-bit floating point shuffles.
12774///
12775/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12776/// support for floating point shuffles but not integer shuffles. These
12777/// instructions will incur a domain crossing penalty on some chips though so
12778/// it is better to avoid lowering through this for integer vectors where
12779/// possible.
12780static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12781 const APInt &Zeroable, SDValue V1, SDValue V2,
12782 const X86Subtarget &Subtarget,
12783 SelectionDAG &DAG) {
12784 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12785 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12786 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12787
12788 if (V2.isUndef()) {
12789 // Check for being able to broadcast a single element.
12790 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12791 Mask, Subtarget, DAG))
12792 return Broadcast;
12793
12794 // Straight shuffle of a single input vector. Simulate this by using the
12795 // single input as both of the "inputs" to this instruction.
12796 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
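// E.g. the splat mask <1,1> gives SHUFPDMask == 3, selecting the high
// element of V1 for both result elements.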
12797
12798 if (Subtarget.hasAVX()) {
12799 // If we have AVX, we can use VPERMILPS which will allow folding a load
12800 // into the shuffle.
12801 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12802 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12803 }
12804
12805 return DAG.getNode(
12806 X86ISD::SHUFP, DL, MVT::v2f64,
12807 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12808 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12809 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12810 }
12811 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12812 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12813 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12814 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12815
12816 if (Subtarget.hasAVX2())
12817 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12818 return Extract;
12819
12820 // When loading a scalar and then shuffling it into a vector we can often do
12821 // the insertion cheaply.
12822 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12823 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12824 return Insertion;
12825 // Try inverting the insertion since for v2 masks it is easy to do and we
12826 // can't reliably sort the mask one way or the other.
12827 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12828 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12829 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12830 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12831 return Insertion;
12832
12833 // Try to use one of the special instruction patterns to handle two common
12834 // blend patterns if a zero-blend above didn't work.
12835 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12836 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12837 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12838 // We can either use a special instruction to load over the low double or
12839 // to move just the low double.
12840 return DAG.getNode(
12841 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12842 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12843
12844 if (Subtarget.hasSSE41())
12845 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12846 Zeroable, Subtarget, DAG))
12847 return Blend;
12848
12849 // Use dedicated unpack instructions for masks that match their pattern.
12850 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12851 return V;
12852
12853 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12854 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12855 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12856}
12857
12858/// Handle lowering of 2-lane 64-bit integer shuffles.
12859///
12860/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12861/// the integer unit to minimize domain crossing penalties. However, for blends
12862/// it falls back to the floating point shuffle operation with appropriate bit
12863/// casting.
12864static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12865 const APInt &Zeroable, SDValue V1, SDValue V2,
12866 const X86Subtarget &Subtarget,
12867 SelectionDAG &DAG) {
12868 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12869 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12870 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12871
12872 if (V2.isUndef()) {
12873 // Check for being able to broadcast a single element.
12874 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12875 Mask, Subtarget, DAG))
12876 return Broadcast;
12877
12878 // Straight shuffle of a single input vector. For everything from SSE2
12879 // onward this has a single fast instruction with no scary immediates.
12880 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12881 V1 = DAG.getBitcast(MVT::v4i32, V1);
12882 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12883 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12884 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12885 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
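// E.g. the v2i64 mask <1,0> widens to the v4i32 PSHUFD mask <2,3,0,1>.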
12886 return DAG.getBitcast(
12887 MVT::v2i64,
12888 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12889 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12890 }
12891 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12892 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12893 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12894 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12895
12896 if (Subtarget.hasAVX2())
12897 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12898 return Extract;
12899
12900 // Try to use shift instructions.
12901 if (SDValue Shift =
12902 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12903 DAG, /*BitwiseOnly*/ false))
12904 return Shift;
12905
12906 // When loading a scalar and then shuffling it into a vector we can often do
12907 // the insertion cheaply.
12908 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12909 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12910 return Insertion;
12911 // Try inverting the insertion since for v2 masks it is easy to do and we
12912 // can't reliably sort the mask one way or the other.
12913 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12914 if (SDValue Insertion = lowerShuffleAsElementInsertion(
12915 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12916 return Insertion;
12917
12918 // We have different paths for blend lowering, but they all must use the
12919 // *exact* same predicate.
12920 bool IsBlendSupported = Subtarget.hasSSE41();
12921 if (IsBlendSupported)
12922 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12923 Zeroable, Subtarget, DAG))
12924 return Blend;
12925
12926 // Use dedicated unpack instructions for masks that match their pattern.
12927 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12928 return V;
12929
12930 // Try to use byte rotation instructions.
12931 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12932 if (Subtarget.hasSSSE3()) {
12933 if (Subtarget.hasVLX())
12934 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12935 Zeroable, Subtarget, DAG))
12936 return Rotate;
12937
12938 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12939 Subtarget, DAG))
12940 return Rotate;
12941 }
12942
12943 // If we have direct support for blends, we should lower by decomposing into
12944 // a permute. That will be faster than the domain cross.
12945 if (IsBlendSupported)
12946 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12947 Subtarget, DAG);
12948
12949 // We implement this with SHUFPD which is pretty lame because it will likely
12950 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12951 // However, all the alternatives are still more cycles and newer chips don't
12952 // have this problem. It would be really nice if x86 had better shuffles here.
12953 V1 = DAG.getBitcast(MVT::v2f64, V1);
12954 V2 = DAG.getBitcast(MVT::v2f64, V2);
12955 return DAG.getBitcast(MVT::v2i64,
12956 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12957}
12958
12959/// Lower a vector shuffle using the SHUFPS instruction.
12960///
12961/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12962/// It makes no assumptions about whether this is the *best* lowering; it simply
12963/// uses it.
12964static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12965 ArrayRef<int> Mask, SDValue V1,
12966 SDValue V2, SelectionDAG &DAG) {
12967 SDValue LowV = V1, HighV = V2;
12968 SmallVector<int, 4> NewMask(Mask);
12969 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12970
12971 if (NumV2Elements == 1) {
12972 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12973
12974 // Compute the index adjacent to V2Index and in the same half by toggling
12975 // the low bit.
12976 int V2AdjIndex = V2Index ^ 1;
12977
12978 if (Mask[V2AdjIndex] < 0) {
12979 // Handles all the cases where we have a single V2 element and an undef.
12980 // This will only ever happen in the high lanes because we commute the
12981 // vector otherwise.
12982 if (V2Index < 2)
12983 std::swap(LowV, HighV);
12984 NewMask[V2Index] -= 4;
12985 } else {
12986 // Handle the case where the V2 element ends up adjacent to a V1 element.
12987 // To make this work, blend them together as the first step.
12988 int V1Index = V2AdjIndex;
12989 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12990 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12991 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12992
12993 // Now proceed to reconstruct the final blend as we have the necessary
12994 // high or low half formed.
12995 if (V2Index < 2) {
12996 LowV = V2;
12997 HighV = V1;
12998 } else {
12999 HighV = V2;
13000 }
13001 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13002 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13003 }
13004 } else if (NumV2Elements == 2) {
13005 if (Mask[0] < 4 && Mask[1] < 4) {
13006 // Handle the easy case where we have V1 in the low lanes and V2 in the
13007 // high lanes.
13008 NewMask[2] -= 4;
13009 NewMask[3] -= 4;
13010 } else if (Mask[2] < 4 && Mask[3] < 4) {
13011 // We also handle the reversed case because this utility may get called
13012 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13013 // arrange things in the right direction.
13014 NewMask[0] -= 4;
13015 NewMask[1] -= 4;
13016 HighV = V1;
13017 LowV = V2;
13018 } else {
13019 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13020 // trying to place elements directly, just blend them and set up the final
13021 // shuffle to place them.
13022
13023 // The first two blend mask elements are for V1, the second two are for
13024 // V2.
13025 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13026 Mask[2] < 4 ? Mask[2] : Mask[3],
13027 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13028 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13029 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13030 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13031
13032 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13033 // a blend.
13034 LowV = HighV = V1;
13035 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13036 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13037 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13038 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13039 }
13040 } else if (NumV2Elements == 3) {
13041 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13042 // we can get here due to other paths (e.g. repeated mask matching) that we
13043 // don't want to do another round of lowerVECTOR_SHUFFLE.
13044 ShuffleVectorSDNode::commuteMask(NewMask);
13045 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13046 }
13047 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13048 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13049}
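// Worked example of the mixed low/high case above: for Mask = {0, 4, 1, 5}
// the first SHUFP uses BlendMask = {0, 1, 0, 1} to build
// V1' = {V1[0], V1[1], V2[0], V2[1]}, and the final SHUFP of V1' with itself
// uses NewMask = {0, 2, 1, 3}, yielding {V1[0], V2[0], V1[1], V2[1]}.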
13050
13051/// Lower 4-lane 32-bit floating point shuffles.
13052///
13053/// Uses instructions exclusively from the floating point unit to minimize
13054/// domain crossing penalties, as these are sufficient to implement all v4f32
13055/// shuffles.
13056 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13057 const APInt &Zeroable, SDValue V1, SDValue V2,
13058 const X86Subtarget &Subtarget,
13059 SelectionDAG &DAG) {
13060 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13061 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13062 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13063
13064 if (Subtarget.hasSSE41())
13065 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13066 Zeroable, Subtarget, DAG))
13067 return Blend;
13068
13069 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13070
13071 if (NumV2Elements == 0) {
13072 // Check for being able to broadcast a single element.
13073 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13074 Mask, Subtarget, DAG))
13075 return Broadcast;
13076
13077 // Use even/odd duplicate instructions for masks that match their pattern.
13078 if (Subtarget.hasSSE3()) {
13079 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13080 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13081 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13082 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13083 }
13084
13085 if (Subtarget.hasAVX()) {
13086 // If we have AVX, we can use VPERMILPS which will allow folding a load
13087 // into the shuffle.
13088 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13089 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13090 }
13091
13092 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13093 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13094 if (!Subtarget.hasSSE2()) {
13095 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13096 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13097 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13098 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13099 }
13100
13101 // Otherwise, use a straight shuffle of a single input vector. We pass the
13102 // input vector to both operands to simulate this with a SHUFPS.
13103 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13104 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13105 }
13106
13107 if (Subtarget.hasSSE2())
13108 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13109 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13110 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13111 return ZExt;
13112 }
13113
13114 if (Subtarget.hasAVX2())
13115 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13116 return Extract;
13117
13118 // There are special ways we can lower some single-element blends. However, we
13119 // have custom ways we can lower more complex single-element blends below that
13120 // we defer to if both this and BLENDPS fail to match, so restrict this to
13121 // when the V2 input is targeting element 0 of the mask -- that is the fast
13122 // case here.
13123 if (NumV2Elements == 1 && Mask[0] >= 4)
13124 if (SDValue V = lowerShuffleAsElementInsertion(
13125 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13126 return V;
13127
13128 if (Subtarget.hasSSE41()) {
13129 // Use INSERTPS if we can complete the shuffle efficiently.
13130 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13131 return V;
13132
13133 if (!isSingleSHUFPSMask(Mask))
13134 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13135 V2, Mask, DAG))
13136 return BlendPerm;
13137 }
13138
13139 // Use low/high mov instructions. These are only valid in SSE1 because
13140 // otherwise they are widened to v2f64 and never get here.
13141 if (!Subtarget.hasSSE2()) {
13142 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13143 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13144 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13145 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13146 }
13147
13148 // Use dedicated unpack instructions for masks that match their pattern.
13149 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13150 return V;
13151
13152 // Otherwise fall back to a SHUFPS lowering strategy.
13153 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13154}
13155
13156/// Lower 4-lane i32 vector shuffles.
13157///
13158/// We try to handle these with integer-domain shuffles where we can, but for
13159/// blends we use the floating point domain blend instructions.
13160 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13161 const APInt &Zeroable, SDValue V1, SDValue V2,
13162 const X86Subtarget &Subtarget,
13163 SelectionDAG &DAG) {
13164 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13165 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13166 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13167
13168 // Whenever we can lower this as a zext, that instruction is strictly faster
13169 // than any alternative. It also allows us to fold memory operands into the
13170 // shuffle in many cases.
13171 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13172 Zeroable, Subtarget, DAG))
13173 return ZExt;
13174
13175 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13176
13177 // Try to use shift instructions if fast.
13178 if (Subtarget.preferLowerShuffleAsShift()) {
13179 if (SDValue Shift =
13180 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13181 Subtarget, DAG, /*BitwiseOnly*/ true))
13182 return Shift;
13183 if (NumV2Elements == 0)
13184 if (SDValue Rotate =
13185 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13186 return Rotate;
13187 }
13188
13189 if (NumV2Elements == 0) {
13190 // Try to use broadcast unless the mask only has one non-undef element.
13191 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13192 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13193 Mask, Subtarget, DAG))
13194 return Broadcast;
13195 }
13196
13197 // Straight shuffle of a single input vector. For everything from SSE2
13198 // onward this has a single fast instruction with no scary immediates.
13199 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13200 // but we aren't actually going to use the UNPCK instruction because doing
13201 // so prevents folding a load into this instruction or making a copy.
13202 const int UnpackLoMask[] = {0, 0, 1, 1};
13203 const int UnpackHiMask[] = {2, 2, 3, 3};
13204 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13205 Mask = UnpackLoMask;
13206 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13207 Mask = UnpackHiMask;
13208
13209 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13210 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13211 }
13212
13213 if (Subtarget.hasAVX2())
13214 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13215 return Extract;
13216
13217 // Try to use shift instructions.
13218 if (SDValue Shift =
13219 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13220 DAG, /*BitwiseOnly*/ false))
13221 return Shift;
13222
13223 // There are special ways we can lower some single-element blends.
13224 if (NumV2Elements == 1)
13225 if (SDValue V = lowerShuffleAsElementInsertion(
13226 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13227 return V;
13228
13229 // We have different paths for blend lowering, but they all must use the
13230 // *exact* same predicate.
13231 bool IsBlendSupported = Subtarget.hasSSE41();
13232 if (IsBlendSupported)
13233 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13234 Zeroable, Subtarget, DAG))
13235 return Blend;
13236
13237 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13238 Zeroable, Subtarget, DAG))
13239 return Masked;
13240
13241 // Use dedicated unpack instructions for masks that match their pattern.
13242 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13243 return V;
13244
13245 // Try to use byte rotation instructions.
13246 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13247 if (Subtarget.hasSSSE3()) {
13248 if (Subtarget.hasVLX())
13249 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13250 Zeroable, Subtarget, DAG))
13251 return Rotate;
13252
13253 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13254 Subtarget, DAG))
13255 return Rotate;
13256 }
13257
13258 // Assume that a single SHUFPS is faster than an alternative sequence of
13259 // multiple instructions (even if the CPU has a domain penalty).
13260 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13261 if (!isSingleSHUFPSMask(Mask)) {
13262 // If we have direct support for blends, we should lower by decomposing into
13263 // a permute. That will be faster than the domain cross.
13264 if (IsBlendSupported)
13265 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13266 Subtarget, DAG);
13267
13268 // Try to lower by permuting the inputs into an unpack instruction.
13269 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13270 Mask, Subtarget, DAG))
13271 return Unpack;
13272 }
13273
13274 // We implement this with SHUFPS because it can blend from two vectors.
13275 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13276 // up the inputs, bypassing domain shift penalties that we would incur if we
13277 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13278 // relevant.
13279 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13280 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13281 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13282 return DAG.getBitcast(MVT::v4i32, ShufPS);
13283}
13284
13285/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13286/// shuffle lowering, and the most complex part.
13287///
13288/// The lowering strategy is to try to form pairs of input lanes which are
13289/// targeted at the same half of the final vector, and then use a dword shuffle
13290/// to place them onto the right half, and finally unpack the paired lanes into
13291/// their final position.
13292///
13293/// The exact breakdown of how to form these dword pairs and align them on the
13294/// correct sides is really tricky. See the comments within the function for
13295/// more of the details.
13296///
13297/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13298/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13299/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13300/// vector, form the analogous 128-bit 8-element Mask.
13301 static SDValue lowerV8I16GeneralSingleInputShuffle(
13302 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13303 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13304 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13305 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13306
13307 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13308 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13309 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13310
13311 // Attempt to directly match PSHUFLW or PSHUFHW.
13312 if (isUndefOrInRange(LoMask, 0, 4) &&
13313 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13314 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13315 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13316 }
13317 if (isUndefOrInRange(HiMask, 4, 8) &&
13318 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13319 for (int i = 0; i != 4; ++i)
13320 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13321 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13322 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13323 }
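// For example, Mask = {2, 1, 3, 0, 4, 5, 6, 7} hits the first case above: the
// high words are already in place, so a single PSHUFLW with the immediate
// 2 | (1 << 2) | (3 << 4) | (0 << 6) = 0x36 completes the shuffle.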
13324
13325 SmallVector<int, 4> LoInputs;
13326 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13327 array_pod_sort(LoInputs.begin(), LoInputs.end());
13328 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13329 SmallVector<int, 4> HiInputs;
13330 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13331 array_pod_sort(HiInputs.begin(), HiInputs.end());
13332 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13333 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13334 int NumHToL = LoInputs.size() - NumLToL;
13335 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13336 int NumHToH = HiInputs.size() - NumLToH;
13337 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13338 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13339 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13340 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13341
13342 // If we are shuffling values from one half - check how many different DWORD
13343 // pairs we need to create. If only 1 or 2 then we can perform this as a
13344 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
13345 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13346 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13347 V = DAG.getNode(ShufWOp, DL, VT, V,
13348 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13349 V = DAG.getBitcast(PSHUFDVT, V);
13350 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13351 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13352 return DAG.getBitcast(VT, V);
13353 };
13354
13355 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13356 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13357 SmallVector<std::pair<int, int>, 4> DWordPairs;
13358 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13359
13360 // Collect the different DWORD pairs.
13361 for (int DWord = 0; DWord != 4; ++DWord) {
13362 int M0 = Mask[2 * DWord + 0];
13363 int M1 = Mask[2 * DWord + 1];
13364 M0 = (M0 >= 0 ? M0 % 4 : M0);
13365 M1 = (M1 >= 0 ? M1 % 4 : M1);
13366 if (M0 < 0 && M1 < 0)
13367 continue;
13368
13369 bool Match = false;
13370 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13371 auto &DWordPair = DWordPairs[j];
13372 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13373 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13374 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13375 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13376 PSHUFDMask[DWord] = DOffset + j;
13377 Match = true;
13378 break;
13379 }
13380 }
13381 if (!Match) {
13382 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13383 DWordPairs.push_back(std::make_pair(M0, M1));
13384 }
13385 }
13386
13387 if (DWordPairs.size() <= 2) {
13388 DWordPairs.resize(2, std::make_pair(-1, -1));
13389 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13390 DWordPairs[1].first, DWordPairs[1].second};
13391 if ((NumHToL + NumHToH) == 0)
13392 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13393 if ((NumLToL + NumLToH) == 0)
13394 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13395 }
13396 }
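// As a concrete example of this dword-pair path: Mask = {0, 1, 0, 1, 2, 3, 2, 3}
// collects just two distinct pairs, (0,1) and (2,3), so it is emitted as an
// identity PSHUFLW followed by a PSHUFD with mask {0, 0, 1, 1} instead of
// falling through to the longer PSHUFD+PSHUFLW+PSHUFHW sequence below.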
13397
13398 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13399 // such inputs we can swap two of the dwords across the half mark and end up
13400 // with <=2 inputs to each half in each half. Once there, we can fall through
13401 // to the generic code below. For example:
13402 //
13403 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13404 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13405 //
13406 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13407 // and an existing 2-into-2 on the other half. In this case we may have to
13408 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13409 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13410 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13411 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13412 // half than the one we target for fixing) will be fixed when we re-enter this
13413 // path. We will also combine away any sequence of PSHUFD instructions that
13414 // result into a single instruction. Here is an example of the tricky case:
13415 //
13416 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13417 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13418 //
13419 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13420 //
13421 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13422 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13423 //
13424 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13425 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13426 //
13427 // The result is fine to be handled by the generic logic.
13428 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13429 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13430 int AOffset, int BOffset) {
13431 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13432 "Must call this with A having 3 or 1 inputs from the A half.");
13433 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13434 "Must call this with B having 1 or 3 inputs from the B half.");
13435 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13436 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13437
13438 bool ThreeAInputs = AToAInputs.size() == 3;
13439
13440 // Compute the index of dword with only one word among the three inputs in
13441 // a half by taking the sum of the half with three inputs and subtracting
13442 // the sum of the actual three inputs. The difference is the remaining
13443 // slot.
13444 int ADWord = 0, BDWord = 0;
13445 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13446 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13447 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13448 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13449 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13450 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13451 int TripleNonInputIdx =
13452 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13453 TripleDWord = TripleNonInputIdx / 2;
13454
13455 // We use xor with one to compute the adjacent DWord to whichever one the
13456 // OneInput is in.
13457 OneInputDWord = (OneInput / 2) ^ 1;
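// For instance, with AToAInputs = {0, 1, 3} (AOffset = 0) and a single cross
// input of word 5, TripleInputSum = 6 while the actual inputs sum to 4, so
// TripleNonInputIdx = 2, TripleDWord = 1, and OneInputDWord = (5 / 2) ^ 1 = 3.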
13458
13459 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13460 // and BToA inputs. If there is also such a problem with the BToB and AToB
13461 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13462 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13463 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13464 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13465 // Compute how many inputs will be flipped by swapping these DWords. We
13466 // need
13467 // to balance this to ensure we don't form a 3-1 shuffle in the other
13468 // half.
13469 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13470 llvm::count(AToBInputs, 2 * ADWord + 1);
13471 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13472 llvm::count(BToBInputs, 2 * BDWord + 1);
13473 if ((NumFlippedAToBInputs == 1 &&
13474 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13475 (NumFlippedBToBInputs == 1 &&
13476 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13477 // We choose whether to fix the A half or B half based on whether that
13478 // half has zero flipped inputs. At zero, we may not be able to fix it
13479 // with that half. We also bias towards fixing the B half because that
13480 // will more commonly be the high half, and we have to bias one way.
13481 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13482 ArrayRef<int> Inputs) {
13483 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13484 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13485 // Determine whether the free index is in the flipped dword or the
13486 // unflipped dword based on where the pinned index is. We use this bit
13487 // in an xor to conditionally select the adjacent dword.
13488 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13489 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13490 if (IsFixIdxInput == IsFixFreeIdxInput)
13491 FixFreeIdx += 1;
13492 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13493 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13494 "We need to be changing the number of flipped inputs!");
13495 int PSHUFHalfMask[] = {0, 1, 2, 3};
13496 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13497 V = DAG.getNode(
13498 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13499 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13500 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13501
13502 for (int &M : Mask)
13503 if (M >= 0 && M == FixIdx)
13504 M = FixFreeIdx;
13505 else if (M >= 0 && M == FixFreeIdx)
13506 M = FixIdx;
13507 };
13508 if (NumFlippedBToBInputs != 0) {
13509 int BPinnedIdx =
13510 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13511 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13512 } else {
13513 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13514 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13515 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13516 }
13517 }
13518 }
13519
13520 int PSHUFDMask[] = {0, 1, 2, 3};
13521 PSHUFDMask[ADWord] = BDWord;
13522 PSHUFDMask[BDWord] = ADWord;
13523 V = DAG.getBitcast(
13524 VT,
13525 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13526 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13527
13528 // Adjust the mask to match the new locations of A and B.
13529 for (int &M : Mask)
13530 if (M >= 0 && M/2 == ADWord)
13531 M = 2 * BDWord + M % 2;
13532 else if (M >= 0 && M/2 == BDWord)
13533 M = 2 * ADWord + M % 2;
13534
13535 // Recurse back into this routine to re-compute state now that this isn't
13536 // a 3 and 1 problem.
13537 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13538 };
13539 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13540 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13541 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13542 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13543
13544 // At this point there are at most two inputs to the low and high halves from
13545 // each half. That means the inputs can always be grouped into dwords and
13546 // those dwords can then be moved to the correct half with a dword shuffle.
13547 // We use at most one low and one high word shuffle to collect these paired
13548 // inputs into dwords, and finally a dword shuffle to place them.
13549 int PSHUFLMask[4] = {-1, -1, -1, -1};
13550 int PSHUFHMask[4] = {-1, -1, -1, -1};
13551 int PSHUFDMask[4] = {-1, -1, -1, -1};
13552
13553 // First fix the masks for all the inputs that are staying in their
13554 // original halves. This will then dictate the targets of the cross-half
13555 // shuffles.
13556 auto fixInPlaceInputs =
13557 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13558 MutableArrayRef<int> SourceHalfMask,
13559 MutableArrayRef<int> HalfMask, int HalfOffset) {
13560 if (InPlaceInputs.empty())
13561 return;
13562 if (InPlaceInputs.size() == 1) {
13563 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13564 InPlaceInputs[0] - HalfOffset;
13565 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13566 return;
13567 }
13568 if (IncomingInputs.empty()) {
13569 // Just fix all of the in place inputs.
13570 for (int Input : InPlaceInputs) {
13571 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13572 PSHUFDMask[Input / 2] = Input / 2;
13573 }
13574 return;
13575 }
13576
13577 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13578 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13579 InPlaceInputs[0] - HalfOffset;
13580 // Put the second input next to the first so that they are packed into
13581 // a dword. We find the adjacent index by toggling the low bit.
13582 int AdjIndex = InPlaceInputs[0] ^ 1;
13583 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13584 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13585 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13586 };
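// Example of the two-input case above: with InPlaceInputs = {5, 7} and
// HalfOffset = 4, word 5 keeps its slot (SourceHalfMask[1] = 1), word 7 is
// pulled into the adjacent slot (SourceHalfMask[0] = 3), uses of 7 in the
// half mask are rewritten to 4, and PSHUFDMask[2] = 2 keeps that dword put.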
13587 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13588 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13589
13590 // Now gather the cross-half inputs and place them into a free dword of
13591 // their target half.
13592 // FIXME: This operation could almost certainly be simplified dramatically to
13593 // look more like the 3-1 fixing operation.
13594 auto moveInputsToRightHalf = [&PSHUFDMask](
13595 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13596 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13597 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13598 int DestOffset) {
13599 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13600 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13601 };
13602 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13603 int Word) {
13604 int LowWord = Word & ~1;
13605 int HighWord = Word | 1;
13606 return isWordClobbered(SourceHalfMask, LowWord) ||
13607 isWordClobbered(SourceHalfMask, HighWord);
13608 };
13609
13610 if (IncomingInputs.empty())
13611 return;
13612
13613 if (ExistingInputs.empty()) {
13614 // Map any dwords with inputs from them into the right half.
13615 for (int Input : IncomingInputs) {
13616 // If the source half mask maps over the inputs, turn those into
13617 // swaps and use the swapped lane.
13618 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13619 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13620 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13621 Input - SourceOffset;
13622 // We have to swap the uses in our half mask in one sweep.
13623 for (int &M : HalfMask)
13624 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13625 M = Input;
13626 else if (M == Input)
13627 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13628 } else {
13629 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13630 Input - SourceOffset &&
13631 "Previous placement doesn't match!");
13632 }
13633 // Note that this correctly re-maps both when we do a swap and when
13634 // we observe the other side of the swap above. We rely on that to
13635 // avoid swapping the members of the input list directly.
13636 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13637 }
13638
13639 // Map the input's dword into the correct half.
13640 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13641 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13642 else
13643 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13644 Input / 2 &&
13645 "Previous placement doesn't match!");
13646 }
13647
13648 // And just directly shift any other-half mask elements to be same-half
13649 // as we will have mirrored the dword containing the element into the
13650 // same position within that half.
13651 for (int &M : HalfMask)
13652 if (M >= SourceOffset && M < SourceOffset + 4) {
13653 M = M - SourceOffset + DestOffset;
13654 assert(M >= 0 && "This should never wrap below zero!");
13655 }
13656 return;
13657 }
13658
13659 // Ensure we have the input in a viable dword of its current half. This
13660 // is particularly tricky because the original position may be clobbered
13661 // by inputs being moved and *staying* in that half.
13662 if (IncomingInputs.size() == 1) {
13663 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13664 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13665 SourceOffset;
13666 SourceHalfMask[InputFixed - SourceOffset] =
13667 IncomingInputs[0] - SourceOffset;
13668 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13669 InputFixed);
13670 IncomingInputs[0] = InputFixed;
13671 }
13672 } else if (IncomingInputs.size() == 2) {
13673 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13674 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13675 // We have two non-adjacent or clobbered inputs we need to extract from
13676 // the source half. To do this, we need to map them into some adjacent
13677 // dword slot in the source mask.
13678 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13679 IncomingInputs[1] - SourceOffset};
13680
13681 // If there is a free slot in the source half mask adjacent to one of
13682 // the inputs, place the other input in it. We use (Index XOR 1) to
13683 // compute an adjacent index.
13684 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13685 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13686 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13687 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13688 InputsFixed[1] = InputsFixed[0] ^ 1;
13689 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13690 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13691 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13692 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13693 InputsFixed[0] = InputsFixed[1] ^ 1;
13694 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13695 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13696 // The two inputs are in the same DWord but it is clobbered and the
13697 // adjacent DWord isn't used at all. Move both inputs to the free
13698 // slot.
13699 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13700 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13701 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13702 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13703 } else {
13704 // The only way we hit this point is if there is no clobbering
13705 // (because there are no off-half inputs to this half) and there is no
13706 // free slot adjacent to one of the inputs. In this case, we have to
13707 // swap an input with a non-input.
13708 for (int i = 0; i < 4; ++i)
13709 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13710 "We can't handle any clobbers here!");
13711 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13712 "Cannot have adjacent inputs here!");
13713
13714 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13715 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13716
13717 // We also have to update the final source mask in this case because
13718 // it may need to undo the above swap.
13719 for (int &M : FinalSourceHalfMask)
13720 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13721 M = InputsFixed[1] + SourceOffset;
13722 else if (M == InputsFixed[1] + SourceOffset)
13723 M = (InputsFixed[0] ^ 1) + SourceOffset;
13724
13725 InputsFixed[1] = InputsFixed[0] ^ 1;
13726 }
13727
13728 // Point everything at the fixed inputs.
13729 for (int &M : HalfMask)
13730 if (M == IncomingInputs[0])
13731 M = InputsFixed[0] + SourceOffset;
13732 else if (M == IncomingInputs[1])
13733 M = InputsFixed[1] + SourceOffset;
13734
13735 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13736 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13737 }
13738 } else {
13739 llvm_unreachable("Unhandled input size!");
13740 }
13741
13742 // Now hoist the DWord down to the right half.
13743 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13744 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13745 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13746 for (int &M : HalfMask)
13747 for (int Input : IncomingInputs)
13748 if (M == Input)
13749 M = FreeDWord * 2 + Input % 2;
13750 };
13751 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13752 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13753 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13754 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13755
13756 // Now enact all the shuffles we've computed to move the inputs into their
13757 // target half.
13758 if (!isNoopShuffleMask(PSHUFLMask))
13759 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13760 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13761 if (!isNoopShuffleMask(PSHUFHMask))
13762 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13763 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13764 if (!isNoopShuffleMask(PSHUFDMask))
13765 V = DAG.getBitcast(
13766 VT,
13767 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13768 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13769
13770 // At this point, each half should contain all its inputs, and we can then
13771 // just shuffle them into their final position.
13772 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13773 "Failed to lift all the high half inputs to the low mask!");
13774 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13775 "Failed to lift all the low half inputs to the high mask!");
13776
13777 // Do a half shuffle for the low mask.
13778 if (!isNoopShuffleMask(LoMask))
13779 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13780 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13781
13782 // Do a half shuffle with the high mask after shifting its values down.
13783 for (int &M : HiMask)
13784 if (M >= 0)
13785 M -= 4;
13786 if (!isNoopShuffleMask(HiMask))
13787 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13788 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13789
13790 return V;
13791}
13792
13793/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13794/// blend if only one input is used.
13795 static SDValue lowerShuffleAsBlendOfPSHUFBs(
13796 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13797 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13798 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13799 "Lane crossing shuffle masks not supported");
13800
13801 int NumBytes = VT.getSizeInBits() / 8;
13802 int Size = Mask.size();
13803 int Scale = NumBytes / Size;
13804
13805 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13806 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13807 V1InUse = false;
13808 V2InUse = false;
13809
13810 for (int i = 0; i < NumBytes; ++i) {
13811 int M = Mask[i / Scale];
13812 if (M < 0)
13813 continue;
13814
13815 const int ZeroMask = 0x80;
13816 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13817 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13818 if (Zeroable[i / Scale])
13819 V1Idx = V2Idx = ZeroMask;
13820
13821 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13822 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13823 V1InUse |= (ZeroMask != V1Idx);
13824 V2InUse |= (ZeroMask != V2Idx);
13825 }
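// For a v8i16 shuffle, NumBytes = 16 and Scale = 2, so a word mask entry
// M < 8 becomes the selector bytes {2*M, 2*M+1} in V1Mask (0x80 in V2Mask),
// and M >= 8 becomes {2*(M-8), 2*(M-8)+1} in V2Mask. A selector byte with the
// high bit set (0x80) makes PSHUFB write zero into that result byte.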
13826
13827 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13828 if (V1InUse)
13829 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13830 DAG.getBuildVector(ShufVT, DL, V1Mask));
13831 if (V2InUse)
13832 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13833 DAG.getBuildVector(ShufVT, DL, V2Mask));
13834
13835 // If we need shuffled inputs from both, blend the two.
13836 SDValue V;
13837 if (V1InUse && V2InUse)
13838 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13839 else
13840 V = V1InUse ? V1 : V2;
13841
13842 // Cast the result back to the correct type.
13843 return DAG.getBitcast(VT, V);
13844}
13845
13846/// Generic lowering of 8-lane i16 shuffles.
13847///
13848/// This handles both single-input shuffles and combined shuffle/blends with
13849/// two inputs. The single input shuffles are immediately delegated to
13850/// a dedicated lowering routine.
13851///
13852/// The blends are lowered in one of three fundamental ways. If there are few
13853/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13854/// of the input is significantly cheaper when lowered as an interleaving of
13855/// the two inputs, try to interleave them. Otherwise, blend the low and high
13856/// halves of the inputs separately (making them have relatively few inputs)
13857/// and then concatenate them.
13858 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13859 const APInt &Zeroable, SDValue V1, SDValue V2,
13860 const X86Subtarget &Subtarget,
13861 SelectionDAG &DAG) {
13862 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13863 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13864 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13865
13866 // Whenever we can lower this as a zext, that instruction is strictly faster
13867 // than any alternative.
13868 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13869 Zeroable, Subtarget, DAG))
13870 return ZExt;
13871
13872 // Try to lower using a truncation.
13873 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13874 Subtarget, DAG))
13875 return V;
13876
13877 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13878
13879 if (NumV2Inputs == 0) {
13880 // Try to use shift instructions.
13881 if (SDValue Shift =
13882 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13883 Subtarget, DAG, /*BitwiseOnly*/ false))
13884 return Shift;
13885
13886 // Check for being able to broadcast a single element.
13887 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13888 Mask, Subtarget, DAG))
13889 return Broadcast;
13890
13891 // Try to use bit rotation instructions.
13892 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13893 Subtarget, DAG))
13894 return Rotate;
13895
13896 // Use dedicated unpack instructions for masks that match their pattern.
13897 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13898 return V;
13899
13900 // Use dedicated pack instructions for masks that match their pattern.
13901 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13902 Subtarget))
13903 return V;
13904
13905 // Try to use byte rotation instructions.
13906 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13907 Subtarget, DAG))
13908 return Rotate;
13909
13910 // Make a copy of the mask so it can be modified.
13911 SmallVector<int, 8> MutableMask(Mask);
13912 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13913 Subtarget, DAG);
13914 }
13915
13916 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13917 "All single-input shuffles should be canonicalized to be V1-input "
13918 "shuffles.");
13919
13920 // Try to use shift instructions.
13921 if (SDValue Shift =
13922 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13923 DAG, /*BitwiseOnly*/ false))
13924 return Shift;
13925
13926 // See if we can use SSE4A Extraction / Insertion.
13927 if (Subtarget.hasSSE4A())
13928 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13929 Zeroable, DAG))
13930 return V;
13931
13932 // There are special ways we can lower some single-element blends.
13933 if (NumV2Inputs == 1)
13934 if (SDValue V = lowerShuffleAsElementInsertion(
13935 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13936 return V;
13937
13938 // We have different paths for blend lowering, but they all must use the
13939 // *exact* same predicate.
13940 bool IsBlendSupported = Subtarget.hasSSE41();
13941 if (IsBlendSupported)
13942 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13943 Zeroable, Subtarget, DAG))
13944 return Blend;
13945
13946 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13947 Zeroable, Subtarget, DAG))
13948 return Masked;
13949
13950 // Use dedicated unpack instructions for masks that match their pattern.
13951 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13952 return V;
13953
13954 // Use dedicated pack instructions for masks that match their pattern.
13955 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13956 Subtarget))
13957 return V;
13958
13959 // Try to lower using a truncation.
13960 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13961 Subtarget, DAG))
13962 return V;
13963
13964 // Try to use byte rotation instructions.
13965 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13966 Subtarget, DAG))
13967 return Rotate;
13968
13969 if (SDValue BitBlend =
13970 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13971 return BitBlend;
13972
13973 // Try to use byte shift instructions to mask.
13974 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13975 Zeroable, Subtarget, DAG))
13976 return V;
13977
13978 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
13979 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
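// For example, the compaction mask {0, 2, 4, 6, 8, 10, 12, 14} gives
// NumEvenDrops == 1: clearing the high 16 bits of every dword in V1 and V2 and
// then packing with PACKUSDW produces exactly that selection of words.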
13980 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13981 !Subtarget.hasVLX()) {
13982 // Check if this is part of a 256-bit vector truncation.
13983 unsigned PackOpc = 0;
13984 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13987 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13988 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13989 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13990 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13991 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13992 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13993 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13994 PackOpc = X86ISD::PACKUS;
13995 } else if (Subtarget.hasSSE41()) {
13996 SmallVector<SDValue, 4> DWordClearOps(4,
13997 DAG.getConstant(0, DL, MVT::i32));
13998 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13999 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14000 SDValue DWordClearMask =
14001 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14002 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14003 DWordClearMask);
14004 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14005 DWordClearMask);
14006 PackOpc = X86ISD::PACKUS;
14007 } else if (!Subtarget.hasSSSE3()) {
14008 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14009 V1 = DAG.getBitcast(MVT::v4i32, V1);
14010 V2 = DAG.getBitcast(MVT::v4i32, V2);
14011 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14012 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14013 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14014 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14015 PackOpc = X86ISD::PACKSS;
14016 }
14017 if (PackOpc) {
14018 // Now pack things back together.
14019 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14020 if (NumEvenDrops == 2) {
14021 Result = DAG.getBitcast(MVT::v4i32, Result);
14022 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14023 }
14024 return Result;
14025 }
14026 }
14027
14028 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14029 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14030 if (NumOddDrops == 1) {
14031 bool HasSSE41 = Subtarget.hasSSE41();
14032 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14033 DAG.getBitcast(MVT::v4i32, V1),
14034 DAG.getTargetConstant(16, DL, MVT::i8));
14035 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14036 DAG.getBitcast(MVT::v4i32, V2),
14037 DAG.getTargetConstant(16, DL, MVT::i8));
14038 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14039 MVT::v8i16, V1, V2);
14040 }
14041
14042 // Try to lower by permuting the inputs into an unpack instruction.
14043 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14044 Mask, Subtarget, DAG))
14045 return Unpack;
14046
14047 // If we can't directly blend but can use PSHUFB, that will be better as it
14048 // can both shuffle and set up the inefficient blend.
14049 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14050 bool V1InUse, V2InUse;
14051 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14052 Zeroable, DAG, V1InUse, V2InUse);
14053 }
14054
14055 // We can always bit-blend if we have to so the fallback strategy is to
14056 // decompose into single-input permutes and blends/unpacks.
14057 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14058 Mask, Subtarget, DAG);
14059}
14060
14061/// Lower 8-lane 16-bit floating point shuffles.
14062 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14063 const APInt &Zeroable, SDValue V1, SDValue V2,
14064 const X86Subtarget &Subtarget,
14065 SelectionDAG &DAG) {
14066 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14067 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14068 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14069 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14070
14071 if (Subtarget.hasFP16()) {
14072 if (NumV2Elements == 0) {
14073 // Check for being able to broadcast a single element.
14074 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14075 Mask, Subtarget, DAG))
14076 return Broadcast;
14077 }
14078 if (NumV2Elements == 1 && Mask[0] >= 8)
14079 if (SDValue V = lowerShuffleAsElementInsertion(
14080 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14081 return V;
14082 }
14083
14084 V1 = DAG.getBitcast(MVT::v8i16, V1);
14085 V2 = DAG.getBitcast(MVT::v8i16, V2);
14086 return DAG.getBitcast(MVT::v8f16,
14087 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14088}
14089
14090 // Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14091 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14092// the active subvector is extracted.
14093 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14094 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14095 const X86Subtarget &Subtarget,
14096 SelectionDAG &DAG) {
14097 MVT MaskVT = VT.changeTypeToInteger();
14098 SDValue MaskNode;
14099 MVT ShuffleVT = VT;
14100 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14101 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14102 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14103 ShuffleVT = V1.getSimpleValueType();
14104
14105 // Adjust mask to correct indices for the second input.
14106 int NumElts = VT.getVectorNumElements();
14107 unsigned Scale = 512 / VT.getSizeInBits();
14108 SmallVector<int, 32> AdjustedMask(Mask);
14109 for (int &M : AdjustedMask)
14110 if (NumElts <= M)
14111 M += (Scale - 1) * NumElts;
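// e.g. for a v16i8 shuffle on a non-VLX target, Scale = 512 / 128 = 4, so a
// V2 index M (16 <= M < 32) becomes M + 48; after both operands are widened
// to v64i8, index 64 selects the first element of V2 as VPERMV3 sees it.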
14112 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14113 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14114 } else {
14115 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14116 }
14117
14118 SDValue Result;
14119 if (V2.isUndef())
14120 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14121 else
14122 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14123
14124 if (VT != ShuffleVT)
14125 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14126
14127 return Result;
14128}
14129
14130/// Generic lowering of v16i8 shuffles.
14131///
14132/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14133/// detect any complexity reducing interleaving. If that doesn't help, it uses
14134/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14135/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14136/// back together.
14137 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14138 const APInt &Zeroable, SDValue V1, SDValue V2,
14139 const X86Subtarget &Subtarget,
14140 SelectionDAG &DAG) {
14141 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14142 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14143 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14144
14145 // Try to use shift instructions.
14146 if (SDValue Shift =
14147 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14148 DAG, /*BitwiseOnly*/ false))
14149 return Shift;
14150
14151 // Try to use byte rotation instructions.
14152 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14153 Subtarget, DAG))
14154 return Rotate;
14155
14156 // Use dedicated pack instructions for masks that match their pattern.
14157 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14158 Subtarget))
14159 return V;
14160
14161 // Try to use a zext lowering.
14162 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14163 Zeroable, Subtarget, DAG))
14164 return ZExt;
14165
14166 // Try to lower using a truncation.
14167 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14168 Subtarget, DAG))
14169 return V;
14170
14171 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14172 Subtarget, DAG))
14173 return V;
14174
14175 // See if we can use SSE4A Extraction / Insertion.
14176 if (Subtarget.hasSSE4A())
14177 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14178 Zeroable, DAG))
14179 return V;
14180
14181 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14182
14183 // For single-input shuffles, there are some nicer lowering tricks we can use.
14184 if (NumV2Elements == 0) {
14185 // Check for being able to broadcast a single element.
14186 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14187 Mask, Subtarget, DAG))
14188 return Broadcast;
14189
14190 // Try to use bit rotation instructions.
14191 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14192 Subtarget, DAG))
14193 return Rotate;
14194
14195 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14196 return V;
14197
14198 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14199 // Notably, this handles splat and partial-splat shuffles more efficiently.
14200 // However, it only makes sense if the pre-duplication shuffle simplifies
14201 // things significantly. Currently, this means we need to be able to
14202 // express the pre-duplication shuffle as an i16 shuffle.
14203 //
14204 // FIXME: We should check for other patterns which can be widened into an
14205 // i16 shuffle as well.
14206 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14207 for (int i = 0; i < 16; i += 2)
14208 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14209 return false;
14210
14211 return true;
14212 };
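// A mask qualifies when the two bytes feeding each i16 lane agree, e.g. a
// splat of byte 5 (every mask entry 5): the pre-dup i16 shuffle keeps word 2
// in place, the UNPCKL then duplicates each low byte, and the post-dup word
// splat of word 5 replicates {b5, b5} across the whole vector.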
14213 auto tryToWidenViaDuplication = [&]() -> SDValue {
14214 if (!canWidenViaDuplication(Mask))
14215 return SDValue();
14216 SmallVector<int, 4> LoInputs;
14217 copy_if(Mask, std::back_inserter(LoInputs),
14218 [](int M) { return M >= 0 && M < 8; });
14219 array_pod_sort(LoInputs.begin(), LoInputs.end());
14220 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14221 LoInputs.end());
14222 SmallVector<int, 4> HiInputs;
14223 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14224 array_pod_sort(HiInputs.begin(), HiInputs.end());
14225 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14226 HiInputs.end());
14227
14228 bool TargetLo = LoInputs.size() >= HiInputs.size();
14229 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14230 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14231
14232 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14233 SmallVector<int, 16> LaneMap(16, -1);
14234 for (int I : InPlaceInputs) {
14235 PreDupI16Shuffle[I/2] = I/2;
14236 LaneMap[I] = I;
14237 }
14238 int j = TargetLo ? 0 : 4, je = j + 4;
14239 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14240 // Check if j is already a shuffle of this input. This happens when
14241 // there are two adjacent bytes after we move the low one.
14242 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14243 // If we haven't yet mapped the input, search for a slot into which
14244 // we can map it.
14245 while (j < je && PreDupI16Shuffle[j] >= 0)
14246 ++j;
14247
14248 if (j == je)
14249 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14250 return SDValue();
14251
14252 // Map this input with the i16 shuffle.
14253 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14254 }
14255
14256 // Update the lane map based on the mapping we ended up with.
14257 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14258 }
14259 V1 = DAG.getBitcast(
14260 MVT::v16i8,
14261 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14262 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14263
14264 // Unpack the bytes to form the i16s that will be shuffled into place.
14265 bool EvenInUse = false, OddInUse = false;
14266 for (int i = 0; i < 16; i += 2) {
14267 EvenInUse |= (Mask[i + 0] >= 0);
14268 OddInUse |= (Mask[i + 1] >= 0);
14269 if (EvenInUse && OddInUse)
14270 break;
14271 }
14272 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14273 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14274 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14275
14276 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14277 for (int i = 0; i < 16; ++i)
14278 if (Mask[i] >= 0) {
14279 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14280 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14281 if (PostDupI16Shuffle[i / 2] < 0)
14282 PostDupI16Shuffle[i / 2] = MappedMask;
14283 else
14284 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14285 "Conflicting entries in the original shuffle!");
14286 }
14287 return DAG.getBitcast(
14288 MVT::v16i8,
14289 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14290 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14291 };
14292 if (SDValue V = tryToWidenViaDuplication())
14293 return V;
14294 }
14295
14296 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14297 Zeroable, Subtarget, DAG))
14298 return Masked;
14299
14300 // Use dedicated unpack instructions for masks that match their pattern.
14301 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14302 return V;
14303
14304 // Try to use byte shift instructions to mask.
14305 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14306 Zeroable, Subtarget, DAG))
14307 return V;
14308
14309 // Check for compaction patterns.
14310 bool IsSingleInput = V2.isUndef();
14311 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14312
14313 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14314 // with PSHUFB. It is important to do this before we attempt to generate any
14315 // blends but after all of the single-input lowerings. If the single input
14316 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14317 // want to preserve that and we can DAG combine any longer sequences into
14318 // a PSHUFB in the end. But once we start blending from multiple inputs,
14319 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14320 // and there are *very* few patterns that would actually be faster than the
14321 // PSHUFB approach because of its ability to zero lanes.
14322 //
14323 // If the mask is a binary compaction, we can more efficiently perform this
14324 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14325 //
14326 // FIXME: The only exceptions to the above are blends which are exact
14327 // interleavings with direct instructions supporting them. We currently don't
14328 // handle those well here.
14329 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14330 bool V1InUse = false;
14331 bool V2InUse = false;
14332
14333 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14334 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14335
14336 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14337 // do so. This avoids using them to handle blends-with-zero which is
14338 // important as a single pshufb is significantly faster for that.
14339 if (V1InUse && V2InUse) {
14340 if (Subtarget.hasSSE41())
14341 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14342 Zeroable, Subtarget, DAG))
14343 return Blend;
14344
14345 // We can use an unpack to do the blending rather than an or in some
14346 // cases. Even though the or may be (very slightly) more efficient, we
14347 // prefer this lowering because there are common cases where part of
14348 // the complexity of the shuffles goes away when we do the final blend as
14349 // an unpack.
14350 // FIXME: It might be worth trying to detect if the unpack-feeding
14351 // shuffles will both be pshufb, in which case we shouldn't bother with
14352 // this.
14353 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14354 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14355 return Unpack;
14356
14357 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14358 if (Subtarget.hasVBMI())
14359 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14360 DAG);
14361
14362 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14363 if (Subtarget.hasXOP()) {
14364 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14365 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14366 }
14367
14368 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14369 // PALIGNR will be cheaper than the second PSHUFB+OR.
14370 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14371 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14372 return V;
14373 }
14374
14375 return PSHUFB;
14376 }
14377
14378 // There are special ways we can lower some single-element blends.
14379 if (NumV2Elements == 1)
14380 if (SDValue V = lowerShuffleAsElementInsertion(
14381 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14382 return V;
14383
14384 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14385 return Blend;
14386
14387 // Check whether a compaction lowering can be done. This handles shuffles
14388 // which take every Nth element for some even N. See the helper function for
14389 // details.
14390 //
14391 // We special case these as they can be particularly efficiently handled with
14392 // the PACKUSWB instruction on x86 and they show up in common patterns of
14393 // rearranging bytes to truncate wide elements.
14394 if (NumEvenDrops) {
14395 // NumEvenDrops is the power of two stride of the elements. Another way of
14396 // thinking about it is that we need to drop the even elements this many
14397 // times to get the original input.
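// For example, with NumEvenDrops == 1 every word below is masked with 0x00FF
// (keeping bytes 0,2,4,...) and a single PACKUS compacts the result; with
// NumEvenDrops == 2 only words 0,2,4,6 keep their low bytes (bytes 0,4,8,12)
// and a second PACKUS round finishes the compaction.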
14398
14399 // First we need to zero all the dropped bytes.
14400 assert(NumEvenDrops <= 3 &&
14401 "No support for dropping even elements more than 3 times.");
14402 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14403 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14404 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14405 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14406 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14407 WordClearMask);
14408 if (!IsSingleInput)
14409 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14410 WordClearMask);
14411
14412 // Now pack things back together.
14413 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14414 IsSingleInput ? V1 : V2);
14415 for (int i = 1; i < NumEvenDrops; ++i) {
14416 Result = DAG.getBitcast(MVT::v8i16, Result);
14417 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14418 }
14419 return Result;
14420 }
14421
14422 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14423 if (NumOddDrops == 1) {
14424 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14425 DAG.getBitcast(MVT::v8i16, V1),
14426 DAG.getTargetConstant(8, DL, MVT::i8));
14427 if (!IsSingleInput)
14428 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14429 DAG.getBitcast(MVT::v8i16, V2),
14430 DAG.getTargetConstant(8, DL, MVT::i8));
14431 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14432 IsSingleInput ? V1 : V2);
14433 }
14434
14435 // Handle multi-input cases by blending/unpacking single-input shuffles.
14436 if (NumV2Elements > 0)
14437 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14438 Subtarget, DAG);
14439
14440 // The fallback path for single-input shuffles widens this into two v8i16
14441 // vectors with unpacks, shuffles those, and then pulls them back together
14442 // with a pack.
14443 SDValue V = V1;
14444
14445 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14446 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14447 for (int i = 0; i < 16; ++i)
14448 if (Mask[i] >= 0)
14449 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14450
14451 SDValue VLoHalf, VHiHalf;
14452 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14453 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14454 // i16s.
14455 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14456 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14457 // Use a mask to drop the high bytes.
14458 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14459 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14460 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14461
14462 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14463 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14464
14465 // Squash the masks to point directly into VLoHalf.
14466 for (int &M : LoBlendMask)
14467 if (M >= 0)
14468 M /= 2;
14469 for (int &M : HiBlendMask)
14470 if (M >= 0)
14471 M /= 2;
14472 } else {
14473 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14474 // VHiHalf so that we can blend them as i16s.
14475 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14476
14477 VLoHalf = DAG.getBitcast(
14478 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14479 VHiHalf = DAG.getBitcast(
14480 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14481 }
14482
14483 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14484 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14485
14486 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14487}
14488
14489/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14490///
14491/// This routine breaks down the specific type of 128-bit shuffle and
14492/// dispatches to the lowering routines accordingly.
14493 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14494 MVT VT, SDValue V1, SDValue V2,
14495 const APInt &Zeroable,
14496 const X86Subtarget &Subtarget,
14497 SelectionDAG &DAG) {
14498 if (VT == MVT::v8bf16) {
14499 V1 = DAG.getBitcast(MVT::v8i16, V1);
14500 V2 = DAG.getBitcast(MVT::v8i16, V2);
14501 return DAG.getBitcast(VT,
14502 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14503 }
14504
14505 switch (VT.SimpleTy) {
14506 case MVT::v2i64:
14507 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14508 case MVT::v2f64:
14509 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14510 case MVT::v4i32:
14511 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14512 case MVT::v4f32:
14513 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14514 case MVT::v8i16:
14515 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14516 case MVT::v8f16:
14517 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14518 case MVT::v16i8:
14519 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14520
14521 default:
14522 llvm_unreachable("Unimplemented!");
14523 }
14524}
14525
14526/// Generic routine to split vector shuffle into half-sized shuffles.
14527///
14528/// This routine just extracts two subvectors, shuffles them independently, and
14529/// then concatenates them back together. This should work effectively with all
14530/// AVX vector shuffle types.
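/// For example, a v8i32 shuffle <0,8,1,9,6,14,7,15> becomes a v4i32 shuffle of
/// the low halves of V1 and V2 concatenated with a v4i32 shuffle of their high
/// halves.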
14531 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14532 SDValue V2, ArrayRef<int> Mask,
14533 SelectionDAG &DAG, bool SimpleOnly) {
14534 assert(VT.getSizeInBits() >= 256 &&
14535 "Only for 256-bit or wider vector shuffles!");
14536 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14537 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14538
14539 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14540 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14541
14542 int NumElements = VT.getVectorNumElements();
14543 int SplitNumElements = NumElements / 2;
14544 MVT ScalarVT = VT.getVectorElementType();
14545 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14546
14547 // Use splitVector/extractSubVector so that split build-vectors just build two
14548 // narrower build vectors. This helps shuffling with splats and zeros.
14549 auto SplitVector = [&](SDValue V) {
14550 SDValue LoV, HiV;
14551 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14552 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14553 DAG.getBitcast(SplitVT, HiV));
14554 };
14555
14556 SDValue LoV1, HiV1, LoV2, HiV2;
14557 std::tie(LoV1, HiV1) = SplitVector(V1);
14558 std::tie(LoV2, HiV2) = SplitVector(V2);
14559
14560 // Now create two 4-way blends of these half-width vectors.
14561 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14562 bool &UseHiV1, bool &UseLoV2,
14563 bool &UseHiV2) {
14564 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14565 for (int i = 0; i < SplitNumElements; ++i) {
14566 int M = HalfMask[i];
14567 if (M >= NumElements) {
14568 if (M >= NumElements + SplitNumElements)
14569 UseHiV2 = true;
14570 else
14571 UseLoV2 = true;
14572 } else if (M >= 0) {
14573 if (M >= SplitNumElements)
14574 UseHiV1 = true;
14575 else
14576 UseLoV1 = true;
14577 }
14578 }
14579 };
14580
14581 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14582 if (!SimpleOnly)
14583 return true;
14584
14585 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14586 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14587
14588 return !(UseHiV1 || UseHiV2);
14589 };
14590
14591 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14592 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14593 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14594 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14595 for (int i = 0; i < SplitNumElements; ++i) {
14596 int M = HalfMask[i];
14597 if (M >= NumElements) {
14598 V2BlendMask[i] = M - NumElements;
14599 BlendMask[i] = SplitNumElements + i;
14600 } else if (M >= 0) {
14601 V1BlendMask[i] = M;
14602 BlendMask[i] = i;
14603 }
14604 }
14605
14606 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14607 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14608
14609 // Because the lowering happens after all combining takes place, we need to
14610 // manually combine these blend masks as much as possible so that we create
14611 // a minimal number of high-level vector shuffle nodes.
14612 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14613
14614 // First try just blending the halves of V1 or V2.
14615 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14616 return DAG.getUNDEF(SplitVT);
14617 if (!UseLoV2 && !UseHiV2)
14618 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14619 if (!UseLoV1 && !UseHiV1)
14620 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14621
14622 SDValue V1Blend, V2Blend;
14623 if (UseLoV1 && UseHiV1) {
14624 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14625 } else {
14626 // We only use half of V1 so map the usage down into the final blend mask.
14627 V1Blend = UseLoV1 ? LoV1 : HiV1;
14628 for (int i = 0; i < SplitNumElements; ++i)
14629 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14630 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14631 }
14632 if (UseLoV2 && UseHiV2) {
14633 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14634 } else {
14635 // We only use half of V2 so map the usage down into the final blend mask.
14636 V2Blend = UseLoV2 ? LoV2 : HiV2;
14637 for (int i = 0; i < SplitNumElements; ++i)
14638 if (BlendMask[i] >= SplitNumElements)
14639 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14640 }
14641 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14642 };
14643
14644 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14645 return SDValue();
14646
14647 SDValue Lo = HalfBlend(LoMask);
14648 SDValue Hi = HalfBlend(HiMask);
14649 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14650}
14651
14652/// Either split a vector in halves or decompose the shuffles and the
14653/// blend/unpack.
14654///
14655/// This is provided as a good fallback for many lowerings of non-single-input
14656/// shuffles with more than one 128-bit lane. In those cases, we want to select
14657/// between splitting the shuffle into 128-bit components and stitching those
14658/// back together vs. extracting the single-input shuffles and blending those
14659/// results.
14660 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14661 SDValue V2, ArrayRef<int> Mask,
14662 const X86Subtarget &Subtarget,
14663 SelectionDAG &DAG) {
14664 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14665 "shuffles as it could then recurse on itself.");
14666 int Size = Mask.size();
14667
14668 // If this can be modeled as a broadcast of two elements followed by a blend,
14669 // prefer that lowering. This is especially important because broadcasts can
14670 // often fold with memory operands.
14671 auto DoBothBroadcast = [&] {
14672 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14673 for (int M : Mask)
14674 if (M >= Size) {
14675 if (V2BroadcastIdx < 0)
14676 V2BroadcastIdx = M - Size;
14677 else if (M - Size != V2BroadcastIdx)
14678 return false;
14679 } else if (M >= 0) {
14680 if (V1BroadcastIdx < 0)
14681 V1BroadcastIdx = M;
14682 else if (M != V1BroadcastIdx)
14683 return false;
14684 }
14685 return true;
14686 };
14687 if (DoBothBroadcast())
14688 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14689 DAG);
14690
14691 // If the inputs all stem from a single 128-bit lane of each input, then we
14692 // split them rather than blending because the split will decompose to
14693 // unusually few instructions.
14694 int LaneCount = VT.getSizeInBits() / 128;
14695 int LaneSize = Size / LaneCount;
14696 SmallBitVector LaneInputs[2];
14697 LaneInputs[0].resize(LaneCount, false);
14698 LaneInputs[1].resize(LaneCount, false);
14699 for (int i = 0; i < Size; ++i)
14700 if (Mask[i] >= 0)
14701 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14702 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14703 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14704 /*SimpleOnly*/ false);
14705
14706 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14707 // requires that the decomposed single-input shuffles don't end up here.
14708 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14709 DAG);
14710}
14711
14712// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14713// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14714 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14715 SDValue V1, SDValue V2,
14716 ArrayRef<int> Mask,
14717 SelectionDAG &DAG) {
14718 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14719
14720 int LHSMask[4] = {-1, -1, -1, -1};
14721 int RHSMask[4] = {-1, -1, -1, -1};
14722 unsigned SHUFPMask = 0;
14723
14724 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14725 // perform the shuffle once the lanes have been shuffled in place.
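// For example, a v4f64 mask <3, 7, 0, 4> gives LHSMask <u, 3, 0, u>,
// RHSMask <u, 7, 4, u> and SHUFPMask 0b0011.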
14726 for (int i = 0; i != 4; ++i) {
14727 int M = Mask[i];
14728 if (M < 0)
14729 continue;
14730 int LaneBase = i & ~1;
14731 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14732 LaneMask[LaneBase + (M & 1)] = M;
14733 SHUFPMask |= (M & 1) << i;
14734 }
14735
14736 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14737 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14738 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14739 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14740}
14741
14742/// Lower a vector shuffle crossing multiple 128-bit lanes as
14743/// a lane permutation followed by a per-lane permutation.
14744///
14745/// This is mainly for cases where we can have non-repeating permutes
14746/// in each lane.
14747///
14748/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14749/// we should investigate merging them.
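/// For example, a v8i32 reverse <7,6,5,4,3,2,1,0> becomes the cross-lane
/// permute <4,5,6,7,0,1,2,3> followed by the in-lane permute
/// <3,2,1,0,7,6,5,4>.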
14750 static SDValue lowerShuffleAsLanePermuteAndPermute(
14751 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14752 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14753 int NumElts = VT.getVectorNumElements();
14754 int NumLanes = VT.getSizeInBits() / 128;
14755 int NumEltsPerLane = NumElts / NumLanes;
14756 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14757
14758 /// Attempts to find a sublane permute with the given size
14759 /// that gets all elements into their target lanes.
14760 ///
14761 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14762 /// If unsuccessful, returns false and may overwrite InLaneMask.
14763 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14764 int NumSublanesPerLane = NumSublanes / NumLanes;
14765 int NumEltsPerSublane = NumElts / NumSublanes;
14766
14767 SmallVector<int, 16> CrossLaneMask;
14768 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14769 // CrossLaneMask but one entry == one sublane.
14770 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14771
14772 for (int i = 0; i != NumElts; ++i) {
14773 int M = Mask[i];
14774 if (M < 0)
14775 continue;
14776
14777 int SrcSublane = M / NumEltsPerSublane;
14778 int DstLane = i / NumEltsPerLane;
14779
14780 // We only need to get the elements into the right lane, not sublane.
14781 // So search all sublanes that make up the destination lane.
14782 bool Found = false;
14783 int DstSubStart = DstLane * NumSublanesPerLane;
14784 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14785 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14786 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14787 continue;
14788
14789 Found = true;
14790 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14791 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14792 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14793 break;
14794 }
14795 if (!Found)
14796 return SDValue();
14797 }
14798
14799 // Fill CrossLaneMask using CrossLaneMaskLarge.
14800 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14801
14802 if (!CanUseSublanes) {
14803 // If we're only shuffling a single lowest lane and the rest are identity
14804 // then don't bother.
14805 // TODO - isShuffleMaskInputInPlace could be extended to something like
14806 // this.
14807 int NumIdentityLanes = 0;
14808 bool OnlyShuffleLowestLane = true;
14809 for (int i = 0; i != NumLanes; ++i) {
14810 int LaneOffset = i * NumEltsPerLane;
14811 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14812 i * NumEltsPerLane))
14813 NumIdentityLanes++;
14814 else if (CrossLaneMask[LaneOffset] != 0)
14815 OnlyShuffleLowestLane = false;
14816 }
14817 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14818 return SDValue();
14819 }
14820
14821 // Avoid returning the same shuffle operation. For example,
14822 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14823 // undef:v16i16
14824 if (CrossLaneMask == Mask || InLaneMask == Mask)
14825 return SDValue();
14826
14827 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14828 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14829 InLaneMask);
14830 };
14831
14832 // First attempt a solution with full lanes.
14833 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14834 return V;
14835
14836 // The rest of the solutions use sublanes.
14837 if (!CanUseSublanes)
14838 return SDValue();
14839
14840 // Then attempt a solution with 64-bit sublanes (vpermq).
14841 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14842 return V;
14843
14844 // If that doesn't work and we have fast variable cross-lane shuffle,
14845 // attempt 32-bit sublanes (vpermd).
14846 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14847 return SDValue();
14848
14849 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14850}
14851
14852 /// Helper to compute the inlane shuffle mask for a complete shuffle mask.
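/// For example, a v4f64 mask <2, 1, 3, 0> with LaneSize 2 becomes
/// <4, 1, 3, 6>: the lane-crossing elements are redirected to the equivalent
/// position in the (lane-swapped) second shuffle operand.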
14853static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14854 SmallVector<int> &InLaneMask) {
14855 int Size = Mask.size();
14856 InLaneMask.assign(Mask.begin(), Mask.end());
14857 for (int i = 0; i < Size; ++i) {
14858 int &M = InLaneMask[i];
14859 if (M < 0)
14860 continue;
14861 if (((M % Size) / LaneSize) != (i / LaneSize))
14862 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14863 }
14864}
14865
14866/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14867/// source with a lane permutation.
14868///
14869/// This lowering strategy results in four instructions in the worst case for a
14870/// single-input cross lane shuffle which is lower than any other fully general
14871/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14872/// shuffle pattern should be handled prior to trying this lowering.
14873 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14874 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14875 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14876 // FIXME: This should probably be generalized for 512-bit vectors as well.
14877 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14878 int Size = Mask.size();
14879 int LaneSize = Size / 2;
14880
14881 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14882 // Only do this if the elements aren't all from the lower lane,
14883 // otherwise we're (probably) better off doing a split.
14884 if (VT == MVT::v4f64 &&
14885 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14886 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14887
14888 // If there are only inputs from one 128-bit lane, splitting will in fact be
14889 // less expensive. The flags track whether the given lane contains an element
14890 // that crosses to another lane.
14891 bool AllLanes;
14892 if (!Subtarget.hasAVX2()) {
14893 bool LaneCrossing[2] = {false, false};
14894 for (int i = 0; i < Size; ++i)
14895 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14896 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14897 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14898 } else {
14899 bool LaneUsed[2] = {false, false};
14900 for (int i = 0; i < Size; ++i)
14901 if (Mask[i] >= 0)
14902 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14903 AllLanes = LaneUsed[0] && LaneUsed[1];
14904 }
14905
14906 // TODO - we could support shuffling V2 in the Flipped input.
14907 assert(V2.isUndef() &&
14908 "This last part of this routine only works on single input shuffles");
14909
14910 SmallVector<int> InLaneMask;
14911 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14912
14913 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14914 "In-lane shuffle mask expected");
14915
14916 // If we're not using both lanes of the input and the inlane mask is not
14917 // repeating, then we're better off splitting.
14918 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14919 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14920 /*SimpleOnly*/ false);
14921
14922 // Flip the lanes, and shuffle the results which should now be in-lane.
14923 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14924 SDValue Flipped = DAG.getBitcast(PVT, V1);
14925 Flipped =
14926 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14927 Flipped = DAG.getBitcast(VT, Flipped);
14928 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14929}
14930
14931/// Handle lowering 2-lane 128-bit shuffles.
14932 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14933 SDValue V2, ArrayRef<int> Mask,
14934 const APInt &Zeroable,
14935 const X86Subtarget &Subtarget,
14936 SelectionDAG &DAG) {
14937 if (V2.isUndef()) {
14938 // Attempt to match VBROADCAST*128 subvector broadcast load.
14939 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14940 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14941 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14942 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14943 MVT MemVT = VT.getHalfNumVectorElementsVT();
14944 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14945 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14946 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14947 VT, MemVT, Ld, Ofs, DAG))
14948 return BcstLd;
14949 }
14950
14951 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14952 if (Subtarget.hasAVX2())
14953 return SDValue();
14954 }
14955
14956 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14957
14958 SmallVector<int, 4> WidenedMask;
14959 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14960 return SDValue();
14961
14962 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14963 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14964
14965 // Try to use an insert into a zero vector.
14966 if (WidenedMask[0] == 0 && IsHighZero) {
14967 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14968 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14969 DAG.getIntPtrConstant(0, DL));
14970 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14971 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14972 DAG.getIntPtrConstant(0, DL));
14973 }
14974
14975 // TODO: If minimizing size and one of the inputs is a zero vector and the
14976 // the zero vector has only one use, we could use a VPERM2X128 to save the
14977 // instruction bytes needed to explicitly generate the zero vector.
14978
14979 // Blends are faster and handle all the non-lane-crossing cases.
14980 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14981 Subtarget, DAG))
14982 return Blend;
14983
14984 // If either input operand is a zero vector, use VPERM2X128 because its mask
14985 // allows us to replace the zero input with an implicit zero.
14986 if (!IsLowZero && !IsHighZero) {
14987 // Check for patterns which can be matched with a single insert of a 128-bit
14988 // subvector.
14989 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14990 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14991
14992 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14993 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14994 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14995 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14996 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14997 OnlyUsesV1 ? V1 : V2,
14998 DAG.getIntPtrConstant(0, DL));
14999 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15000 DAG.getIntPtrConstant(2, DL));
15001 }
15002 }
15003
15004 // Try to use SHUF128 if possible.
15005 if (Subtarget.hasVLX()) {
15006 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15007 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15008 ((WidenedMask[1] % 2) << 1);
15009 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15010 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15011 }
15012 }
15013 }
15014
15015 // Otherwise form a 128-bit permutation. After accounting for undefs,
15016 // convert the 64-bit shuffle mask selection values into 128-bit
15017 // selection bits by dividing the indexes by 2 and shifting into positions
15018 // defined by a vperm2*128 instruction's immediate control byte.
15019
15020 // The immediate permute control byte looks like this:
15021 // [1:0] - select 128 bits from sources for low half of destination
15022 // [2] - ignore
15023 // [3] - zero low half of destination
15024 // [5:4] - select 128 bits from sources for high half of destination
15025 // [6] - ignore
15026 // [7] - zero high half of destination
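// For example, a v4f64 mask <0, 1, 6, 7> widens to <0, 3> and gives
// PermMask 0x30: low half from V1's low lane, high half from V2's high lane.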
15027
15028 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15029 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15030
15031 unsigned PermMask = 0;
15032 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15033 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15034
15035 // Check the immediate mask and replace unused sources with undef.
15036 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15037 V1 = DAG.getUNDEF(VT);
15038 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15039 V2 = DAG.getUNDEF(VT);
15040
15041 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15042 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15043}
15044
15045/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15046/// shuffling each lane.
15047///
15048/// This attempts to create a repeated lane shuffle where each lane uses one
15049/// or two of the lanes of the inputs. The lanes of the input vectors are
15050/// shuffled in one or two independent shuffles to get the lanes into the
15051/// position needed by the final shuffle.
15052 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15053 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15054 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15055 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15056
15057 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15058 return SDValue();
15059
15060 int NumElts = Mask.size();
15061 int NumLanes = VT.getSizeInBits() / 128;
15062 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15063 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15064 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15065
15066 // First pass will try to fill in the RepeatMask from lanes that need two
15067 // sources.
15068 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15069 int Srcs[2] = {-1, -1};
15070 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15071 for (int i = 0; i != NumLaneElts; ++i) {
15072 int M = Mask[(Lane * NumLaneElts) + i];
15073 if (M < 0)
15074 continue;
15075 // Determine which of the possible input lanes (NumLanes from each source)
15076 // this element comes from. Assign that as one of the sources for this
15077 // lane. We can assign up to 2 sources for this lane. If we run out
15078 // of sources we can't do anything.
15079 int LaneSrc = M / NumLaneElts;
15080 int Src;
15081 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15082 Src = 0;
15083 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15084 Src = 1;
15085 else
15086 return SDValue();
15087
15088 Srcs[Src] = LaneSrc;
15089 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15090 }
15091
15092 // If this lane has two sources, see if it fits with the repeat mask so far.
15093 if (Srcs[1] < 0)
15094 continue;
15095
15096 LaneSrcs[Lane][0] = Srcs[0];
15097 LaneSrcs[Lane][1] = Srcs[1];
15098
15099 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15100 assert(M1.size() == M2.size() && "Unexpected mask size");
15101 for (int i = 0, e = M1.size(); i != e; ++i)
15102 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15103 return false;
15104 return true;
15105 };
15106
15107 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15108 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15109 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15110 int M = Mask[i];
15111 if (M < 0)
15112 continue;
15113 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15114 "Unexpected mask element");
15115 MergedMask[i] = M;
15116 }
15117 };
15118
15119 if (MatchMasks(InLaneMask, RepeatMask)) {
15120 // Merge this lane mask into the final repeat mask.
15121 MergeMasks(InLaneMask, RepeatMask);
15122 continue;
15123 }
15124
15125 // Didn't find a match. Swap the operands and try again.
15126 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15127 ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15128
15129 if (MatchMasks(InLaneMask, RepeatMask)) {
15130 // Merge this lane mask into the final repeat mask.
15131 MergeMasks(InLaneMask, RepeatMask);
15132 continue;
15133 }
15134
15135 // Couldn't find a match with the operands in either order.
15136 return SDValue();
15137 }
15138
15139 // Now handle any lanes with only one source.
15140 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15141 // If this lane has already been processed, skip it.
15142 if (LaneSrcs[Lane][0] >= 0)
15143 continue;
15144
15145 for (int i = 0; i != NumLaneElts; ++i) {
15146 int M = Mask[(Lane * NumLaneElts) + i];
15147 if (M < 0)
15148 continue;
15149
15150 // If RepeatMask isn't defined yet we can define it ourselves.
15151 if (RepeatMask[i] < 0)
15152 RepeatMask[i] = M % NumLaneElts;
15153
15154 if (RepeatMask[i] < NumElts) {
15155 if (RepeatMask[i] != M % NumLaneElts)
15156 return SDValue();
15157 LaneSrcs[Lane][0] = M / NumLaneElts;
15158 } else {
15159 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15160 return SDValue();
15161 LaneSrcs[Lane][1] = M / NumLaneElts;
15162 }
15163 }
15164
15165 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15166 return SDValue();
15167 }
15168
15169 SmallVector<int, 16> NewMask(NumElts, -1);
15170 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15171 int Src = LaneSrcs[Lane][0];
15172 for (int i = 0; i != NumLaneElts; ++i) {
15173 int M = -1;
15174 if (Src >= 0)
15175 M = Src * NumLaneElts + i;
15176 NewMask[Lane * NumLaneElts + i] = M;
15177 }
15178 }
15179 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15180 // Ensure we didn't get back the shuffle we started with.
15181 // FIXME: This is a hack to make up for some splat handling code in
15182 // getVectorShuffle.
15183 if (isa<ShuffleVectorSDNode>(NewV1) &&
15184 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15185 return SDValue();
15186
15187 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15188 int Src = LaneSrcs[Lane][1];
15189 for (int i = 0; i != NumLaneElts; ++i) {
15190 int M = -1;
15191 if (Src >= 0)
15192 M = Src * NumLaneElts + i;
15193 NewMask[Lane * NumLaneElts + i] = M;
15194 }
15195 }
15196 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15197 // Ensure we didn't get back the shuffle we started with.
15198 // FIXME: This is a hack to make up for some splat handling code in
15199 // getVectorShuffle.
15200 if (isa<ShuffleVectorSDNode>(NewV2) &&
15201 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15202 return SDValue();
15203
15204 for (int i = 0; i != NumElts; ++i) {
15205 if (Mask[i] < 0) {
15206 NewMask[i] = -1;
15207 continue;
15208 }
15209 NewMask[i] = RepeatMask[i % NumLaneElts];
15210 if (NewMask[i] < 0)
15211 continue;
15212
15213 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15214 }
15215 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15216}
15217
15218/// If the input shuffle mask results in a vector that is undefined in all upper
15219/// or lower half elements and that mask accesses only 2 halves of the
15220/// shuffle's operands, return true. A mask of half the width with mask indexes
15221/// adjusted to access the extracted halves of the original shuffle operands is
15222/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15223/// lower half of each input operand is accessed.
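/// For example, the mask <u, u, u, u, 0, 1, 8, 9> gives HalfMask <0, 1, 4, 5>
/// with HalfIdx1 = 0 (lower half of V1) and HalfIdx2 = 2 (lower half of V2).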
15224static bool
15225 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15226 int &HalfIdx1, int &HalfIdx2) {
15227 assert((Mask.size() == HalfMask.size() * 2) &&
15228 "Expected input mask to be twice as long as output");
15229
15230 // Exactly one half of the result must be undef to allow narrowing.
15231 bool UndefLower = isUndefLowerHalf(Mask);
15232 bool UndefUpper = isUndefUpperHalf(Mask);
15233 if (UndefLower == UndefUpper)
15234 return false;
15235
15236 unsigned HalfNumElts = HalfMask.size();
15237 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15238 HalfIdx1 = -1;
15239 HalfIdx2 = -1;
15240 for (unsigned i = 0; i != HalfNumElts; ++i) {
15241 int M = Mask[i + MaskIndexOffset];
15242 if (M < 0) {
15243 HalfMask[i] = M;
15244 continue;
15245 }
15246
15247 // Determine which of the 4 half vectors this element is from.
15248 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15249 int HalfIdx = M / HalfNumElts;
15250
15251 // Determine the element index into its half vector source.
15252 int HalfElt = M % HalfNumElts;
15253
15254 // We can shuffle with up to 2 half vectors, set the new 'half'
15255 // shuffle mask accordingly.
15256 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15257 HalfMask[i] = HalfElt;
15258 HalfIdx1 = HalfIdx;
15259 continue;
15260 }
15261 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15262 HalfMask[i] = HalfElt + HalfNumElts;
15263 HalfIdx2 = HalfIdx;
15264 continue;
15265 }
15266
15267 // Too many half vectors referenced.
15268 return false;
15269 }
15270
15271 return true;
15272}
15273
15274/// Given the output values from getHalfShuffleMask(), create a half width
15275/// shuffle of extracted vectors followed by an insert back to full width.
15276 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15277 ArrayRef<int> HalfMask, int HalfIdx1,
15278 int HalfIdx2, bool UndefLower,
15279 SelectionDAG &DAG, bool UseConcat = false) {
15280 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15281 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15282
15283 MVT VT = V1.getSimpleValueType();
15284 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15285 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15286
15287 auto getHalfVector = [&](int HalfIdx) {
15288 if (HalfIdx < 0)
15289 return DAG.getUNDEF(HalfVT);
15290 SDValue V = (HalfIdx < 2 ? V1 : V2);
15291 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15292 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15293 DAG.getIntPtrConstant(HalfIdx, DL));
15294 };
15295
15296 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15297 SDValue Half1 = getHalfVector(HalfIdx1);
15298 SDValue Half2 = getHalfVector(HalfIdx2);
15299 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15300 if (UseConcat) {
15301 SDValue Op0 = V;
15302 SDValue Op1 = DAG.getUNDEF(HalfVT);
15303 if (UndefLower)
15304 std::swap(Op0, Op1);
15305 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15306 }
15307
15308 unsigned Offset = UndefLower ? HalfNumElts : 0;
15309 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15311}
15312
15313/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15314/// This allows for fast cases such as subvector extraction/insertion
15315/// or shuffling smaller vector types which can lower more efficiently.
15316 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15317 SDValue V2, ArrayRef<int> Mask,
15318 const X86Subtarget &Subtarget,
15319 SelectionDAG &DAG) {
15320 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15321 "Expected 256-bit or 512-bit vector");
15322
15323 bool UndefLower = isUndefLowerHalf(Mask);
15324 if (!UndefLower && !isUndefUpperHalf(Mask))
15325 return SDValue();
15326
15327 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15328 "Completely undef shuffle mask should have been simplified already");
15329
15330 // Upper half is undef and lower half is whole upper subvector.
15331 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15332 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15333 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15334 if (!UndefLower &&
15335 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15336 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15337 DAG.getIntPtrConstant(HalfNumElts, DL));
15338 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15339 DAG.getIntPtrConstant(0, DL));
15340 }
15341
15342 // Lower half is undef and upper half is whole lower subvector.
15343 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15344 if (UndefLower &&
15345 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15346 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15347 DAG.getIntPtrConstant(0, DL));
15348 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15349 DAG.getIntPtrConstant(HalfNumElts, DL));
15350 }
15351
15352 int HalfIdx1, HalfIdx2;
15353 SmallVector<int, 8> HalfMask(HalfNumElts);
15354 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15355 return SDValue();
15356
15357 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15358
15359 // Only shuffle the halves of the inputs when useful.
15360 unsigned NumLowerHalves =
15361 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15362 unsigned NumUpperHalves =
15363 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15364 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15365
15366 // Determine the larger pattern of undef/halves, then decide if it's worth
15367 // splitting the shuffle based on subtarget capabilities and types.
15368 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15369 if (!UndefLower) {
15370 // XXXXuuuu: no insert is needed.
15371 // Always extract lowers when setting lower - these are all free subreg ops.
15372 if (NumUpperHalves == 0)
15373 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15374 UndefLower, DAG);
15375
15376 if (NumUpperHalves == 1) {
15377 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15378 if (Subtarget.hasAVX2()) {
15379 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15380 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15381 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15382 (!isSingleSHUFPSMask(HalfMask) ||
15383 Subtarget.hasFastVariableCrossLaneShuffle()))
15384 return SDValue();
15385 // If this is a unary shuffle (assume that the 2nd operand is
15386 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15387 // are better off extracting the upper half of 1 operand and using a
15388 // narrow shuffle.
15389 if (EltWidth == 64 && V2.isUndef())
15390 return SDValue();
15391 }
15392 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15393 if (Subtarget.hasAVX512() && VT.is512BitVector())
15394 return SDValue();
15395 // Extract + narrow shuffle is better than the wide alternative.
15396 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15397 UndefLower, DAG);
15398 }
15399
15400 // Don't extract both uppers, instead shuffle and then extract.
15401 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15402 return SDValue();
15403 }
15404
15405 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15406 if (NumUpperHalves == 0) {
15407 // AVX2 has efficient 64-bit element cross-lane shuffles.
15408 // TODO: Refine to account for unary shuffle, splat, and other masks?
15409 if (Subtarget.hasAVX2() && EltWidth == 64)
15410 return SDValue();
15411 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15412 if (Subtarget.hasAVX512() && VT.is512BitVector())
15413 return SDValue();
15414 // Narrow shuffle + insert is better than the wide alternative.
15415 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15416 UndefLower, DAG);
15417 }
15418
15419 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15420 return SDValue();
15421}
15422
15423/// Handle case where shuffle sources are coming from the same 128-bit lane and
15424/// every lane can be represented as the same repeating mask - allowing us to
15425/// shuffle the sources with the repeating shuffle and then permute the result
15426/// to the destination lanes.
15427 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15428 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15429 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15430 int NumElts = VT.getVectorNumElements();
15431 int NumLanes = VT.getSizeInBits() / 128;
15432 int NumLaneElts = NumElts / NumLanes;
15433
15434 // On AVX2 we may be able to just shuffle the lowest elements and then
15435 // broadcast the result.
15436 if (Subtarget.hasAVX2()) {
15437 for (unsigned BroadcastSize : {16, 32, 64}) {
15438 if (BroadcastSize <= VT.getScalarSizeInBits())
15439 continue;
15440 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15441
15442 // Attempt to match a repeating pattern every NumBroadcastElts,
15443 // accounting for UNDEFs but only references the lowest 128-bit
15444 // lane of the inputs.
15445 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15446 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15447 for (int j = 0; j != NumBroadcastElts; ++j) {
15448 int M = Mask[i + j];
15449 if (M < 0)
15450 continue;
15451 int &R = RepeatMask[j];
15452 if (0 != ((M % NumElts) / NumLaneElts))
15453 return false;
15454 if (0 <= R && R != M)
15455 return false;
15456 R = M;
15457 }
15458 return true;
15459 };
15460
15461 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15462 if (!FindRepeatingBroadcastMask(RepeatMask))
15463 continue;
15464
15465 // Shuffle the (lowest) repeated elements in place for broadcast.
15466 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15467
15468 // Shuffle the actual broadcast.
15469 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15470 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15471 for (int j = 0; j != NumBroadcastElts; ++j)
15472 BroadcastMask[i + j] = j;
15473
15474 // Avoid returning the same shuffle operation. For example,
15475 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15476 if (BroadcastMask == Mask)
15477 return SDValue();
15478
15479 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15480 BroadcastMask);
15481 }
15482 }
15483
15484 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15485 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15486 return SDValue();
15487
15488 // Bail if we already have a repeated lane shuffle mask.
15489 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15490 return SDValue();
15491
15492 // Helper to look for repeated mask in each split sublane, and that those
15493 // sublanes can then be permuted into place.
15494 auto ShuffleSubLanes = [&](int SubLaneScale) {
15495 int NumSubLanes = NumLanes * SubLaneScale;
15496 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15497
15498 // Check that all the sources are coming from the same lane and see if we
15499 // can form a repeating shuffle mask (local to each sub-lane). At the same
15500 // time, determine the source sub-lane for each destination sub-lane.
15501 int TopSrcSubLane = -1;
15502 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15503 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15504 SubLaneScale,
15505 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15506
15507 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15508 // Extract the sub-lane mask, check that it all comes from the same lane
15509 // and normalize the mask entries to come from the first lane.
15510 int SrcLane = -1;
15511 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15512 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15513 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15514 if (M < 0)
15515 continue;
15516 int Lane = (M % NumElts) / NumLaneElts;
15517 if ((0 <= SrcLane) && (SrcLane != Lane))
15518 return SDValue();
15519 SrcLane = Lane;
15520 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15521 SubLaneMask[Elt] = LocalM;
15522 }
15523
15524 // Whole sub-lane is UNDEF.
15525 if (SrcLane < 0)
15526 continue;
15527
15528 // Attempt to match against the candidate repeated sub-lane masks.
15529 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15530 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15531 for (int i = 0; i != NumSubLaneElts; ++i) {
15532 if (M1[i] < 0 || M2[i] < 0)
15533 continue;
15534 if (M1[i] != M2[i])
15535 return false;
15536 }
15537 return true;
15538 };
15539
15540 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15541 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15542 continue;
15543
15544 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15545 for (int i = 0; i != NumSubLaneElts; ++i) {
15546 int M = SubLaneMask[i];
15547 if (M < 0)
15548 continue;
15549 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15550 "Unexpected mask element");
15551 RepeatedSubLaneMask[i] = M;
15552 }
15553
15554 // Track the top most source sub-lane - by setting the remaining to
15555 // UNDEF we can greatly simplify shuffle matching.
15556 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15557 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15558 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15559 break;
15560 }
15561
15562 // Bail if we failed to find a matching repeated sub-lane mask.
15563 if (Dst2SrcSubLanes[DstSubLane] < 0)
15564 return SDValue();
15565 }
15566 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15567 "Unexpected source lane");
15568
15569 // Create a repeating shuffle mask for the entire vector.
15570 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15571 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15572 int Lane = SubLane / SubLaneScale;
15573 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15574 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15575 int M = RepeatedSubLaneMask[Elt];
15576 if (M < 0)
15577 continue;
15578 int Idx = (SubLane * NumSubLaneElts) + Elt;
15579 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15580 }
15581 }
15582
15583 // Shuffle each source sub-lane to its destination.
15584 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15585 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15586 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15587 if (SrcSubLane < 0)
15588 continue;
15589 for (int j = 0; j != NumSubLaneElts; ++j)
15590 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15591 }
15592
15593 // Avoid returning the same shuffle operation.
15594 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15595 if (RepeatedMask == Mask || SubLaneMask == Mask)
15596 return SDValue();
15597
15598 SDValue RepeatedShuffle =
15599 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15600
15601 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15602 SubLaneMask);
15603 };
15604
15605 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15606 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15607 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15608 // Otherwise we can only permute whole 128-bit lanes.
15609 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15610 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15611 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15612 MinSubLaneScale = 2;
15613 MaxSubLaneScale =
15614 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15615 }
15616 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15617 MinSubLaneScale = MaxSubLaneScale = 4;
15618
15619 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15620 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15621 return Shuffle;
15622
15623 return SDValue();
15624}
15625
15626 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15627 bool &ForceV1Zero, bool &ForceV2Zero,
15628 unsigned &ShuffleImm, ArrayRef<int> Mask,
15629 const APInt &Zeroable) {
15630 int NumElts = VT.getVectorNumElements();
15631 assert(VT.getScalarSizeInBits() == 64 &&
15632 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15633 "Unexpected data type for VSHUFPD");
15634 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15635 "Illegal shuffle mask");
15636
15637 bool ZeroLane[2] = { true, true };
15638 for (int i = 0; i < NumElts; ++i)
15639 ZeroLane[i & 1] &= Zeroable[i];
15640
15641 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15642 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
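// For example, the v4f64 mask <1, 5, 2, 7> matches directly and yields
// ShuffleImm 0b1011.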
15643 ShuffleImm = 0;
15644 bool ShufpdMask = true;
15645 bool CommutableMask = true;
15646 for (int i = 0; i < NumElts; ++i) {
15647 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15648 continue;
15649 if (Mask[i] < 0)
15650 return false;
15651 int Val = (i & 6) + NumElts * (i & 1);
15652 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15653 if (Mask[i] < Val || Mask[i] > Val + 1)
15654 ShufpdMask = false;
15655 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15656 CommutableMask = false;
15657 ShuffleImm |= (Mask[i] % 2) << i;
15658 }
15659
15660 if (!ShufpdMask && !CommutableMask)
15661 return false;
15662
15663 if (!ShufpdMask && CommutableMask)
15664 std::swap(V1, V2);
15665
15666 ForceV1Zero = ZeroLane[0];
15667 ForceV2Zero = ZeroLane[1];
15668 return true;
15669}
15670
15671 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15672 SDValue V2, ArrayRef<int> Mask,
15673 const APInt &Zeroable,
15674 const X86Subtarget &Subtarget,
15675 SelectionDAG &DAG) {
15676 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15677 "Unexpected data type for VSHUFPD");
15678
15679 unsigned Immediate = 0;
15680 bool ForceV1Zero = false, ForceV2Zero = false;
15681 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15682 Mask, Zeroable))
15683 return SDValue();
15684
15685 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15686 if (ForceV1Zero)
15687 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15688 if (ForceV2Zero)
15689 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15690
15691 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15692 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15693}
15694
15695// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
15696// by zeroable elements in the remaining 24 elements. Turn this into two
15697// vmovqb instructions shuffled together.
15698 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15699 SDValue V1, SDValue V2,
15700 ArrayRef<int> Mask,
15701 const APInt &Zeroable,
15702 SelectionDAG &DAG) {
15703 assert(VT == MVT::v32i8 && "Unexpected type!");
15704
15705 // The first 8 indices should be every 8th element.
15706 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15707 return SDValue();
15708
15709 // Remaining elements need to be zeroable.
15710 if (Zeroable.countl_one() < (Mask.size() - 8))
15711 return SDValue();
15712
15713 V1 = DAG.getBitcast(MVT::v4i64, V1);
15714 V2 = DAG.getBitcast(MVT::v4i64, V2);
15715
15716 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15717 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15718
15719 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15720 // the upper bits of the result using an unpckldq.
15721 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15722 { 0, 1, 2, 3, 16, 17, 18, 19,
15723 4, 5, 6, 7, 20, 21, 22, 23 });
15724 // Insert the unpckldq into a zero vector to widen to v32i8.
15725 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15726 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15727 DAG.getIntPtrConstant(0, DL));
15728}
15729
15730// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15731// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15732// =>
15733// ul = unpckl v1, v2
15734// uh = unpckh v1, v2
15735// a = vperm ul, uh
15736// b = vperm ul, uh
15737//
15738// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15739// and permute. We cannot directly match v3 because it is split into two
15740// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15741// pair of 256-bit shuffles and makes sure the masks are consecutive.
15742//
15743// Once unpck and permute nodes are created, the permute corresponding to this
15744// shuffle is returned, while the other permute replaces the other half of the
15745// shuffle in the selection dag.
15746 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15747 SDValue V1, SDValue V2,
15748 ArrayRef<int> Mask,
15749 SelectionDAG &DAG) {
15750 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15751 VT != MVT::v32i8)
15752 return SDValue();
15753 // <B0, B1, B0+1, B1+1, ..., >
15754 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15755 unsigned Begin1) {
15756 size_t Size = Mask.size();
15757 assert(Size % 2 == 0 && "Expected even mask size");
15758 for (unsigned I = 0; I < Size; I += 2) {
15759 if (Mask[I] != (int)(Begin0 + I / 2) ||
15760 Mask[I + 1] != (int)(Begin1 + I / 2))
15761 return false;
15762 }
15763 return true;
15764 };
15765 // Check which half of the interleave this shuffle node is.
15766 int NumElts = VT.getVectorNumElements();
15767 size_t FirstQtr = NumElts / 2;
15768 size_t ThirdQtr = NumElts + NumElts / 2;
15769 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15770 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15771 if (!IsFirstHalf && !IsSecondHalf)
15772 return SDValue();
15773
15774 // Find the intersection between shuffle users of V1 and V2.
15775 SmallVector<SDNode *, 2> Shuffles;
15776 for (SDNode *User : V1->uses())
15777 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15778 User->getOperand(1) == V2)
15779 Shuffles.push_back(User);
15780 // Limit user size to two for now.
15781 if (Shuffles.size() != 2)
15782 return SDValue();
15783 // Find out which half of the 512-bit shuffles is each smaller shuffle
15784 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15785 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15786 SDNode *FirstHalf;
15787 SDNode *SecondHalf;
15788 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15789 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15790 FirstHalf = Shuffles[0];
15791 SecondHalf = Shuffles[1];
15792 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15793 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15794 FirstHalf = Shuffles[1];
15795 SecondHalf = Shuffles[0];
15796 } else {
15797 return SDValue();
15798 }
15799 // Lower into unpck and perm. Return the perm of this shuffle and replace
15800 // the other.
15801 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15802 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15803 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15804 DAG.getTargetConstant(0x20, DL, MVT::i8));
15805 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15806 DAG.getTargetConstant(0x31, DL, MVT::i8));
15807 if (IsFirstHalf) {
15808 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15809 return Perm1;
15810 }
15811 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15812 return Perm2;
15813}
15814
15815/// Handle lowering of 4-lane 64-bit floating point shuffles.
15816///
15817/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15818/// isn't available.
15819static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15820 const APInt &Zeroable, SDValue V1, SDValue V2,
15821 const X86Subtarget &Subtarget,
15822 SelectionDAG &DAG) {
15823 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15824 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15825 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15826
15827 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15828 Subtarget, DAG))
15829 return V;
15830
15831 if (V2.isUndef()) {
15832 // Check for being able to broadcast a single element.
15833 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15834 Mask, Subtarget, DAG))
15835 return Broadcast;
15836
15837 // Use low duplicate instructions for masks that match their pattern.
15838 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15839 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15840
15841 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15842 // Non-half-crossing single input shuffles can be lowered with an
15843 // interleaved permutation.
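 // For example (illustrative), Mask = <1, 0, 3, 2> swaps the elements within
 // each 128-bit lane and should encode as VPERMILPD immediate 0b0101 (0x5).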
15844 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15845 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15846 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15847 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15848 }
15849
15850 // With AVX2 we have direct support for this permutation.
15851 if (Subtarget.hasAVX2())
15852 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15853 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15854
15855 // Try to create an in-lane repeating shuffle mask and then shuffle the
15856 // results into the target lanes.
15857 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15858 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15859 return V;
15860
15861 // Try to permute the lanes and then use a per-lane permute.
15862 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15863 Mask, DAG, Subtarget))
15864 return V;
15865
15866 // Otherwise, fall back.
15867 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15868 DAG, Subtarget);
15869 }
15870
15871 // Use dedicated unpack instructions for masks that match their pattern.
15872 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15873 return V;
15874
15875 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15876 Zeroable, Subtarget, DAG))
15877 return Blend;
15878
15879 // Check if the blend happens to exactly fit that of SHUFPD.
15880 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15881 Zeroable, Subtarget, DAG))
15882 return Op;
15883
15884 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15885 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15886
15887 // If we have lane crossing shuffles AND they don't all come from the lower
15888 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15889 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15890 // canonicalizes to a blend of splats, which isn't necessary for this combine.
15891 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15892 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15893 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15894 (V2.getOpcode() != ISD::BUILD_VECTOR))
15895 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15896
15897 // If we have one input in place, then we can permute the other input and
15898 // blend the result.
15899 if (V1IsInPlace || V2IsInPlace)
15900 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15901 Subtarget, DAG);
15902
15903 // Try to create an in-lane repeating shuffle mask and then shuffle the
15904 // results into the target lanes.
15905 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15906 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15907 return V;
15908
15909 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15910 // shuffle. However, if we have AVX2 and either input is already in place, we
15911 // will be able to shuffle the other input even across lanes in a single
15912 // instruction, so skip this pattern.
15913 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15914 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15915 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15916 return V;
15917
15918 // If we have VLX support, we can use VEXPAND.
15919 if (Subtarget.hasVLX())
15920 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15921 DAG, Subtarget))
15922 return V;
15923
15924 // If we have AVX2 then we always want to lower with a blend because at v4 we
15925 // can fully permute the elements.
15926 if (Subtarget.hasAVX2())
15927 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15928 Subtarget, DAG);
15929
15930 // Otherwise fall back on generic lowering.
15931 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15932 Subtarget, DAG);
15933}
15934
15935/// Handle lowering of 4-lane 64-bit integer shuffles.
15936///
15937/// This routine is only called when we have AVX2 and thus a reasonable
15938/// instruction set for v4i64 shuffling.
15939static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15940 const APInt &Zeroable, SDValue V1, SDValue V2,
15941 const X86Subtarget &Subtarget,
15942 SelectionDAG &DAG) {
15943 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15944 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15945 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15946 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15947
15948 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15949 Subtarget, DAG))
15950 return V;
15951
15952 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15953 Zeroable, Subtarget, DAG))
15954 return Blend;
15955
15956 // Check for being able to broadcast a single element.
15957 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15958 Subtarget, DAG))
15959 return Broadcast;
15960
15961 // Try to use shift instructions if fast.
15962 if (Subtarget.preferLowerShuffleAsShift())
15963 if (SDValue Shift =
15964 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15965 Subtarget, DAG, /*BitwiseOnly*/ true))
15966 return Shift;
15967
15968 if (V2.isUndef()) {
15969 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15970 // can use lower latency instructions that will operate on both lanes.
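 // For example (illustrative), the v4i64 mask <1, 0, 3, 2> repeats <1, 0> in
 // each lane and should become a v8i32 PSHUFD with mask <2, 3, 0, 1>
 // (immediate 0x4E).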
15971 SmallVector<int, 2> RepeatedMask;
15972 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15973 SmallVector<int, 4> PSHUFDMask;
15974 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15975 return DAG.getBitcast(
15976 MVT::v4i64,
15977 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15978 DAG.getBitcast(MVT::v8i32, V1),
15979 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15980 }
15981
15982 // AVX2 provides a direct instruction for permuting a single input across
15983 // lanes.
15984 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15985 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15986 }
15987
15988 // Try to use shift instructions.
15989 if (SDValue Shift =
15990 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15991 DAG, /*BitwiseOnly*/ false))
15992 return Shift;
15993
15994 // If we have VLX support, we can use VALIGN or VEXPAND.
15995 if (Subtarget.hasVLX()) {
15996 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15997 Zeroable, Subtarget, DAG))
15998 return Rotate;
15999
16000 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
16001 DAG, Subtarget))
16002 return V;
16003 }
16004
16005 // Try to use PALIGNR.
16006 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16007 Subtarget, DAG))
16008 return Rotate;
16009
16010 // Use dedicated unpack instructions for masks that match their pattern.
16011 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
16012 return V;
16013
16014 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16015 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16016
16017 // If we have one input in place, then we can permute the other input and
16018 // blend the result.
16019 if (V1IsInPlace || V2IsInPlace)
16020 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16021 Subtarget, DAG);
16022
16023 // Try to create an in-lane repeating shuffle mask and then shuffle the
16024 // results into the target lanes.
16025 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16026 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16027 return V;
16028
16029 // Try to lower to PERMQ(BLENDD(V1,V2)).
16030 if (SDValue V =
16031 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16032 return V;
16033
16034 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16035 // shuffle. However, if we have AVX2 and either input is already in place, we
16036 // will be able to shuffle the other input even across lanes in a single
16037 // instruction, so skip this pattern.
16038 if (!V1IsInPlace && !V2IsInPlace)
16039 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16040 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16041 return Result;
16042
16043 // Otherwise fall back on generic blend lowering.
16044 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16045 Subtarget, DAG);
16046}
16047
16048/// Handle lowering of 8-lane 32-bit floating point shuffles.
16049///
16050/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16051/// isn't available.
16052static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16053 const APInt &Zeroable, SDValue V1, SDValue V2,
16054 const X86Subtarget &Subtarget,
16055 SelectionDAG &DAG) {
16056 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16057 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16058 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16059
16060 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16061 Zeroable, Subtarget, DAG))
16062 return Blend;
16063
16064 // Check for being able to broadcast a single element.
16065 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16066 Subtarget, DAG))
16067 return Broadcast;
16068
16069 if (!Subtarget.hasAVX2()) {
16070 SmallVector<int> InLaneMask;
16071 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16072
16073 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16074 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16075 /*SimpleOnly*/ true))
16076 return R;
16077 }
16078 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16079 Zeroable, Subtarget, DAG))
16080 return DAG.getBitcast(MVT::v8f32, ZExt);
16081
16082 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16083 // options to efficiently lower the shuffle.
16084 SmallVector<int, 4> RepeatedMask;
16085 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16086 assert(RepeatedMask.size() == 4 &&
16087 "Repeated masks must be half the mask width!");
16088
16089 // Use even/odd duplicate instructions for masks that match their pattern.
16090 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16091 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16092 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16093 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16094
16095 if (V2.isUndef())
16096 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16097 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16098
16099 // Use dedicated unpack instructions for masks that match their pattern.
16100 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16101 return V;
16102
16103 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16104 // have already handled any direct blends.
16105 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16106 }
16107
16108 // Try to create an in-lane repeating shuffle mask and then shuffle the
16109 // results into the target lanes.
16110 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16111 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16112 return V;
16113
16114 // If we have a single-input shuffle with different shuffle patterns in the
16115 // two 128-bit lanes, use a variable shuffle mask with VPERMILPS.
16116 if (V2.isUndef()) {
16117 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16118 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16119 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16120 }
16121 if (Subtarget.hasAVX2()) {
16122 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16123 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16124 }
16125 // Otherwise, fall back.
16126 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16127 DAG, Subtarget);
16128 }
16129
16130 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16131 // shuffle.
16132 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16133 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16134 return Result;
16135
16136 // If we have VLX support, we can use VEXPAND.
16137 if (Subtarget.hasVLX())
16138 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16139 DAG, Subtarget))
16140 return V;
16141
16142 // Try to match an interleave of two v8f32s and lower them as unpck and
16143 // permutes using ymms. This needs to go before we try to split the vectors.
16144 //
16145 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16146 // this path inadvertently.
16147 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16148 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16149 Mask, DAG))
16150 return V;
16151
16152 // For non-AVX512, if the mask is made of in-lane 16-bit elements then try to
16153 // split, since after the split we get more efficient code using vpunpcklwd
16154 // and vpunpckhwd instructions than with vblend.
16155 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16156 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16157 DAG);
16158
16159 // If we have AVX2 then we always want to lower with a blend because at v8 we
16160 // can fully permute the elements.
16161 if (Subtarget.hasAVX2())
16162 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16163 Subtarget, DAG);
16164
16165 // Otherwise fall back on generic lowering.
16166 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16167 Subtarget, DAG);
16168}
16169
16170/// Handle lowering of 8-lane 32-bit integer shuffles.
16171///
16172/// This routine is only called when we have AVX2 and thus a reasonable
16173/// instruction set for v8i32 shuffling.
16174static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16175 const APInt &Zeroable, SDValue V1, SDValue V2,
16176 const X86Subtarget &Subtarget,
16177 SelectionDAG &DAG) {
16178 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16179 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16180 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16181 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16182
16183 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16184
16185 // Whenever we can lower this as a zext, that instruction is strictly faster
16186 // than any alternative. It also allows us to fold memory operands into the
16187 // shuffle in many cases.
16188 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16189 Zeroable, Subtarget, DAG))
16190 return ZExt;
16191
16192 // Try to match an interleave of two v8i32s and lower them as unpck and
16193 // permutes using ymms. This needs to go before we try to split the vectors.
16194 if (!Subtarget.hasAVX512())
16195 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16196 Mask, DAG))
16197 return V;
16198
16199 // For non-AVX512, if the mask is made of in-lane 16-bit elements then try to
16200 // split, since after the split we get more efficient code with vpunpcklwd
16201 // and vpunpckhwd instructions than with vblend.
16202 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16203 !Subtarget.hasAVX512())
16204 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16205 DAG);
16206
16207 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG))
16209 return Blend;
16210
16211 // Check for being able to broadcast a single element.
16212 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16213 Subtarget, DAG))
16214 return Broadcast;
16215
16216 // Try to use shift instructions if fast.
16217 if (Subtarget.preferLowerShuffleAsShift()) {
16218 if (SDValue Shift =
16219 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16220 Subtarget, DAG, /*BitwiseOnly*/ true))
16221 return Shift;
16222 if (NumV2Elements == 0)
16223 if (SDValue Rotate =
16224 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16225 return Rotate;
16226 }
16227
16228 // If the shuffle mask is repeated in each 128-bit lane we can use more
16229 // efficient instructions that mirror the shuffles across the two 128-bit
16230 // lanes.
16231 SmallVector<int, 4> RepeatedMask;
16232 bool Is128BitLaneRepeatedShuffle =
16233 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16234 if (Is128BitLaneRepeatedShuffle) {
16235 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16236 if (V2.isUndef())
16237 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16238 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16239
16240 // Use dedicated unpack instructions for masks that match their pattern.
16241 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16242 return V;
16243 }
16244
16245 // Try to use shift instructions.
16246 if (SDValue Shift =
16247 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16248 DAG, /*BitwiseOnly*/ false))
16249 return Shift;
16250
16251 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16252 if (SDValue Rotate =
16253 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16254 return Rotate;
16255
16256 // If we have VLX support, we can use VALIGN or EXPAND.
16257 if (Subtarget.hasVLX()) {
16258 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16259 Zeroable, Subtarget, DAG))
16260 return Rotate;
16261
16262 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16263 DAG, Subtarget))
16264 return V;
16265 }
16266
16267 // Try to use byte rotation instructions.
16268 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16269 Subtarget, DAG))
16270 return Rotate;
16271
16272 // Try to create an in-lane repeating shuffle mask and then shuffle the
16273 // results into the target lanes.
16274 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16275 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16276 return V;
16277
16278 if (V2.isUndef()) {
16279 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16280 // because that should be faster than the variable permute alternatives.
16281 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16282 return V;
16283
16284 // If the shuffle patterns aren't repeated but it's a single input, directly
16285 // generate a cross-lane VPERMD instruction.
16286 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16287 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16288 }
16289
16290 // Assume that a single SHUFPS is faster than an alternative sequence of
16291 // multiple instructions (even if the CPU has a domain penalty).
16292 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16293 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16294 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16295 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16296 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16297 CastV1, CastV2, DAG);
16298 return DAG.getBitcast(MVT::v8i32, ShufPS);
16299 }
16300
16301 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16302 // shuffle.
16303 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16304 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16305 return Result;
16306
16307 // Otherwise fall back on generic blend lowering.
16308 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16309 Subtarget, DAG);
16310}
16311
16312/// Handle lowering of 16-lane 16-bit integer shuffles.
16313///
16314/// This routine is only called when we have AVX2 and thus a reasonable
16315/// instruction set for v16i16 shuffling.
16316static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16317 const APInt &Zeroable, SDValue V1, SDValue V2,
16318 const X86Subtarget &Subtarget,
16319 SelectionDAG &DAG) {
16320 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16321 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16322 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16323 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16324
16325 // Whenever we can lower this as a zext, that instruction is strictly faster
16326 // than any alternative. It also allows us to fold memory operands into the
16327 // shuffle in many cases.
16328 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16329 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16330 return ZExt;
16331
16332 // Check for being able to broadcast a single element.
16333 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16334 Subtarget, DAG))
16335 return Broadcast;
16336
16337 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16338 Zeroable, Subtarget, DAG))
16339 return Blend;
16340
16341 // Use dedicated unpack instructions for masks that match their pattern.
16342 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16343 return V;
16344
16345 // Use dedicated pack instructions for masks that match their pattern.
16346 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16347 Subtarget))
16348 return V;
16349
16350 // Try to lower using a truncation.
16351 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16352 Subtarget, DAG))
16353 return V;
16354
16355 // Try to use shift instructions.
16356 if (SDValue Shift =
16357 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16358 Subtarget, DAG, /*BitwiseOnly*/ false))
16359 return Shift;
16360
16361 // Try to use byte rotation instructions.
16362 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16363 Subtarget, DAG))
16364 return Rotate;
16365
16366 // Try to create an in-lane repeating shuffle mask and then shuffle the
16367 // results into the target lanes.
16368 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16369 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16370 return V;
16371
16372 if (V2.isUndef()) {
16373 // Try to use bit rotation instructions.
16374 if (SDValue Rotate =
16375 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16376 return Rotate;
16377
16378 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16379 // because that should be faster than the variable permute alternatives.
16380 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16381 return V;
16382
16383 // There are no generalized cross-lane shuffle operations available on i16
16384 // element types.
16385 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16386 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16387 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16388 return V;
16389
16390 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16391 DAG, Subtarget);
16392 }
16393
16394 SmallVector<int, 8> RepeatedMask;
16395 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16396 // As this is a single-input shuffle, the repeated mask should be
16397 // a strictly valid v8i16 mask that we can pass through to the v8i16
16398 // lowering to handle even the v16 case.
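 // For example (illustrative), the v16i16 mask
 // <0, 2, 1, 3, 4, 5, 6, 7, 8, 10, 9, 11, 12, 13, 14, 15> repeats
 // <0, 2, 1, 3, 4, 5, 6, 7> in both lanes, so the v8i16 lowering can be
 // applied per 128-bit lane.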
16399 return lowerV8I16GeneralSingleInputShuffle(
16400 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16401 }
16402 }
16403
16404 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16405 Zeroable, Subtarget, DAG))
16406 return PSHUFB;
16407
16408 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16409 if (Subtarget.hasBWI())
16410 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16411
16412 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16413 // shuffle.
16414 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16415 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16416 return Result;
16417
16418 // Try to permute the lanes and then use a per-lane permute.
16419 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16420 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16421 return V;
16422
16423 // Try to match an interleave of two v16i16s and lower them as unpck and
16424 // permutes using ymms.
16425 if (!Subtarget.hasAVX512())
16426 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16427 Mask, DAG))
16428 return V;
16429
16430 // Otherwise fall back on generic lowering.
16431 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16432 Subtarget, DAG);
16433}
16434
16435/// Handle lowering of 32-lane 8-bit integer shuffles.
16436///
16437/// This routine is only called when we have AVX2 and thus a reasonable
16438/// instruction set for v32i8 shuffling.
16439static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16440 const APInt &Zeroable, SDValue V1, SDValue V2,
16441 const X86Subtarget &Subtarget,
16442 SelectionDAG &DAG) {
16443 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16444 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16445 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16446 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16447
16448 // Whenever we can lower this as a zext, that instruction is strictly faster
16449 // than any alternative. It also allows us to fold memory operands into the
16450 // shuffle in many cases.
16451 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16452 Zeroable, Subtarget, DAG))
16453 return ZExt;
16454
16455 // Check for being able to broadcast a single element.
16456 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16457 Subtarget, DAG))
16458 return Broadcast;
16459
16460 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16461 Zeroable, Subtarget, DAG))
16462 return Blend;
16463
16464 // Use dedicated unpack instructions for masks that match their pattern.
16465 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16466 return V;
16467
16468 // Use dedicated pack instructions for masks that match their pattern.
16469 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16470 Subtarget))
16471 return V;
16472
16473 // Try to lower using a truncation.
16474 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16475 Subtarget, DAG))
16476 return V;
16477
16478 // Try to use shift instructions.
16479 if (SDValue Shift =
16480 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16481 DAG, /*BitwiseOnly*/ false))
16482 return Shift;
16483
16484 // Try to use byte rotation instructions.
16485 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16486 Subtarget, DAG))
16487 return Rotate;
16488
16489 // Try to use bit rotation instructions.
16490 if (V2.isUndef())
16491 if (SDValue Rotate =
16492 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16493 return Rotate;
16494
16495 // Try to create an in-lane repeating shuffle mask and then shuffle the
16496 // results into the target lanes.
16497 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16498 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16499 return V;
16500
16501 // There are no generalized cross-lane shuffle operations available on i8
16502 // element types.
16503 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16504 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16505 // because that should be faster than the variable permute alternatives.
16506 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16507 return V;
16508
16509 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16510 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16511 return V;
16512
16513 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16514 DAG, Subtarget);
16515 }
16516
16517 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16518 Zeroable, Subtarget, DAG))
16519 return PSHUFB;
16520
16521 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16522 if (Subtarget.hasVBMI())
16523 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16524
16525 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16526 // shuffle.
16527 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16528 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16529 return Result;
16530
16531 // Try to permute the lanes and then use a per-lane permute.
16532 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16533 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16534 return V;
16535
16536 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16537 // by zeroable elements in the remaining 24 elements. Turn this into two
16538 // vmovqb instructions shuffled together.
16539 if (Subtarget.hasVLX())
16540 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16541 Mask, Zeroable, DAG))
16542 return V;
16543
16544 // Try to match an interleave of two v32i8s and lower them as unpck and
16545 // permutes using ymms.
16546 if (!Subtarget.hasAVX512())
16547 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16548 Mask, DAG))
16549 return V;
16550
16551 // Otherwise fall back on generic lowering.
16552 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16553 Subtarget, DAG);
16554}
16555
16556/// High-level routine to lower various 256-bit x86 vector shuffles.
16557///
16558/// This routine either breaks down the specific type of a 256-bit x86 vector
16559/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16560/// together based on the available instructions.
16561static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16562 SDValue V1, SDValue V2, const APInt &Zeroable,
16563 const X86Subtarget &Subtarget,
16564 SelectionDAG &DAG) {
16565 // If we have a single input to the zero element, insert that into V1 if we
16566 // can do so cheaply.
16567 int NumElts = VT.getVectorNumElements();
16568 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16569
16570 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16571 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16572 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16573 return Insertion;
16574
16575 // Handle special cases where the lower or upper half is UNDEF.
16576 if (SDValue V =
16577 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16578 return V;
16579
16580 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16581 // can check for those subtargets here and avoid much of the subtarget
16582 // querying in the per-vector-type lowering routines. With AVX1 we have
16583 // essentially *zero* ability to manipulate a 256-bit vector with integer
16584 // types. Since we'll use floating point types there eventually, just
16585 // immediately cast everything to a float and operate entirely in that domain.
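 // For example (illustrative), on AVX1 a v4i64 shuffle is re-lowered here as
 // an equivalent v4f64 shuffle via bitcasts.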
16586 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16587 int ElementBits = VT.getScalarSizeInBits();
16588 if (ElementBits < 32) {
16589 // No floating point type available, if we can't use the bit operations
16590 // for masking/blending then decompose into 128-bit vectors.
16591 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16592 Subtarget, DAG))
16593 return V;
16594 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16595 return V;
16596 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16597 }
16598
16599 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16600 VT.getVectorNumElements());
16601 V1 = DAG.getBitcast(FpVT, V1);
16602 V2 = DAG.getBitcast(FpVT, V2);
16603 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16604 }
16605
16606 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16607 V1 = DAG.getBitcast(MVT::v16i16, V1);
16608 V2 = DAG.getBitcast(MVT::v16i16, V2);
16609 return DAG.getBitcast(VT,
16610 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16611 }
16612
16613 switch (VT.SimpleTy) {
16614 case MVT::v4f64:
16615 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16616 case MVT::v4i64:
16617 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16618 case MVT::v8f32:
16619 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16620 case MVT::v8i32:
16621 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16622 case MVT::v16i16:
16623 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16624 case MVT::v32i8:
16625 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16626
16627 default:
16628 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16629 }
16630}
16631
16632/// Try to lower a vector shuffle as a 128-bit shuffles.
16633static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16634 const APInt &Zeroable, SDValue V1, SDValue V2,
16635 const X86Subtarget &Subtarget,
16636 SelectionDAG &DAG) {
16637 assert(VT.getScalarSizeInBits() == 64 &&
16638 "Unexpected element type size for 128bit shuffle.");
16639
16640 // Handling a 256-bit vector requires VLX, and in that case
16641 // lowerV2X128Shuffle() is most probably the better solution.
16642 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16643
16644 // TODO - use Zeroable like we do for lowerV2X128Shuffle?
16645 SmallVector<int, 4> Widened128Mask;
16646 if (!canWidenShuffleElements(Mask, Widened128Mask))
16647 return SDValue();
16648 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16649
16650 // Try to use an insert into a zero vector.
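 // For example (illustrative), one case this catches is a result that is just
 // the low 256 bits of V1 with the upper 256 bits zeroable: extract that
 // subvector and insert it into a zero vector.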
16651 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16652 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16653 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16654 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16655 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16656 DAG.getIntPtrConstant(0, DL));
16657 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16658 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16659 DAG.getIntPtrConstant(0, DL));
16660 }
16661
16662 // Check for patterns which can be matched with a single insert of a 256-bit
16663 // subvector.
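 // For example (illustrative), the mask <0, 1, 2, 3, 0, 1, 2, 3> duplicates
 // the low 256 bits of V1 into the upper half with one subvector insert.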
16664 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16665 if (OnlyUsesV1 ||
16666 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16667 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16668 SDValue SubVec =
16669 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16670 DAG.getIntPtrConstant(0, DL));
16671 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16672 DAG.getIntPtrConstant(4, DL));
16673 }
16674
16675 // See if this is an insertion of the lower 128-bits of V2 into V1.
16676 bool IsInsert = true;
16677 int V2Index = -1;
16678 for (int i = 0; i < 4; ++i) {
16679 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16680 if (Widened128Mask[i] < 0)
16681 continue;
16682
16683 // Make sure all V1 subvectors are in place.
16684 if (Widened128Mask[i] < 4) {
16685 if (Widened128Mask[i] != i) {
16686 IsInsert = false;
16687 break;
16688 }
16689 } else {
16690 // Make sure we only have a single V2 index and its the lowest 128-bits.
16691 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16692 IsInsert = false;
16693 break;
16694 }
16695 V2Index = i;
16696 }
16697 }
16698 if (IsInsert && V2Index >= 0) {
16699 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16700 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16701 DAG.getIntPtrConstant(0, DL));
16702 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16703 }
16704
16705 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
16706 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16707 // possible we at least ensure the lanes stay sequential to help later
16708 // combines.
16709 SmallVector<int, 2> Widened256Mask;
16710 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16711 Widened128Mask.clear();
16712 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16713 }
16714
16715 // Try to lower to vshuf64x2/vshuf32x4.
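 // For example (illustrative), a widened 128-bit-lane mask <0, 2, 5, 7> takes
 // lanes 0 and 2 of V1 and lanes 1 and 3 of V2, so PermMask should become
 // {0, 2, 1, 3}.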
16716 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16717 int PermMask[4] = {-1, -1, -1, -1};
16718 // Ensure elements came from the same Op.
16719 for (int i = 0; i < 4; ++i) {
16720 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16721 if (Widened128Mask[i] < 0)
16722 continue;
16723
16724 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16725 unsigned OpIndex = i / 2;
16726 if (Ops[OpIndex].isUndef())
16727 Ops[OpIndex] = Op;
16728 else if (Ops[OpIndex] != Op)
16729 return SDValue();
16730
16731 PermMask[i] = Widened128Mask[i] % 4;
16732 }
16733
16734 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16735 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16736}
16737
16738/// Handle lowering of 8-lane 64-bit floating point shuffles.
16739static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16740 const APInt &Zeroable, SDValue V1, SDValue V2,
16741 const X86Subtarget &Subtarget,
16742 SelectionDAG &DAG) {
16743 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16744 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16745 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16746
16747 if (V2.isUndef()) {
16748 // Use low duplicate instructions for masks that match their pattern.
16749 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16750 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16751
16752 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16753 // Non-half-crossing single input shuffles can be lowered with an
16754 // interleaved permutation.
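 // For example (illustrative), Mask = <1, 0, 3, 2, 5, 4, 7, 6> swaps within
 // every 128-bit lane and should encode as immediate 0b01010101 (0x55).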
16755 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16756 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16757 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16758 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16759 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16760 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16761 }
16762
16763 SmallVector<int, 4> RepeatedMask;
16764 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16765 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16766 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16767 }
16768
16769 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16770 V2, Subtarget, DAG))
16771 return Shuf128;
16772
16773 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16774 return Unpck;
16775
16776 // Check if the blend happens to exactly fit that of SHUFPD.
16777 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16778 Zeroable, Subtarget, DAG))
16779 return Op;
16780
16781 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16782 DAG, Subtarget))
16783 return V;
16784
16785 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16786 Zeroable, Subtarget, DAG))
16787 return Blend;
16788
16789 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16790}
16791
16792/// Handle lowering of 16-lane 32-bit floating point shuffles.
16793static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16794 const APInt &Zeroable, SDValue V1, SDValue V2,
16795 const X86Subtarget &Subtarget,
16796 SelectionDAG &DAG) {
16797 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16798 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16799 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16800
16801 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16802 // options to efficiently lower the shuffle.
16803 SmallVector<int, 4> RepeatedMask;
16804 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16805 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16806
16807 // Use even/odd duplicate instructions for masks that match their pattern.
16808 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16809 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16810 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16811 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16812
16813 if (V2.isUndef())
16814 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16815 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16816
16817 // Use dedicated unpack instructions for masks that match their pattern.
16818 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16819 return V;
16820
16821 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16822 Zeroable, Subtarget, DAG))
16823 return Blend;
16824
16825 // Otherwise, fall back to a SHUFPS sequence.
16826 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16827 }
16828
16829 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16830 Zeroable, Subtarget, DAG))
16831 return Blend;
16832
16833 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16834 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16835 return DAG.getBitcast(MVT::v16f32, ZExt);
16836
16837 // Try to create an in-lane repeating shuffle mask and then shuffle the
16838 // results into the target lanes.
16839 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16840 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16841 return V;
16842
16843 // If we have a single-input shuffle with different shuffle patterns in the
16844 // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
16845 if (V2.isUndef() &&
16846 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16847 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16848 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16849 }
16850
16851 // If we have AVX512F support, we can use VEXPAND.
16852 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16853 V1, V2, DAG, Subtarget))
16854 return V;
16855
16856 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16857}
16858
16859/// Handle lowering of 8-lane 64-bit integer shuffles.
16860static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16861 const APInt &Zeroable, SDValue V1, SDValue V2,
16862 const X86Subtarget &Subtarget,
16863 SelectionDAG &DAG) {
16864 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16865 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16866 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16867
16868 // Try to use shift instructions if fast.
16869 if (Subtarget.preferLowerShuffleAsShift())
16870 if (SDValue Shift =
16871 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16872 Subtarget, DAG, /*BitwiseOnly*/ true))
16873 return Shift;
16874
16875 if (V2.isUndef()) {
16876 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16877 // can use lower latency instructions that will operate on all four
16878 // 128-bit lanes.
16879 SmallVector<int, 2> Repeated128Mask;
16880 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16881 SmallVector<int, 4> PSHUFDMask;
16882 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16883 return DAG.getBitcast(
16884 MVT::v8i64,
16885 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16886 DAG.getBitcast(MVT::v16i32, V1),
16887 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16888 }
16889
16890 SmallVector<int, 4> Repeated256Mask;
16891 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16892 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16893 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16894 }
16895
16896 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16897 V2, Subtarget, DAG))
16898 return Shuf128;
16899
16900 // Try to use shift instructions.
16901 if (SDValue Shift =
16902 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16903 DAG, /*BitwiseOnly*/ false))
16904 return Shift;
16905
16906 // Try to use VALIGN.
16907 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16908 Zeroable, Subtarget, DAG))
16909 return Rotate;
16910
16911 // Try to use PALIGNR.
16912 if (Subtarget.hasBWI())
16913 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16914 Subtarget, DAG))
16915 return Rotate;
16916
16917 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16918 return Unpck;
16919
16920 // If we have AVX512F support, we can use VEXPAND.
16921 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16922 DAG, Subtarget))
16923 return V;
16924
16925 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16926 Zeroable, Subtarget, DAG))
16927 return Blend;
16928
16929 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16930}
16931
16932/// Handle lowering of 16-lane 32-bit integer shuffles.
16933static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16934 const APInt &Zeroable, SDValue V1, SDValue V2,
16935 const X86Subtarget &Subtarget,
16936 SelectionDAG &DAG) {
16937 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16938 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16939 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16940
16941 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16942
16943 // Whenever we can lower this as a zext, that instruction is strictly faster
16944 // than any alternative. It also allows us to fold memory operands into the
16945 // shuffle in many cases.
16946 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16947 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16948 return ZExt;
16949
16950 // Try to use shift instructions if fast.
16951 if (Subtarget.preferLowerShuffleAsShift()) {
16952 if (SDValue Shift =
16953 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16954 Subtarget, DAG, /*BitwiseOnly*/ true))
16955 return Shift;
16956 if (NumV2Elements == 0)
16957 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16958 Subtarget, DAG))
16959 return Rotate;
16960 }
16961
16962 // If the shuffle mask is repeated in each 128-bit lane we can use more
16963 // efficient instructions that mirror the shuffles across the four 128-bit
16964 // lanes.
16965 SmallVector<int, 4> RepeatedMask;
16966 bool Is128BitLaneRepeatedShuffle =
16967 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16968 if (Is128BitLaneRepeatedShuffle) {
16969 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16970 if (V2.isUndef())
16971 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16972 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16973
16974 // Use dedicated unpack instructions for masks that match their pattern.
16975 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16976 return V;
16977 }
16978
16979 // Try to use shift instructions.
16980 if (SDValue Shift =
16981 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16982 Subtarget, DAG, /*BitwiseOnly*/ false))
16983 return Shift;
16984
16985 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
16986 if (SDValue Rotate =
16987 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16988 return Rotate;
16989
16990 // Try to use VALIGN.
16991 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16992 Zeroable, Subtarget, DAG))
16993 return Rotate;
16994
16995 // Try to use byte rotation instructions.
16996 if (Subtarget.hasBWI())
16997 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16998 Subtarget, DAG))
16999 return Rotate;
17000
17001 // Assume that a single SHUFPS is faster than using a permv shuffle.
17002 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17003 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17004 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17005 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17006 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17007 CastV1, CastV2, DAG);
17008 return DAG.getBitcast(MVT::v16i32, ShufPS);
17009 }
17010
17011 // Try to create an in-lane repeating shuffle mask and then shuffle the
17012 // results into the target lanes.
17013 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17014 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17015 return V;
17016
17017 // If we have AVX512F support, we can use VEXPAND.
17018 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17019 DAG, Subtarget))
17020 return V;
17021
17022 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17023 Zeroable, Subtarget, DAG))
17024 return Blend;
17025
17026 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17027}
17028
17029/// Handle lowering of 32-lane 16-bit integer shuffles.
17030static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17031 const APInt &Zeroable, SDValue V1, SDValue V2,
17032 const X86Subtarget &Subtarget,
17033 SelectionDAG &DAG) {
17034 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17035 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17036 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17037 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17038
17039 // Whenever we can lower this as a zext, that instruction is strictly faster
17040 // than any alternative. It also allows us to fold memory operands into the
17041 // shuffle in many cases.
17042 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17043 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17044 return ZExt;
17045
17046 // Use dedicated unpack instructions for masks that match their pattern.
17047 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17048 return V;
17049
17050 // Use dedicated pack instructions for masks that match their pattern.
17051 if (SDValue V =
17052 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17053 return V;
17054
17055 // Try to use shift instructions.
17056 if (SDValue Shift =
17057 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17058 Subtarget, DAG, /*BitwiseOnly*/ false))
17059 return Shift;
17060
17061 // Try to use byte rotation instructions.
17062 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17063 Subtarget, DAG))
17064 return Rotate;
17065
17066 if (V2.isUndef()) {
17067 // Try to use bit rotation instructions.
17068 if (SDValue Rotate =
17069 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17070 return Rotate;
17071
17072 SmallVector<int, 8> RepeatedMask;
17073 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17074 // As this is a single-input shuffle, the repeated mask should be
17075 // a strictly valid v8i16 mask that we can pass through to the v8i16
17076 // lowering to handle even the v32 case.
17077 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17078 RepeatedMask, Subtarget, DAG);
17079 }
17080 }
17081
17082 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17083 Zeroable, Subtarget, DAG))
17084 return Blend;
17085
17086 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17087 Zeroable, Subtarget, DAG))
17088 return PSHUFB;
17089
17090 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17091}
17092
17093/// Handle lowering of 64-lane 8-bit integer shuffles.
17094static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17095 const APInt &Zeroable, SDValue V1, SDValue V2,
17096 const X86Subtarget &Subtarget,
17097 SelectionDAG &DAG) {
17098 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17099 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17100 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17101 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17102
17103 // Whenever we can lower this as a zext, that instruction is strictly faster
17104 // than any alternative. It also allows us to fold memory operands into the
17105 // shuffle in many cases.
17106 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17107 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17108 return ZExt;
17109
17110 // Use dedicated unpack instructions for masks that match their pattern.
17111 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17112 return V;
17113
17114 // Use dedicated pack instructions for masks that match their pattern.
17115 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17116 Subtarget))
17117 return V;
17118
17119 // Try to use shift instructions.
17120 if (SDValue Shift =
17121 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17122 DAG, /*BitwiseOnly*/ false))
17123 return Shift;
17124
17125 // Try to use byte rotation instructions.
17126 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17127 Subtarget, DAG))
17128 return Rotate;
17129
17130 // Try to use bit rotation instructions.
17131 if (V2.isUndef())
17132 if (SDValue Rotate =
17133 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17134 return Rotate;
17135
17136 // Lower as AND if possible.
17137 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17138 Zeroable, Subtarget, DAG))
17139 return Masked;
17140
17141 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17142 Zeroable, Subtarget, DAG))
17143 return PSHUFB;
17144
17145 // Try to create an in-lane repeating shuffle mask and then shuffle the
17146 // results into the target lanes.
17147 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17148 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17149 return V;
17150
17151 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17152 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17153 return Result;
17154
17155 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17156 Zeroable, Subtarget, DAG))
17157 return Blend;
17158
17159 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17160 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17161 // PALIGNR will be cheaper than the second PSHUFB+OR.
17162 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17163 Mask, Subtarget, DAG))
17164 return V;
17165
17166 // If we can't directly blend but can use PSHUFB, that will be better as it
17167 // can both shuffle and set up the inefficient blend.
17168 bool V1InUse, V2InUse;
17169 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17170 DAG, V1InUse, V2InUse);
17171 }
17172
17173 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17174 // shuffle.
17175 if (!V2.isUndef())
17176 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17177 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17178 return Result;
17179
17180 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17181 if (Subtarget.hasVBMI())
17182 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17183
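// Without VBMI there is no single instruction that can permute bytes across
// the 128-bit lanes of a 512-bit vector, so the last resort is to split the
// shuffle into two 256-bit shuffles and rejoin the halves.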
17184 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17185}
17186
17187/// High-level routine to lower various 512-bit x86 vector shuffles.
17188///
17189/// This routine either breaks down the specific type of a 512-bit x86 vector
17190/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17191/// together based on the available instructions.
17192 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17193 MVT VT, SDValue V1, SDValue V2,
17194 const APInt &Zeroable,
17195 const X86Subtarget &Subtarget,
17196 SelectionDAG &DAG) {
17197 assert(Subtarget.hasAVX512() &&
17198 "Cannot lower 512-bit vectors w/ basic ISA!");
17199
17200 // If we have a single input to the zero element, insert that into V1 if we
17201 // can do so cheaply.
17202 int NumElts = Mask.size();
17203 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17204
17205 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17206 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17207 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17208 return Insertion;
17209
17210 // Handle special cases where the lower or upper half is UNDEF.
17211 if (SDValue V =
17212 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17213 return V;
17214
17215 // Check for being able to broadcast a single element.
17216 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17217 Subtarget, DAG))
17218 return Broadcast;
17219
17220 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17221 // Try using bit ops for masking and blending before falling back to
17222 // splitting.
17223 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17224 Subtarget, DAG))
17225 return V;
17226 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17227 return V;
17228
17229 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17230 }
17231
17232 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17233 if (!Subtarget.hasBWI())
17234 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17235 /*SimpleOnly*/ false);
17236
17237 V1 = DAG.getBitcast(MVT::v32i16, V1);
17238 V2 = DAG.getBitcast(MVT::v32i16, V2);
17239 return DAG.getBitcast(VT,
17240 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17241 }
17242
17243 // Dispatch to each element type for lowering. If we don't have support for
17244 // specific element type shuffles at 512 bits, immediately split them and
17245 // lower them. Each lowering routine of a given type is allowed to assume that
17246 // the requisite ISA extensions for that element type are available.
17247 switch (VT.SimpleTy) {
17248 case MVT::v8f64:
17249 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17250 case MVT::v16f32:
17251 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17252 case MVT::v8i64:
17253 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17254 case MVT::v16i32:
17255 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17256 case MVT::v32i16:
17257 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17258 case MVT::v64i8:
17259 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17260
17261 default:
17262 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17263 }
17264}
17265
17266 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17267 MVT VT, SDValue V1, SDValue V2,
17268 const X86Subtarget &Subtarget,
17269 SelectionDAG &DAG) {
17270 // Shuffle should be unary.
17271 if (!V2.isUndef())
17272 return SDValue();
17273
17274 int ShiftAmt = -1;
17275 int NumElts = Mask.size();
17276 for (int i = 0; i != NumElts; ++i) {
17277 int M = Mask[i];
17278 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17279 "Unexpected mask index.");
17280 if (M < 0)
17281 continue;
17282
17283 // The first non-undef element determines our shift amount.
17284 if (ShiftAmt < 0) {
17285 ShiftAmt = M - i;
17286 // Need to be shifting right.
17287 if (ShiftAmt <= 0)
17288 return SDValue();
17289 }
17290 // All non-undef elements must shift by the same amount.
17291 if (ShiftAmt != M - i)
17292 return SDValue();
17293 }
17294 assert(ShiftAmt >= 0 && "All undef?");
17295
17296 // Great we found a shift right.
17297 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17298 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17299 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17300 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17301 DAG.getIntPtrConstant(0, DL));
17302}
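// Example: the v8i1 mask <2,3,4,5,6,7,u,u> shifts every defined element down
// by two positions, so after widening the mask register it becomes a single
// KSHIFTR by 2 followed by extracting the original-width subvector.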
17303
17304// Determine if this shuffle can be implemented with a KSHIFT instruction.
17305// Returns the shift amount if possible or -1 if not. This is a simplified
17306// version of matchShuffleAsShift.
17307static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17308 int MaskOffset, const APInt &Zeroable) {
17309 int Size = Mask.size();
17310
17311 auto CheckZeros = [&](int Shift, bool Left) {
17312 for (int j = 0; j < Shift; ++j)
17313 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17314 return false;
17315
17316 return true;
17317 };
17318
17319 auto MatchShift = [&](int Shift, bool Left) {
17320 unsigned Pos = Left ? Shift : 0;
17321 unsigned Low = Left ? 0 : Shift;
17322 unsigned Len = Size - Shift;
17323 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17324 };
17325
17326 for (int Shift = 1; Shift != Size; ++Shift)
17327 for (bool Left : {true, false})
17328 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17329 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17330 return Shift;
17331 }
17332
17333 return -1;
17334}
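// Example: with MaskOffset == 0, the mask <Z,Z,0,1,2,...> (low elements
// zeroable) matches a KSHIFTL by 2, while <2,3,...,Z,Z> (high elements
// zeroable) matches a KSHIFTR by 2.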
17335
17336
17337// Lower vXi1 vector shuffles.
17338 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17339 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17340 // vector, shuffle and then truncate it back.
17341 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17342 MVT VT, SDValue V1, SDValue V2,
17343 const APInt &Zeroable,
17344 const X86Subtarget &Subtarget,
17345 SelectionDAG &DAG) {
17346 assert(Subtarget.hasAVX512() &&
17347 "Cannot lower 512-bit vectors w/o basic ISA!");
17348
17349 int NumElts = Mask.size();
17350 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17351
17352 // Try to recognize shuffles that are just padding a subvector with zeros.
17353 int SubvecElts = 0;
17354 int Src = -1;
17355 for (int i = 0; i != NumElts; ++i) {
17356 if (Mask[i] >= 0) {
17357 // Grab the source from the first valid mask element. All subsequent
17358 // elements need to use this same source.
17359 if (Src < 0)
17360 Src = Mask[i] / NumElts;
17361 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17362 break;
17363 }
17364
17365 ++SubvecElts;
17366 }
17367 assert(SubvecElts != NumElts && "Identity shuffle?");
17368
17369 // Clip to a power of 2.
17370 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17371
17372 // Make sure the number of zeroable bits in the top at least covers the bits
17373 // not covered by the subvector.
17374 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17375 assert(Src >= 0 && "Expected a source!");
17376 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17377 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17378 Src == 0 ? V1 : V2,
17379 DAG.getIntPtrConstant(0, DL));
17380 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17381 DAG.getConstant(0, DL, VT),
17382 Extract, DAG.getIntPtrConstant(0, DL));
17383 }
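// Example: a v16i1 mask <0,1,2,3,Z,...,Z> keeps the low four elements of V1
// and zeroes the rest, so it is lowered as an EXTRACT_SUBVECTOR of v4i1
// inserted at index 0 into an all-zero v16i1.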
17384
17385 // Try a simple shift right with undef elements. Later we'll try with zeros.
17386 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17387 DAG))
17388 return Shift;
17389
17390 // Try to match KSHIFTs.
17391 unsigned Offset = 0;
17392 for (SDValue V : { V1, V2 }) {
17393 unsigned Opcode;
17394 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17395 if (ShiftAmt >= 0) {
17396 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17397 MVT WideVT = Res.getSimpleValueType();
17398 // Widened right shifts need two shifts to ensure we shift in zeroes.
17399 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17400 int WideElts = WideVT.getVectorNumElements();
17401 // Shift left to put the original vector in the MSBs of the new size.
17402 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17403 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17404 // Increase the shift amount to account for the left shift.
17405 ShiftAmt += WideElts - NumElts;
17406 }
17407
17408 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17409 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17410 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17411 DAG.getIntPtrConstant(0, DL));
17412 }
17413 Offset += NumElts; // Increment for next iteration.
17414 }
17415
17416 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17417 // ops instead.
17418 // TODO: What other unary shuffles would benefit from this?
17419 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17420 SDValue Op0 = V1.getOperand(0);
17421 SDValue Op1 = V1.getOperand(1);
17422 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17423 EVT OpVT = Op0.getValueType();
17424 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17425 return DAG.getSetCC(
17426 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17427 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17428 }
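// Example: shuffle(setcc(x, y, cc), undef, M) is rebuilt as
// setcc(shuffle(x, undef, M), shuffle(y, undef, M), cc), which keeps the
// shuffle on the wide SETCC operand type where far more lowering strategies
// exist than on the vXi1 result.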
17429
17430 MVT ExtVT;
17431 switch (VT.SimpleTy) {
17432 default:
17433 llvm_unreachable("Expected a vector of i1 elements");
17434 case MVT::v2i1:
17435 ExtVT = MVT::v2i64;
17436 break;
17437 case MVT::v4i1:
17438 ExtVT = MVT::v4i32;
17439 break;
17440 case MVT::v8i1:
17441 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17442 // shuffle.
17443 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17444 break;
17445 case MVT::v16i1:
17446 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17447 // 256-bit operation available.
17448 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17449 break;
17450 case MVT::v32i1:
17451 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17452 // 256-bit operation available.
17453 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17454 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17455 break;
17456 case MVT::v64i1:
17457 // Fall back to scalarization. FIXME: We can do better if the shuffle
17458 // can be partitioned cleanly.
17459 if (!Subtarget.useBWIRegs())
17460 return SDValue();
17461 ExtVT = MVT::v64i8;
17462 break;
17463 }
17464
17465 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17466 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17467
17468 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17469 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
17470 int NumElems = VT.getVectorNumElements();
17471 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17472 (Subtarget.hasDQI() && (NumElems < 32)))
17473 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17474 Shuffle, ISD::SETGT);
17475
17476 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17477}
17478
17479/// Helper function that returns true if the shuffle mask should be
17480/// commuted to improve canonicalization.
17481 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17482 int NumElements = Mask.size();
17483
17484 int NumV1Elements = 0, NumV2Elements = 0;
17485 for (int M : Mask)
17486 if (M < 0)
17487 continue;
17488 else if (M < NumElements)
17489 ++NumV1Elements;
17490 else
17491 ++NumV2Elements;
17492
17493 // Commute the shuffle as needed such that more elements come from V1 than
17494 // V2. This allows us to match the shuffle pattern strictly on how many
17495 // elements come from V1 without handling the symmetric cases.
17496 if (NumV2Elements > NumV1Elements)
17497 return true;
17498
17499 assert(NumV1Elements > 0 && "No V1 indices");
17500
17501 if (NumV2Elements == 0)
17502 return false;
17503
17504 // When the number of V1 and V2 elements are the same, try to minimize the
17505 // number of uses of V2 in the low half of the vector. When that is tied,
17506 // ensure that the sum of indices for V1 is equal to or lower than the sum
17507 // of indices for V2. When those are equal, try to ensure that the number of odd
17508 // indices for V1 is lower than the number of odd indices for V2.
17509 if (NumV1Elements == NumV2Elements) {
17510 int LowV1Elements = 0, LowV2Elements = 0;
17511 for (int M : Mask.slice(0, NumElements / 2))
17512 if (M >= NumElements)
17513 ++LowV2Elements;
17514 else if (M >= 0)
17515 ++LowV1Elements;
17516 if (LowV2Elements > LowV1Elements)
17517 return true;
17518 if (LowV2Elements == LowV1Elements) {
17519 int SumV1Indices = 0, SumV2Indices = 0;
17520 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17521 if (Mask[i] >= NumElements)
17522 SumV2Indices += i;
17523 else if (Mask[i] >= 0)
17524 SumV1Indices += i;
17525 if (SumV2Indices < SumV1Indices)
17526 return true;
17527 if (SumV2Indices == SumV1Indices) {
17528 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17529 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17530 if (Mask[i] >= NumElements)
17531 NumV2OddIndices += i % 2;
17532 else if (Mask[i] >= 0)
17533 NumV1OddIndices += i % 2;
17534 if (NumV2OddIndices < NumV1OddIndices)
17535 return true;
17536 }
17537 }
17538 }
17539
17540 return false;
17541}
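// Example: the v4i32 mask <4,5,0,6> takes three elements from V2, so the
// shuffle is commuted to <0,1,4,2> with V1 and V2 swapped before lowering.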
17542
17543 static bool canCombineAsMaskOperation(SDValue V,
17544 const X86Subtarget &Subtarget) {
17545 if (!Subtarget.hasAVX512())
17546 return false;
17547
17548 if (!V.getValueType().isSimple())
17549 return false;
17550
17551 MVT VT = V.getSimpleValueType().getScalarType();
17552 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17553 return false;
17554
17555 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17556 // are preferable to blendw/blendvb/masked-mov.
17557 if ((VT == MVT::i16 || VT == MVT::i8) &&
17558 V.getSimpleValueType().getSizeInBits() < 512)
17559 return false;
17560
17561 auto HasMaskOperation = [&](SDValue V) {
17562 // TODO: Currently we only check a limited set of opcodes. We could probably
17563 // extend it to all binary operations by checking TLI.isBinOp().
17564 switch (V->getOpcode()) {
17565 default:
17566 return false;
17567 case ISD::ADD:
17568 case ISD::SUB:
17569 case ISD::AND:
17570 case ISD::XOR:
17571 case ISD::OR:
17572 case ISD::SMAX:
17573 case ISD::SMIN:
17574 case ISD::UMAX:
17575 case ISD::UMIN:
17576 case ISD::ABS:
17577 case ISD::SHL:
17578 case ISD::SRL:
17579 case ISD::SRA:
17580 case ISD::MUL:
17581 break;
17582 }
17583 if (!V->hasOneUse())
17584 return false;
17585
17586 return true;
17587 };
17588
17589 if (HasMaskOperation(V))
17590 return true;
17591
17592 return false;
17593}
17594
17595// Forward declaration.
17596 static SDValue canonicalizeShuffleMaskWithHorizOp(
17597 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17598 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17599 const X86Subtarget &Subtarget);
17600
17601 /// Top-level lowering for x86 vector shuffles.
17602///
17603/// This handles decomposition, canonicalization, and lowering of all x86
17604/// vector shuffles. Most of the specific lowering strategies are encapsulated
17605/// above in helper routines. The canonicalization attempts to widen shuffles
17606/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17607/// s.t. only one of the two inputs needs to be tested, etc.
17608 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17609 SelectionDAG &DAG) {
17610 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17611 ArrayRef<int> OrigMask = SVOp->getMask();
17612 SDValue V1 = Op.getOperand(0);
17613 SDValue V2 = Op.getOperand(1);
17614 MVT VT = Op.getSimpleValueType();
17615 int NumElements = VT.getVectorNumElements();
17616 SDLoc DL(Op);
17617 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17618
17619 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17620 "Can't lower MMX shuffles");
17621
17622 bool V1IsUndef = V1.isUndef();
17623 bool V2IsUndef = V2.isUndef();
17624 if (V1IsUndef && V2IsUndef)
17625 return DAG.getUNDEF(VT);
17626
17627 // When we create a shuffle node we put the UNDEF node as the second operand,
17628 // but in some cases the first operand may be transformed to UNDEF.
17629 // In this case we should just commute the node.
17630 if (V1IsUndef)
17631 return DAG.getCommutedVectorShuffle(*SVOp);
17632
17633 // Check for non-undef masks pointing at an undef vector and make the masks
17634 // undef as well. This makes it easier to match the shuffle based solely on
17635 // the mask.
17636 if (V2IsUndef &&
17637 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17638 SmallVector<int, 8> NewMask(OrigMask);
17639 for (int &M : NewMask)
17640 if (M >= NumElements)
17641 M = -1;
17642 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17643 }
17644
17645 // Check for illegal shuffle mask element index values.
17646 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17647 (void)MaskUpperLimit;
17648 assert(llvm::all_of(OrigMask,
17649 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17650 "Out of bounds shuffle index");
17651
17652 // We actually see shuffles that are entirely re-arrangements of a set of
17653 // zero inputs. This mostly happens while decomposing complex shuffles into
17654 // simple ones. Directly lower these as a buildvector of zeros.
17655 APInt KnownUndef, KnownZero;
17656 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17657
17658 APInt Zeroable = KnownUndef | KnownZero;
17659 if (Zeroable.isAllOnes())
17660 return getZeroVector(VT, Subtarget, DAG, DL);
17661
17662 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17663
17664 // Try to collapse shuffles into using a vector type with fewer elements but
17665 // wider element types. We cap this to not form integers or floating point
17666 // elements wider than 64 bits. It does not seem beneficial to form i128
17667 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17668 SmallVector<int, 16> WidenedMask;
17669 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17670 !canCombineAsMaskOperation(V1, Subtarget) &&
17671 !canCombineAsMaskOperation(V2, Subtarget) &&
17672 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17673 // Shuffle mask widening should not interfere with a broadcast opportunity
17674 // by obfuscating the operands with bitcasts.
17675 // TODO: Avoid lowering directly from this top-level function: make this
17676 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17677 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17678 Subtarget, DAG))
17679 return Broadcast;
17680
17681 MVT NewEltVT = VT.isFloatingPoint()
17682 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17683 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17684 int NewNumElts = NumElements / 2;
17685 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17686 // Make sure that the new vector type is legal. For example, v2f64 isn't
17687 // legal on SSE1.
17688 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17689 if (V2IsZero) {
17690 // Modify the new Mask to take all zeros from the all-zero vector.
17691 // Choose indices that are blend-friendly.
17692 bool UsedZeroVector = false;
17693 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17694 "V2's non-undef elements are used?!");
17695 for (int i = 0; i != NewNumElts; ++i)
17696 if (WidenedMask[i] == SM_SentinelZero) {
17697 WidenedMask[i] = i + NewNumElts;
17698 UsedZeroVector = true;
17699 }
17700 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17701 // some elements to be undef.
17702 if (UsedZeroVector)
17703 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17704 }
17705 V1 = DAG.getBitcast(NewVT, V1);
17706 V2 = DAG.getBitcast(NewVT, V2);
17707 return DAG.getBitcast(
17708 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17709 }
17710 }
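// Example: the v8i16 mask <0,1,4,5,2,3,6,7> moves whole element pairs, so it
// widens to the v4i32 mask <0,2,1,3>, halving the number of elements the
// type-specific lowerings below have to reason about.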
17711
17712 SmallVector<SDValue> Ops = {V1, V2};
17713 SmallVector<int> Mask(OrigMask);
17714
17715 // Canonicalize the shuffle with any horizontal ops inputs.
17716 // NOTE: This may update Ops and Mask.
17717 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17718 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17719 return DAG.getBitcast(VT, HOp);
17720
17721 V1 = DAG.getBitcast(VT, Ops[0]);
17722 V2 = DAG.getBitcast(VT, Ops[1]);
17723 assert(NumElements == (int)Mask.size() &&
17724 "canonicalizeShuffleMaskWithHorizOp "
17725 "shouldn't alter the shuffle mask size");
17726
17727 // Commute the shuffle if it will improve canonicalization.
17728 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17729 ShuffleVectorSDNode::commuteMask(Mask);
17730 std::swap(V1, V2);
17731 }
17732
17733 // For each vector width, delegate to a specialized lowering routine.
17734 if (VT.is128BitVector())
17735 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17736
17737 if (VT.is256BitVector())
17738 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17739
17740 if (VT.is512BitVector())
17741 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17742
17743 if (Is1BitVector)
17744 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17745
17746 llvm_unreachable("Unimplemented!");
17747}
17748
17749/// Try to lower a VSELECT instruction to a vector shuffle.
17750 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17751 const X86Subtarget &Subtarget,
17752 SelectionDAG &DAG) {
17753 SDValue Cond = Op.getOperand(0);
17754 SDValue LHS = Op.getOperand(1);
17755 SDValue RHS = Op.getOperand(2);
17756 MVT VT = Op.getSimpleValueType();
17757
17758 // Only non-legal VSELECTs reach this lowering, convert those into generic
17759 // shuffles and re-use the shuffle lowering path for blends.
17760 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17761 SmallVector<int, 32> Mask;
17762 if (createShuffleMaskFromVSELECT(Mask, Cond))
17763 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17764 }
17765
17766 return SDValue();
17767}
17768
17769SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17770 SDValue Cond = Op.getOperand(0);
17771 SDValue LHS = Op.getOperand(1);
17772 SDValue RHS = Op.getOperand(2);
17773
17774 SDLoc dl(Op);
17775 MVT VT = Op.getSimpleValueType();
17776 if (isSoftF16(VT, Subtarget)) {
17777 MVT NVT = VT.changeVectorElementTypeToInteger();
17778 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17779 DAG.getBitcast(NVT, LHS),
17780 DAG.getBitcast(NVT, RHS)));
17781 }
17782
17783 // A vselect where all conditions and data are constants can be optimized into
17784 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17785 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17786 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17787 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17788 return SDValue();
17789
17790 // Try to lower this to a blend-style vector shuffle. This can handle all
17791 // constant condition cases.
17792 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17793 return BlendOp;
17794
17795 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17796 // with patterns on the mask registers on AVX-512.
17797 MVT CondVT = Cond.getSimpleValueType();
17798 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17799 if (CondEltSize == 1)
17800 return Op;
17801
17802 // Variable blends are only legal from SSE4.1 onward.
17803 if (!Subtarget.hasSSE41())
17804 return SDValue();
17805
17806 unsigned EltSize = VT.getScalarSizeInBits();
17807 unsigned NumElts = VT.getVectorNumElements();
17808
17809 // Expand v32i16/v64i8 without BWI.
17810 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17811 return SDValue();
17812
17813 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17814 // into an i1 condition so that we can use the mask-based 512-bit blend
17815 // instructions.
17816 if (VT.getSizeInBits() == 512) {
17817 // Build a mask by testing the condition against zero.
17818 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17819 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17820 DAG.getConstant(0, dl, CondVT),
17821 ISD::SETNE);
17822 // Now return a new VSELECT using the mask.
17823 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17824 }
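// Example: a v16i32 vselect whose condition is also v16i32 is rewritten as
// (setne Cond, 0) to form a v16i1 mask, after which the select matches the
// AVX-512 masked-move/blend patterns directly.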
17825
17826 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17827 if (CondEltSize != EltSize) {
17828 // If we don't have a sign splat, rely on the expansion.
17829 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17830 return SDValue();
17831
17832 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17833 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17834 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17835 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17836 }
17837
17838 // Only some types will be legal on some subtargets. If we can emit a legal
17839 // VSELECT-matching blend, return Op, but if we need to expand, return
17840 // a null value.
17841 switch (VT.SimpleTy) {
17842 default:
17843 // Most of the vector types have blends past SSE4.1.
17844 return Op;
17845
17846 case MVT::v32i8:
17847 // The byte blends for AVX vectors were introduced only in AVX2.
17848 if (Subtarget.hasAVX2())
17849 return Op;
17850
17851 return SDValue();
17852
17853 case MVT::v8i16:
17854 case MVT::v16i16: {
17855 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
17856 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17857 Cond = DAG.getBitcast(CastVT, Cond);
17858 LHS = DAG.getBitcast(CastVT, LHS);
17859 RHS = DAG.getBitcast(CastVT, RHS);
17860 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17861 return DAG.getBitcast(VT, Select);
17862 }
17863 }
17864}
17865
17866 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17867 MVT VT = Op.getSimpleValueType();
17868 SDValue Vec = Op.getOperand(0);
17869 SDValue Idx = Op.getOperand(1);
17870 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17871 SDLoc dl(Op);
17872
17873 if (!Vec.getSimpleValueType().is128BitVector())
17874 return SDValue();
17875
17876 if (VT.getSizeInBits() == 8) {
17877 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17878 // we're going to zero extend the register or fold the store.
17879 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17880 !X86::mayFoldIntoStore(Op))
17881 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17882 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17883 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17884
17885 unsigned IdxVal = Idx->getAsZExtVal();
17886 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17887 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17888 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17889 }
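// Example: extractelement <16 x i8> %v, i32 5 becomes a PEXTRB of lane 5
// into a 32-bit GPR followed by a truncate to i8; for index 0 without a
// zero-extend or store use, a plain move of the low dword is cheaper.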
17890
17891 if (VT == MVT::f32) {
17892 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17893 // the result back to FR32 register. It's only worth matching if the
17894 // result has a single use which is a store or a bitcast to i32. And in
17895 // the case of a store, it's not worth it if the index is a constant 0,
17896 // because a MOVSSmr can be used instead, which is smaller and faster.
17897 if (!Op.hasOneUse())
17898 return SDValue();
17899 SDNode *User = *Op.getNode()->use_begin();
17900 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17901 (User->getOpcode() != ISD::BITCAST ||
17902 User->getValueType(0) != MVT::i32))
17903 return SDValue();
17904 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17905 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17906 return DAG.getBitcast(MVT::f32, Extract);
17907 }
17908
17909 if (VT == MVT::i32 || VT == MVT::i64)
17910 return Op;
17911
17912 return SDValue();
17913}
17914
17915/// Extract one bit from mask vector, like v16i1 or v8i1.
17916/// AVX-512 feature.
17917 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17918 const X86Subtarget &Subtarget) {
17919 SDValue Vec = Op.getOperand(0);
17920 SDLoc dl(Vec);
17921 MVT VecVT = Vec.getSimpleValueType();
17922 SDValue Idx = Op.getOperand(1);
17923 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17924 MVT EltVT = Op.getSimpleValueType();
17925
17926 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17927 "Unexpected vector type in ExtractBitFromMaskVector");
17928
17929 // A variable index can't be handled in mask registers,
17930 // so extend the vector to VR512/128.
17931 if (!IdxC) {
17932 unsigned NumElts = VecVT.getVectorNumElements();
17933 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
17934 // than extending to 128/256-bit.
17935 if (NumElts == 1) {
17936 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17937 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17938 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17939 }
17940 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17941 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17942 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17943 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17944 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17945 }
17946
17947 unsigned IdxVal = IdxC->getZExtValue();
17948 if (IdxVal == 0) // the operation is legal
17949 return Op;
17950
17951 // Extend to natively supported kshift.
17952 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17953
17954 // Use kshiftr instruction to move to the lower element.
17955 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17956 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17957
17958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17959 DAG.getIntPtrConstant(0, dl));
17960}
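// Example: extracting bit 5 of a v16i1 mask widens the mask register if
// necessary, shifts it right by 5 with KSHIFTR, and then reads element 0,
// the only position that is legal to copy out directly.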
17961
17962// Helper to find all the extracted elements from a vector.
17963 static APInt getExtractedDemandedElts(SDNode *N) {
17964 MVT VT = N->getSimpleValueType(0);
17965 unsigned NumElts = VT.getVectorNumElements();
17966 APInt DemandedElts = APInt::getZero(NumElts);
17967 for (SDNode *User : N->uses()) {
17968 switch (User->getOpcode()) {
17969 case X86ISD::PEXTRB:
17970 case X86ISD::PEXTRW:
17971 case ISD::EXTRACT_VECTOR_ELT:
17972 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17973 DemandedElts.setAllBits();
17974 return DemandedElts;
17975 }
17976 DemandedElts.setBit(User->getConstantOperandVal(1));
17977 break;
17978 case ISD::BITCAST: {
17979 if (!User->getValueType(0).isSimple() ||
17980 !User->getValueType(0).isVector()) {
17981 DemandedElts.setAllBits();
17982 return DemandedElts;
17983 }
17984 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17985 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17986 break;
17987 }
17988 default:
17989 DemandedElts.setAllBits();
17990 return DemandedElts;
17991 }
17992 }
17993 return DemandedElts;
17994}
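// The demanded-elts set computed above lets the v16i8 extract lowering below
// tell whether every use reads from a single DWORD/WORD of the source, in
// which case one 32/16-bit extract plus a shift replaces per-byte PEXTRBs.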
17995
17996SDValue
17997X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17998 SelectionDAG &DAG) const {
17999 SDLoc dl(Op);
18000 SDValue Vec = Op.getOperand(0);
18001 MVT VecVT = Vec.getSimpleValueType();
18002 SDValue Idx = Op.getOperand(1);
18003 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18004
18005 if (VecVT.getVectorElementType() == MVT::i1)
18006 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18007
18008 if (!IdxC) {
18009 // It's more profitable to go through memory (1 cycle throughput)
18010 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
18011 // The IACA tool was used to get the performance estimate
18012 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18013 //
18014 // example : extractelement <16 x i8> %a, i32 %i
18015 //
18016 // Block Throughput: 3.00 Cycles
18017 // Throughput Bottleneck: Port5
18018 //
18019 // | Num Of | Ports pressure in cycles | |
18020 // | Uops | 0 - DV | 5 | 6 | 7 | |
18021 // ---------------------------------------------
18022 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18023 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18024 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18025 // Total Num Of Uops: 4
18026 //
18027 //
18028 // Block Throughput: 1.00 Cycles
18029 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18030 //
18031 // | | Ports pressure in cycles | |
18032 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18033 // ---------------------------------------------------------
18034 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18035 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18036 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18037 // Total Num Of Uops: 4
18038
18039 return SDValue();
18040 }
18041
18042 unsigned IdxVal = IdxC->getZExtValue();
18043
18044 // If this is a 256-bit vector result, first extract the 128-bit vector and
18045 // then extract the element from the 128-bit vector.
18046 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18047 // Get the 128-bit vector.
18048 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18049 MVT EltVT = VecVT.getVectorElementType();
18050
18051 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18052 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18053
18054 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18055 // this can be done with a mask.
18056 IdxVal &= ElemsPerChunk - 1;
18057 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18058 DAG.getIntPtrConstant(IdxVal, dl));
18059 }
18060
18061 assert(VecVT.is128BitVector() && "Unexpected vector length");
18062
18063 MVT VT = Op.getSimpleValueType();
18064
18065 if (VT == MVT::i16) {
18066 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18067 // we're going to zero extend the register or fold the store (SSE41 only).
18068 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18069 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18070 if (Subtarget.hasFP16())
18071 return Op;
18072
18073 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18074 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18075 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18076 }
18077
18078 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18079 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18080 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18081 }
18082
18083 if (Subtarget.hasSSE41())
18084 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18085 return Res;
18086
18087 // Only extract a single element from a v16i8 source - determine the common
18088 // DWORD/WORD that all extractions share, and extract the sub-byte.
18089 // TODO: Add QWORD MOVQ extraction?
18090 if (VT == MVT::i8) {
18091 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18092 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18093
18094 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18095 int DWordIdx = IdxVal / 4;
18096 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18097 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18098 DAG.getBitcast(MVT::v4i32, Vec),
18099 DAG.getIntPtrConstant(DWordIdx, dl));
18100 int ShiftVal = (IdxVal % 4) * 8;
18101 if (ShiftVal != 0)
18102 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18103 DAG.getConstant(ShiftVal, dl, MVT::i8));
18104 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18105 }
18106
18107 int WordIdx = IdxVal / 2;
18108 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18109 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18110 DAG.getBitcast(MVT::v8i16, Vec),
18111 DAG.getIntPtrConstant(WordIdx, dl));
18112 int ShiftVal = (IdxVal % 2) * 8;
18113 if (ShiftVal != 0)
18114 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18115 DAG.getConstant(ShiftVal, dl, MVT::i8));
18116 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18117 }
18118 }
18119
18120 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18121 if (IdxVal == 0)
18122 return Op;
18123
18124 // Shuffle the element to the lowest element, then movss or movsh.
18125 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18126 Mask[0] = static_cast<int>(IdxVal);
18127 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18128 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18129 DAG.getIntPtrConstant(0, dl));
18130 }
18131
18132 if (VT.getSizeInBits() == 64) {
18133 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18134 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18135 // to match extract_elt for f64.
18136 if (IdxVal == 0)
18137 return Op;
18138
18139 // UNPCKHPD the element to the lowest double word, then movsd.
18140 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18141 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18142 int Mask[2] = { 1, -1 };
18143 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18144 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18145 DAG.getIntPtrConstant(0, dl));
18146 }
18147
18148 return SDValue();
18149}
18150
18151/// Insert one bit to mask vector, like v16i1 or v8i1.
18152/// AVX-512 feature.
18153 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18154 const X86Subtarget &Subtarget) {
18155 SDLoc dl(Op);
18156 SDValue Vec = Op.getOperand(0);
18157 SDValue Elt = Op.getOperand(1);
18158 SDValue Idx = Op.getOperand(2);
18159 MVT VecVT = Vec.getSimpleValueType();
18160
18161 if (!isa<ConstantSDNode>(Idx)) {
18162 // Non-constant index: extend source and destination,
18163 // insert the element and then truncate the result.
18164 unsigned NumElts = VecVT.getVectorNumElements();
18165 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18166 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18167 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18168 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18169 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18170 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18171 }
18172
18173 // Copy into a k-register, extract to v1i1 and insert_subvector.
18174 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18176}
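// Example: inserting a bit at constant index i materializes the scalar as a
// v1i1 and emits INSERT_SUBVECTOR at index i on the mask register, while a
// variable index round-trips through a sign-extended integer vector as above.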
18177
18178SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18179 SelectionDAG &DAG) const {
18180 MVT VT = Op.getSimpleValueType();
18181 MVT EltVT = VT.getVectorElementType();
18182 unsigned NumElts = VT.getVectorNumElements();
18183 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18184
18185 if (EltVT == MVT::i1)
18186 return InsertBitToMaskVector(Op, DAG, Subtarget);
18187
18188 SDLoc dl(Op);
18189 SDValue N0 = Op.getOperand(0);
18190 SDValue N1 = Op.getOperand(1);
18191 SDValue N2 = Op.getOperand(2);
18192 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18193
18194 if (EltVT == MVT::bf16) {
18195 MVT IVT = VT.changeVectorElementTypeToInteger();
18196 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18197 DAG.getBitcast(IVT, N0),
18198 DAG.getBitcast(MVT::i16, N1), N2);
18199 return DAG.getBitcast(VT, Res);
18200 }
18201
18202 if (!N2C) {
18203 // Variable insertion indices, usually we're better off spilling to stack,
18204 // but AVX512 can use a variable compare+select by comparing against all
18205 // possible vector indices, and FP insertion has less gpr->simd traffic.
18206 if (!(Subtarget.hasBWI() ||
18207 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18208 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18209 return SDValue();
18210
18211 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18212 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18213 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18214 return SDValue();
18215
18216 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18217 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18218 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18219
18220 SmallVector<SDValue, 16> RawIndices;
18221 for (unsigned I = 0; I != NumElts; ++I)
18222 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18223 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18224
18225 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18226 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18227 ISD::CondCode::SETEQ);
18228 }
18229
18230 if (N2C->getAPIntValue().uge(NumElts))
18231 return SDValue();
18232 uint64_t IdxVal = N2C->getZExtValue();
18233
18234 bool IsZeroElt = X86::isZeroNode(N1);
18235 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18236
18237 if (IsZeroElt || IsAllOnesElt) {
18238 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18239 // We don't deal with i8 0 since it appears to be handled elsewhere.
18240 if (IsAllOnesElt &&
18241 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18242 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18243 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18244 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18245 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18246 CstVectorElts[IdxVal] = OnesCst;
18247 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18248 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18249 }
18250 // See if we can do this more efficiently with a blend shuffle with a
18251 // rematerializable vector.
18252 if (Subtarget.hasSSE41() &&
18253 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18254 SmallVector<int, 8> BlendMask;
18255 for (unsigned i = 0; i != NumElts; ++i)
18256 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18257 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18258 : getOnesVector(VT, DAG, dl);
18259 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18260 }
18261 }
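// Example: inserting -1 into element 5 of a v16i16 without AVX2 is built as
// an OR with the constant vector <0,...,0,-1,0,...>; with SSE4.1 the same
// insertion becomes a blend shuffle against an all-ones (or all-zeros)
// vector that is cheap to rematerialize.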
18262
18263 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18264 // into that, and then insert the subvector back into the result.
18265 if (VT.is256BitVector() || VT.is512BitVector()) {
18266 // With a 256-bit vector, we can insert into the zero element efficiently
18267 // using a blend if we have AVX or AVX2 and the right data type.
18268 if (VT.is256BitVector() && IdxVal == 0) {
18269 // TODO: It is worthwhile to cast integer to floating point and back
18270 // and incur a domain crossing penalty if that's what we'll end up
18271 // doing anyway after extracting to a 128-bit vector.
18272 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18273 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18274 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18275 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18276 DAG.getTargetConstant(1, dl, MVT::i8));
18277 }
18278 }
18279
18280 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18281 assert(isPowerOf2_32(NumEltsIn128) &&
18282 "Vectors will always have power-of-two number of elements.");
18283
18284 // If we are not inserting into the low 128-bit vector chunk,
18285 // then prefer the broadcast+blend sequence.
18286 // FIXME: relax the profitability check iff all N1 uses are insertions.
18287 if (IdxVal >= NumEltsIn128 &&
18288 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18289 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18290 X86::mayFoldLoad(N1, Subtarget)))) {
18291 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18292 SmallVector<int, 8> BlendMask;
18293 for (unsigned i = 0; i != NumElts; ++i)
18294 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18295 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18296 }
18297
18298 // Get the desired 128-bit vector chunk.
18299 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18300
18301 // Insert the element into the desired chunk.
18302 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18303 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18304
18305 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18306 DAG.getIntPtrConstant(IdxIn128, dl));
18307
18308 // Insert the changed part back into the bigger vector
18309 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18310 }
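// Example: inserting into element 9 of a v16i32 extracts 128-bit chunk 2
// (elements 8..11), inserts into its element 1, and writes the chunk back,
// so only a 128-bit insertion has to be matched.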
18311 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18312
18313 // This will be just movw/movd/movq/movsh/movss/movsd.
18314 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18315 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18316 EltVT == MVT::f16 || EltVT == MVT::i64) {
18317 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18318 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18319 }
18320
18321 // We can't directly insert an i8 or i16 into a vector, so zero extend
18322 // it to i32 first.
18323 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18324 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18325 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18326 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18327 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18328 return DAG.getBitcast(VT, N1);
18329 }
18330 }
18331
18332 // Transform it so it matches pinsr{b,w}, which expect a GR32 as the second
18333 // argument. SSE41 is required for pinsrb.
18334 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18335 unsigned Opc;
18336 if (VT == MVT::v8i16) {
18337 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18338 Opc = X86ISD::PINSRW;
18339 } else {
18340 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18341 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18342 Opc = X86ISD::PINSRB;
18343 }
18344
18345 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18346 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18347 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18348 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18349 }
18350
18351 if (Subtarget.hasSSE41()) {
18352 if (EltVT == MVT::f32) {
18353 // Bits [7:6] of the constant are the source select. This will always be
18354 // zero here. The DAG Combiner may combine an extract_elt index into
18355 // these bits. For example (insert (extract, 3), 2) could be matched by
18356 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18357 // Bits [5:4] of the constant are the destination select. This is the
18358 // value of the incoming immediate.
18359 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18360 // combine either bitwise AND or insert of float 0.0 to set these bits.
18361
18362 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18363 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18364 // If this is an insertion of 32-bits into the low 32-bits of
18365 // a vector, we prefer to generate a blend with immediate rather
18366 // than an insertps. Blends are simpler operations in hardware and so
18367 // will always have equal or better performance than insertps.
18368 // But if optimizing for size and there's a load folding opportunity,
18369 // generate insertps because blendps does not have a 32-bit memory
18370 // operand form.
18371 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18372 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18373 DAG.getTargetConstant(1, dl, MVT::i8));
18374 }
18375 // Create this as a scalar to vector.
18376 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18377 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18378 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18379 }
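// Example: inserting an f32 into element 2 emits INSERTPS with immediate
// 0x20 (the destination lane in bits [5:4]); an insertion into element 0
// prefers BLENDPS with immediate 1 unless a foldable load makes INSERTPS
// smaller.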
18380
18381 // PINSR* works with constant index.
18382 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18383 return Op;
18384 }
18385
18386 return SDValue();
18387}
18388
18389 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18390 SelectionDAG &DAG) {
18391 SDLoc dl(Op);
18392 MVT OpVT = Op.getSimpleValueType();
18393
18394 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18395 // further combines.
18396 if (X86::isZeroNode(Op.getOperand(0)))
18397 return getZeroVector(OpVT, Subtarget, DAG, dl);
18398
18399 // If this is a 256-bit vector result, first insert into a 128-bit
18400 // vector and then insert into the 256-bit vector.
18401 if (!OpVT.is128BitVector()) {
18402 // Insert into a 128-bit vector.
18403 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18404 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18405 OpVT.getVectorNumElements() / SizeFactor);
18406
18407 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18408
18409 // Insert the 128-bit vector.
18410 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18411 }
18412 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18413 "Expected an SSE type!");
18414
18415 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18416 // tblgen.
18417 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18418 return Op;
18419
18420 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18421 return DAG.getBitcast(
18422 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18423}
18424
18425// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18426// simple superregister reference or explicit instructions to insert
18427// the upper bits of a vector.
18428 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18429 SelectionDAG &DAG) {
18430 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18431
18432 return insert1BitVector(Op, DAG, Subtarget);
18433}
18434
18435 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18436 SelectionDAG &DAG) {
18437 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18438 "Only vXi1 extract_subvectors need custom lowering");
18439
18440 SDLoc dl(Op);
18441 SDValue Vec = Op.getOperand(0);
18442 uint64_t IdxVal = Op.getConstantOperandVal(1);
18443
18444 if (IdxVal == 0) // the operation is legal
18445 return Op;
18446
18447 // Extend to natively supported kshift.
18448 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18449
18450 // Shift to the LSB.
18451 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18452 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18453
18454 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18455 DAG.getIntPtrConstant(0, dl));
18456}
18457
18458// Returns the appropriate wrapper opcode for a global reference.
18459unsigned X86TargetLowering::getGlobalWrapperKind(
18460 const GlobalValue *GV, const unsigned char OpFlags) const {
18461 // References to absolute symbols are never PC-relative.
18462 if (GV && GV->isAbsoluteSymbolRef())
18463 return X86ISD::Wrapper;
18464
18465 // The following OpFlags under RIP-rel PIC use RIP.
18466 if (Subtarget.isPICStyleRIPRel() &&
18467 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18468 OpFlags == X86II::MO_DLLIMPORT))
18469 return X86ISD::WrapperRIP;
18470
18471 // GOTPCREL references must always use RIP.
18472 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18473 return X86ISD::WrapperRIP;
18474
18475 return X86ISD::Wrapper;
18476}
18477
18478// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18479// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18480 // one of the above-mentioned nodes. It has to be wrapped because otherwise
18481 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18482 // be used to form an addressing mode. These wrapped nodes will be selected
18483// into MOV32ri.
18484SDValue
18485X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18486 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18487
18488 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18489 // global base reg.
18490 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18491
18492 auto PtrVT = getPointerTy(DAG.getDataLayout());
18493 SDValue Result = DAG.getTargetConstantPool(
18494 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18495 SDLoc DL(CP);
18496 Result =
18497 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18498 // With PIC, the address is actually $g + Offset.
18499 if (OpFlag) {
18500 Result =
18501 DAG.getNode(ISD::ADD, DL, PtrVT,
18502 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18503 }
18504
18505 return Result;
18506}
18507
18508SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18509 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18510
18511 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18512 // global base reg.
18513 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18514
18515 auto PtrVT = getPointerTy(DAG.getDataLayout());
18516 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18517 SDLoc DL(JT);
18518 Result =
18519 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18520
18521 // With PIC, the address is actually $g + Offset.
18522 if (OpFlag)
18523 Result =
18524 DAG.getNode(ISD::ADD, DL, PtrVT,
18525 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18526
18527 return Result;
18528}
18529
18530SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18531 SelectionDAG &DAG) const {
18532 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18533}
18534
18535SDValue
18536X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18537 // Create the TargetBlockAddress node.
18538 unsigned char OpFlags =
18539 Subtarget.classifyBlockAddressReference();
18540 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18541 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18542 SDLoc dl(Op);
18543 auto PtrVT = getPointerTy(DAG.getDataLayout());
18544 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18545 Result =
18546 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18547
18548 // With PIC, the address is actually $g + Offset.
18549 if (isGlobalRelativeToPICBase(OpFlags)) {
18550 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18551 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18552 }
18553
18554 return Result;
18555}
18556
18557/// Creates target global address or external symbol nodes for calls or
18558/// other uses.
18559SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18560 bool ForCall) const {
18561 // Unpack the global address or external symbol.
18562 SDLoc dl(Op);
18563 const GlobalValue *GV = nullptr;
18564 int64_t Offset = 0;
18565 const char *ExternalSym = nullptr;
18566 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18567 GV = G->getGlobal();
18568 Offset = G->getOffset();
18569 } else {
18570 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18571 ExternalSym = ES->getSymbol();
18572 }
18573
18574 // Calculate some flags for address lowering.
18575 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18576 unsigned char OpFlags;
18577 if (ForCall)
18578 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18579 else
18580 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18581 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18582 bool NeedsLoad = isGlobalStubReference(OpFlags);
18583
18584 CodeModel::Model M = DAG.getTarget().getCodeModel();
18585 auto PtrVT = getPointerTy(DAG.getDataLayout());
18586 SDValue Result;
18587
18588 if (GV) {
18589 // Create a target global address if this is a global. If possible, fold the
18590 // offset into the global address reference. Otherwise, ADD it on later.
18591 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18592 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18593 // relocation will compute to a negative value, which is invalid.
18594 int64_t GlobalOffset = 0;
18595 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18596 X86::isOffsetSuitableForCodeModel(Offset, M)) {
18597 std::swap(GlobalOffset, Offset);
18598 }
18599 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18600 } else {
18601 // If this is not a global address, this must be an external symbol.
18602 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18603 }
18604
18605 // If this is a direct call, avoid the wrapper if we don't need to do any
18606 // loads or adds. This allows SDAG ISel to match direct calls.
18607 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18608 return Result;
18609
18610 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18611
18612 // With PIC, the address is actually $g + Offset.
18613 if (HasPICReg) {
18614 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18615 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18616 }
18617
18618 // For globals that require a load from a stub to get the address, emit the
18619 // load.
18620 if (NeedsLoad)
18621 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18622 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18623
18624 // If there was a non-zero offset that we didn't fold, create an explicit
18625 // addition for it.
18626 if (Offset != 0)
18627 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18628 DAG.getConstant(Offset, dl, PtrVT));
18629
18630 return Result;
18631}
18632
18633SDValue
18634X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18635 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18636}
18637
18638static SDValue
18639 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18640 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18641 unsigned char OperandFlags, bool LocalDynamic = false) {
18642 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18643 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18644 SDLoc dl(GA);
18645 SDValue TGA;
18646 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18647 if (LocalDynamic && UseTLSDESC) {
18648 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18649 auto UI = TGA->use_begin();
18650 // Reuse existing GetTLSADDR node if we can find it.
18651 if (UI != TGA->use_end())
18652 return SDValue(*UI->use_begin()->use_begin(), 0);
18653 } else {
18654 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18655 GA->getOffset(), OperandFlags);
18656 }
18657
18658 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18659 : LocalDynamic ? X86ISD::TLSBASEADDR
18660 : X86ISD::TLSADDR;
18661
18662 if (InGlue) {
18663 SDValue Ops[] = { Chain, TGA, *InGlue };
18664 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18665 } else {
18666 SDValue Ops[] = { Chain, TGA };
18667 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18668 }
18669
18670 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
18671 MFI.setAdjustsStack(true);
18672 MFI.setHasCalls(true);
18673
18674 SDValue Glue = Chain.getValue(1);
18675 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18676
18677 if (!UseTLSDESC)
18678 return Ret;
18679
18680 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18681 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18682
18683 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18684 SDValue Offset =
18685 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18686 MachinePointerInfo(Ptr));
18687 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18688}
18689
18690// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18691static SDValue
18692 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18693 const EVT PtrVT) {
18694 SDValue InGlue;
18695 SDLoc dl(GA); // ? function entry point might be better
18696 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18697 DAG.getNode(X86ISD::GlobalBaseReg,
18698 SDLoc(), PtrVT), InGlue);
18699 InGlue = Chain.getValue(1);
18700
18701 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18702}
18703
18704// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18705static SDValue
18706 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18707 const EVT PtrVT) {
18708 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18709 X86::RAX, X86II::MO_TLSGD);
18710}
18711
18712// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18713static SDValue
18714 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18715 const EVT PtrVT) {
18716 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18717 X86::EAX, X86II::MO_TLSGD);
18718}
18719
18720 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18721 SelectionDAG &DAG, const EVT PtrVT,
18722 bool Is64Bit, bool Is64BitLP64) {
18723 SDLoc dl(GA);
18724
18725 // Get the start address of the TLS block for this module.
18726 X86MachineFunctionInfo *MFI =
18727 DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
18728 MFI->incNumLocalDynamicTLSAccesses();
18729
18730 SDValue Base;
18731 if (Is64Bit) {
18732 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18733 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18734 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18735 } else {
18736 SDValue InGlue;
18737 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18738 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18739 InGlue = Chain.getValue(1);
18740 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18741 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18742 }
18743
18744 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18745 // of Base.
18746
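  // @dtpoff is the variable's constant offset from the start of this module's
  // TLS block, so the final address is simply the TLSLD/TLSLDM base computed
  // above plus that link-time constant; no per-variable runtime relocation is
  // needed.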
18747 // Build x@dtpoff.
18748 unsigned char OperandFlags = X86II::MO_DTPOFF;
18749 unsigned WrapperKind = X86ISD::Wrapper;
18750 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18751 GA->getValueType(0),
18752 GA->getOffset(), OperandFlags);
18753 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18754
18755 // Add x@dtpoff with the base.
18756 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18757}
18758
18759// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18760 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18761 const EVT PtrVT, TLSModel::Model model,
18762 bool is64Bit, bool isPIC) {
18763 SDLoc dl(GA);
18764
18765 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18766 Value *Ptr = Constant::getNullValue(
18767 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18768
18769 SDValue ThreadPointer =
18770 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18771 MachinePointerInfo(Ptr));
18772
18773 unsigned char OperandFlags = 0;
18774 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18775 // initialexec.
18776 unsigned WrapperKind = X86ISD::Wrapper;
18777 if (model == TLSModel::LocalExec) {
18778 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18779 } else if (model == TLSModel::InitialExec) {
18780 if (is64Bit) {
18781 OperandFlags = X86II::MO_GOTTPOFF;
18782 WrapperKind = X86ISD::WrapperRIP;
18783 } else {
18784 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18785 }
18786 } else {
18787 llvm_unreachable("Unexpected model");
18788 }
18789
18790 // emit "addl x@ntpoff,%eax" (local exec)
18791 // or "addl x@indntpoff,%eax" (initial exec)
18792 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18793 SDValue TGA =
18794 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18795 GA->getOffset(), OperandFlags);
18796 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18797
18798 if (model == TLSModel::InitialExec) {
18799 if (isPIC && !is64Bit) {
18800 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18801 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18802 Offset);
18803 }
18804
18805 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18806 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18807 }
18808
18809 // The address of the thread local variable is the add of the thread
18810 // pointer with the offset of the variable.
18811 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18812}
18813
18814SDValue
18815X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18816
18817 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18818
18819 if (DAG.getTarget().useEmulatedTLS())
18820 return LowerToTLSEmulatedModel(GA, DAG);
18821
18822 const GlobalValue *GV = GA->getGlobal();
18823 auto PtrVT = getPointerTy(DAG.getDataLayout());
18824 bool PositionIndependent = isPositionIndependent();
18825
18826 if (Subtarget.isTargetELF()) {
18827 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18828 switch (model) {
18829 case TLSModel::GeneralDynamic:
18830 if (Subtarget.is64Bit()) {
18831 if (Subtarget.isTarget64BitLP64())
18832 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18833 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18834 }
18835 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18836 case TLSModel::LocalDynamic:
18837 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18838 Subtarget.isTarget64BitLP64());
18839 case TLSModel::InitialExec:
18840 case TLSModel::LocalExec:
18841 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18842 PositionIndependent);
18843 }
18844 llvm_unreachable("Unknown TLS model.");
18845 }
18846
18847 if (Subtarget.isTargetDarwin()) {
18848 // Darwin only has one model of TLS. Lower to that.
18849 unsigned char OpFlag = 0;
18850 unsigned WrapperKind = 0;
18851
18852 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18853 // global base reg.
18854 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18855 if (PIC32) {
18856 OpFlag = X86II::MO_TLVP_PIC_BASE;
18857 WrapperKind = X86ISD::Wrapper;
18858 } else {
18859 OpFlag = X86II::MO_TLVP;
18860 WrapperKind = X86ISD::WrapperRIP;
18861 }
18862 SDLoc DL(Op);
18863 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18864 GA->getValueType(0),
18865 GA->getOffset(), OpFlag);
18866 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18867
18868 // With PIC32, the address is actually $g + Offset.
18869 if (PIC32)
18870 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18871 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18872 Offset);
18873
18874 // Lowering the machine isd will make sure everything is in the right
18875 // location.
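  // On Darwin each __thread variable is reached through a TLV descriptor whose
  // first field points to its getter function; X86ISD::TLSCALL is lowered to an
  // indirect call through that descriptor (hence the CALLSEQ bracketing), and
  // the resulting address comes back in the normal call return register.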
18876 SDValue Chain = DAG.getEntryNode();
18877 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18878 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18879 SDValue Args[] = { Chain, Offset };
18880 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18881 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18882
18883 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
18884 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18885 MFI.setAdjustsStack(true);
18886
18887 // And our return value (tls address) is in the standard call return value
18888 // location.
18889 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18890 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18891 }
18892
18893 if (Subtarget.isOSWindows()) {
18894 // Just use the implicit TLS architecture
18895 // Need to generate something similar to:
18896 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18897 // ; from TEB
18898 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18899 // mov rcx, qword [rdx+rcx*8]
18900 // mov eax, .tls$:tlsvar
18901 // [rax+rcx] contains the address
18902 // Windows 64bit: gs:0x58
18903 // Windows 32bit: fs:__tls_array
18904
18905 SDLoc dl(GA);
18906 SDValue Chain = DAG.getEntryNode();
18907
18908 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18909 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18910 // use its literal value of 0x2C.
18911 Value *Ptr = Constant::getNullValue(
18912 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18913 : PointerType::get(*DAG.getContext(), 257));
18914
18915 SDValue TlsArray = Subtarget.is64Bit()
18916 ? DAG.getIntPtrConstant(0x58, dl)
18917 : (Subtarget.isTargetWindowsGNU()
18918 ? DAG.getIntPtrConstant(0x2C, dl)
18919 : DAG.getExternalSymbol("_tls_array", PtrVT));
18920
18921 SDValue ThreadPointer =
18922 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18923
18924 SDValue res;
18925 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18926 res = ThreadPointer;
18927 } else {
18928 // Load the _tls_index variable
18929 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18930 if (Subtarget.is64Bit())
18931 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18932 MachinePointerInfo(), MVT::i32);
18933 else
18934 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18935
18936 const DataLayout &DL = DAG.getDataLayout();
18937 SDValue Scale =
18938 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
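      // _tls_index is scaled by the pointer size (shift by 2 on 32-bit, 3 on
      // 64-bit) to index the per-module slot array that ThreadPointer refers to.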
18939 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18940
18941 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18942 }
18943
18944 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18945
18946 // Get the offset of start of .tls section
18947 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18948 GA->getValueType(0),
18949 GA->getOffset(), X86II::MO_SECREL);
18950 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18951
18952 // The address of the thread local variable is the add of the thread
18953 // pointer with the offset of the variable.
18954 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18955 }
18956
18957 llvm_unreachable("TLS not implemented for this target.");
18958}
18959
18960 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18961 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18962 const TargetMachine &TM = getTargetMachine();
18963 TLSModel::Model Model = TM.getTLSModel(&GV);
18964 switch (Model) {
18965 case TLSModel::LocalExec:
18966 case TLSModel::InitialExec:
18967 // We can include the %fs segment register in addressing modes.
18968 return true;
18969 case TLSModel::GeneralDynamic:
18970 case TLSModel::LocalDynamic:
18971 // These models do not result in %fs relative addresses unless
18972 // TLS descriptors are used.
18973 //
18974 // Even in the case of TLS descriptors we currently have no way to model
18975 // the difference between %fs access and the computations needed for the
18976 // offset, and returning `true` for TLS-desc currently duplicates both,
18977 // which is detrimental :-/
18978 return false;
18979 }
18980 }
18981 return false;
18982}
18983
18984/// Lower SRA_PARTS and friends, which return two i32 values
18985/// and take a 2 x i32 value to shift plus a shift amount.
18986/// TODO: Can this be moved to general expansion code?
18987 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18988 SDValue Lo, Hi;
18989 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18990 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18991}
18992
18993// Try to use a packed vector operation to handle i64 on 32-bit targets when
18994// AVX512DQ is enabled.
18995 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
18996 SelectionDAG &DAG,
18997 const X86Subtarget &Subtarget) {
18998 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18999 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19000 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19001 Op.getOpcode() == ISD::UINT_TO_FP) &&
19002 "Unexpected opcode!");
19003 bool IsStrict = Op->isStrictFPOpcode();
19004 unsigned OpNo = IsStrict ? 1 : 0;
19005 SDValue Src = Op.getOperand(OpNo);
19006 MVT SrcVT = Src.getSimpleValueType();
19007 MVT VT = Op.getSimpleValueType();
19008
19009 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19010 (VT != MVT::f32 && VT != MVT::f64))
19011 return SDValue();
19012
19013 // Pack the i64 into a vector, do the operation and extract.
19014
19015 // Using 256-bit to ensure result is 128-bits for f32 case.
19016 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19017 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19018 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19019
19020 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19021 if (IsStrict) {
19022 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19023 {Op.getOperand(0), InVec});
19024 SDValue Chain = CvtVec.getValue(1);
19025 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19026 DAG.getIntPtrConstant(0, dl));
19027 return DAG.getMergeValues({Value, Chain}, dl);
19028 }
19029
19030 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19031
19032 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19033 DAG.getIntPtrConstant(0, dl));
19034}
19035
19036// Try to use a packed vector operation to handle i64 on 32-bit targets.
19037 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19038 const X86Subtarget &Subtarget) {
19039 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19040 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19041 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19042 Op.getOpcode() == ISD::UINT_TO_FP) &&
19043 "Unexpected opcode!");
19044 bool IsStrict = Op->isStrictFPOpcode();
19045 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19046 MVT SrcVT = Src.getSimpleValueType();
19047 MVT VT = Op.getSimpleValueType();
19048
19049 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19050 return SDValue();
19051
19052 // Pack the i64 into a vector, do the operation and extract.
19053
19054 assert(Subtarget.hasFP16() && "Expected FP16");
19055
19056 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19057 if (IsStrict) {
19058 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19059 {Op.getOperand(0), InVec});
19060 SDValue Chain = CvtVec.getValue(1);
19061 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19062 DAG.getIntPtrConstant(0, dl));
19063 return DAG.getMergeValues({Value, Chain}, dl);
19064 }
19065
19066 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19067
19068 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19069 DAG.getIntPtrConstant(0, dl));
19070}
19071
19072static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19073 const X86Subtarget &Subtarget) {
19074 switch (Opcode) {
19075 case ISD::SINT_TO_FP:
19076 // TODO: Handle wider types with AVX/AVX512.
19077 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19078 return false;
19079 // CVTDQ2PS or (V)CVTDQ2PD
19080 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19081
19082 case ISD::UINT_TO_FP:
19083 // TODO: Handle wider types and i64 elements.
19084 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19085 return false;
19086 // VCVTUDQ2PS or VCVTUDQ2PD
19087 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19088
19089 default:
19090 return false;
19091 }
19092}
19093
19094/// Given a scalar cast operation that is extracted from a vector, try to
19095/// vectorize the cast op followed by extraction. This will avoid an expensive
19096/// round-trip between XMM and GPR.
19097 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19098 SelectionDAG &DAG,
19099 const X86Subtarget &Subtarget) {
19100 // TODO: This could be enhanced to handle smaller integer types by peeking
19101 // through an extend.
19102 SDValue Extract = Cast.getOperand(0);
19103 MVT DestVT = Cast.getSimpleValueType();
19104 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19105 !isa<ConstantSDNode>(Extract.getOperand(1)))
19106 return SDValue();
19107
19108 // See if we have a 128-bit vector cast op for this type of cast.
19109 SDValue VecOp = Extract.getOperand(0);
19110 MVT FromVT = VecOp.getSimpleValueType();
19111 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19112 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19113 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19114 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19115 return SDValue();
19116
19117 // If we are extracting from a non-zero element, first shuffle the source
19118 // vector to allow extracting from element zero.
19119 if (!isNullConstant(Extract.getOperand(1))) {
19120 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19121 Mask[0] = Extract.getConstantOperandVal(1);
19122 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19123 }
19124 // If the source vector is wider than 128-bits, extract the low part. Do not
19125 // create an unnecessarily wide vector cast op.
19126 if (FromVT != Vec128VT)
19127 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19128
19129 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19130 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19131 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19132 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19133 DAG.getIntPtrConstant(0, DL));
19134}
19135
19136/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19137/// try to vectorize the cast ops. This will avoid an expensive round-trip
19138/// between XMM and GPR.
19139static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19140 SelectionDAG &DAG,
19141 const X86Subtarget &Subtarget) {
19142 // TODO: Allow FP_TO_UINT.
19143 SDValue CastToInt = CastToFP.getOperand(0);
19144 MVT VT = CastToFP.getSimpleValueType();
19145 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19146 return SDValue();
19147
19148 MVT IntVT = CastToInt.getSimpleValueType();
19149 SDValue X = CastToInt.getOperand(0);
19150 MVT SrcVT = X.getSimpleValueType();
19151 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19152 return SDValue();
19153
19154 // See if we have 128-bit vector cast instructions for this type of cast.
19155 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19156 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19157 IntVT != MVT::i32)
19158 return SDValue();
19159
19160 unsigned SrcSize = SrcVT.getSizeInBits();
19161 unsigned IntSize = IntVT.getSizeInBits();
19162 unsigned VTSize = VT.getSizeInBits();
19163 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19164 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19165 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19166
19167 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19168 unsigned ToIntOpcode =
19169 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19170 unsigned ToFPOpcode =
19171 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19172
19173 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19174 //
19175 // We are not defining the high elements (for example, zero them) because
19176 // that could nullify any performance advantage that we hoped to gain from
19177 // this vector op hack. We do not expect any adverse effects (like denorm
19178 // penalties) with cast ops.
19179 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19180 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19181 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19182 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19183 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19184}
19185
19186 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19187 SelectionDAG &DAG,
19188 const X86Subtarget &Subtarget) {
19189 bool IsStrict = Op->isStrictFPOpcode();
19190 MVT VT = Op->getSimpleValueType(0);
19191 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19192
19193 if (Subtarget.hasDQI()) {
19194 assert(!Subtarget.hasVLX() && "Unexpected features");
19195
19196 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19197 Src.getSimpleValueType() == MVT::v4i64) &&
19198 "Unsupported custom type");
19199
19200 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19201 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19202 "Unexpected VT!");
19203 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19204
19205 // Need to concat with zero vector for strict fp to avoid spurious
19206 // exceptions.
19207 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19208 : DAG.getUNDEF(MVT::v8i64);
19209 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19210 DAG.getIntPtrConstant(0, DL));
19211 SDValue Res, Chain;
19212 if (IsStrict) {
19213 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19214 {Op->getOperand(0), Src});
19215 Chain = Res.getValue(1);
19216 } else {
19217 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19218 }
19219
19220 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19221 DAG.getIntPtrConstant(0, DL));
19222
19223 if (IsStrict)
19224 return DAG.getMergeValues({Res, Chain}, DL);
19225 return Res;
19226 }
19227
19228 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19229 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19230 if (VT != MVT::v4f32 || IsSigned)
19231 return SDValue();
19232
19233 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19234 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19235 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19236 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19237 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19238 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19239 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
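  // For lanes whose sign bit is set (too large for a signed convert), convert
  // Src/2 instead, with the shifted-out low bit OR'ed back in so rounding still
  // sees it (a "sticky" bit), then double the result with the FADD below.
  // Lanes with the sign bit clear are converted directly.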
19240 SmallVector<SDValue, 4> SignCvts(4);
19241 SmallVector<SDValue, 4> Chains(4);
19242 for (int i = 0; i != 4; ++i) {
19243 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19244 DAG.getIntPtrConstant(i, DL));
19245 if (IsStrict) {
19246 SignCvts[i] =
19247 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19248 {Op.getOperand(0), Elt});
19249 Chains[i] = SignCvts[i].getValue(1);
19250 } else {
19251 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19252 }
19253 }
19254 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19255
19256 SDValue Slow, Chain;
19257 if (IsStrict) {
19258 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19259 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19260 {Chain, SignCvt, SignCvt});
19261 Chain = Slow.getValue(1);
19262 } else {
19263 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19264 }
19265
19266 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19267 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19268
19269 if (IsStrict)
19270 return DAG.getMergeValues({Cvt, Chain}, DL);
19271
19272 return Cvt;
19273}
19274
19275 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19276 SelectionDAG &DAG) {
19277 bool IsStrict = Op->isStrictFPOpcode();
19278 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19279 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19280 MVT VT = Op.getSimpleValueType();
19281 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19282
19283 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19284 if (IsStrict)
19285 return DAG.getNode(
19286 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19287 {Chain,
19288 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19289 Rnd});
19290 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19291 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19292}
19293
19294static bool isLegalConversion(MVT VT, bool IsSigned,
19295 const X86Subtarget &Subtarget) {
19296 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19297 return true;
19298 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19299 return true;
19300 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19301 return true;
19302 if (Subtarget.useAVX512Regs()) {
19303 if (VT == MVT::v16i32)
19304 return true;
19305 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19306 return true;
19307 }
19308 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19309 (VT == MVT::v2i64 || VT == MVT::v4i64))
19310 return true;
19311 return false;
19312}
19313
19314SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19315 SelectionDAG &DAG) const {
19316 bool IsStrict = Op->isStrictFPOpcode();
19317 unsigned OpNo = IsStrict ? 1 : 0;
19318 SDValue Src = Op.getOperand(OpNo);
19319 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19320 MVT SrcVT = Src.getSimpleValueType();
19321 MVT VT = Op.getSimpleValueType();
19322 SDLoc dl(Op);
19323
19324 if (isSoftF16(VT, Subtarget))
19325 return promoteXINT_TO_FP(Op, dl, DAG);
19326 else if (isLegalConversion(SrcVT, true, Subtarget))
19327 return Op;
19328
19329 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19330 return LowerWin64_INT128_TO_FP(Op, DAG);
19331
19332 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19333 return Extract;
19334
19335 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19336 return R;
19337
19338 if (SrcVT.isVector()) {
19339 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19340 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19341 // source for strict FP.
19342 if (IsStrict)
19343 return DAG.getNode(
19344 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19345 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19346 DAG.getUNDEF(SrcVT))});
19347 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19348 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19349 DAG.getUNDEF(SrcVT)));
19350 }
19351 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19352 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19353
19354 return SDValue();
19355 }
19356
19357 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19358 "Unknown SINT_TO_FP to lower!");
19359
19360 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19361
19362 // These are really Legal; return the operand so the caller accepts it as
19363 // Legal.
19364 if (SrcVT == MVT::i32 && UseSSEReg)
19365 return Op;
19366 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19367 return Op;
19368
19369 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19370 return V;
19371 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19372 return V;
19373
19374 // SSE doesn't have an i16 conversion so we need to promote.
19375 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19376 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19377 if (IsStrict)
19378 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19379 {Chain, Ext});
19380
19381 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19382 }
19383
19384 if (VT == MVT::f128 || !Subtarget.hasX87())
19385 return SDValue();
19386
19387 SDValue ValueToStore = Src;
19388 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19389 // Bitcasting to f64 here allows us to do a single 64-bit store from
19390 // an SSE register, avoiding the store forwarding penalty that would come
19391 // with two 32-bit stores.
19392 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19393
19394 unsigned Size = SrcVT.getStoreSize();
19395 Align Alignment(Size);
19396 MachineFunction &MF = DAG.getMachineFunction();
19397 auto PtrVT = getPointerTy(MF.getDataLayout());
19398 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19399 MachinePointerInfo MPI =
19400 MachinePointerInfo::getFixedStack(MF, SSFI);
19401 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19402 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19403 std::pair<SDValue, SDValue> Tmp =
19404 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19405
19406 if (IsStrict)
19407 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19408
19409 return Tmp.first;
19410}
19411
19412std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19413 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19414 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19415 // Build the FILD
19416 SDVTList Tys;
19417 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19418 if (useSSE)
19419 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19420 else
19421 Tys = DAG.getVTList(DstVT, MVT::Other);
19422
19423 SDValue FILDOps[] = {Chain, Pointer};
19424 SDValue Result =
19425 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19426 Alignment, MachineMemOperand::MOLoad);
19427 Chain = Result.getValue(1);
19428
19429 if (useSSE) {
19430 MachineFunction &MF = DAG.getMachineFunction();
19431 unsigned SSFISize = DstVT.getStoreSize();
19432 int SSFI =
19433 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19434 auto PtrVT = getPointerTy(MF.getDataLayout());
19435 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19436 Tys = DAG.getVTList(MVT::Other);
19437 SDValue FSTOps[] = {Chain, Result, StackSlot};
19438 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
19439 MachinePointerInfo::getFixedStack(MF, SSFI),
19440 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19441
19442 Chain =
19443 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19444 Result = DAG.getLoad(
19445 DstVT, DL, Chain, StackSlot,
19446 MachinePointerInfo::getFixedStack(MF, SSFI));
19447 Chain = Result.getValue(1);
19448 }
19449
19450 return { Result, Chain };
19451}
19452
19453/// Horizontal vector math instructions may be slower than normal math with
19454/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19455/// implementation, and likely shuffle complexity of the alternate sequence.
19456static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19457 const X86Subtarget &Subtarget) {
19458 bool IsOptimizingSize = DAG.shouldOptForSize();
19459 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19460 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19461}
19462
19463/// 64-bit unsigned integer to double expansion.
19464 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19465 SelectionDAG &DAG,
19466 const X86Subtarget &Subtarget) {
19467 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19468 // when converting 0 when rounding toward negative infinity. The caller will
19469 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19470 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19471 // This algorithm is not obvious. Here is what we're trying to output:
19472 /*
19473 movq %rax, %xmm0
19474 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19475 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19476 #ifdef __SSE3__
19477 haddpd %xmm0, %xmm0
19478 #else
19479 pshufd $0x4e, %xmm0, %xmm1
19480 addpd %xmm1, %xmm0
19481 #endif
19482 */
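  // The punpckldq builds two doubles whose bit patterns encode 2^52 + lo32 and
  // 2^84 + hi32*2^32 (0x43300000 and 0x45300000 are the corresponding high
  // words). Subtracting {2^52, 2^84} leaves exactly {lo32, hi32*2^32}, and the
  // horizontal add reconstructs lo32 + hi32*2^32, i.e. the original u64.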
19483
19484 LLVMContext *Context = DAG.getContext();
19485
19486 // Build some magic constants.
19487 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19488 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19489 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19490 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19491
19492 SmallVector<Constant*, 2> CV1;
19493 CV1.push_back(
19494 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19495 APInt(64, 0x4330000000000000ULL))));
19496 CV1.push_back(
19497 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19498 APInt(64, 0x4530000000000000ULL))));
19499 Constant *C1 = ConstantVector::get(CV1);
19500 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19501
19502 // Load the 64-bit value into an XMM register.
19503 SDValue XR1 =
19504 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19505 SDValue CLod0 = DAG.getLoad(
19506 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19507 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19508 SDValue Unpck1 =
19509 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19510
19511 SDValue CLod1 = DAG.getLoad(
19512 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19513 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19514 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19515 // TODO: Are there any fast-math-flags to propagate here?
19516 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19517 SDValue Result;
19518
19519 if (Subtarget.hasSSE3() &&
19520 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19521 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19522 } else {
19523 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19524 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19525 }
19526 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19527 DAG.getIntPtrConstant(0, dl));
19528 return Result;
19529}
19530
19531/// 32-bit unsigned integer to float expansion.
19532 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19533 SelectionDAG &DAG,
19534 const X86Subtarget &Subtarget) {
19535 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19536 // FP constant to bias correct the final result.
19537 SDValue Bias = DAG.getConstantFP(
19538 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
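  // 0x4330000000000000 is the bit pattern of 2^52. OR'ing the zero-extended
  // i32 into the low mantissa bits of that double yields exactly 2^52 + x, so
  // subtracting the bias below recovers x with no rounding.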
19539
19540 // Load the 32-bit value into an XMM register.
19541 SDValue Load =
19542 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19543
19544 // Zero out the upper parts of the register.
19545 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19546
19547 // Or the load with the bias.
19548 SDValue Or = DAG.getNode(
19549 ISD::OR, dl, MVT::v2i64,
19550 DAG.getBitcast(MVT::v2i64, Load),
19551 DAG.getBitcast(MVT::v2i64,
19552 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19553 Or =
19554 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19555 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19556
19557 if (Op.getNode()->isStrictFPOpcode()) {
19558 // Subtract the bias.
19559 // TODO: Are there any fast-math-flags to propagate here?
19560 SDValue Chain = Op.getOperand(0);
19561 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19562 {Chain, Or, Bias});
19563
19564 if (Op.getValueType() == Sub.getValueType())
19565 return Sub;
19566
19567 // Handle final rounding.
19568 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19569 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19570
19571 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19572 }
19573
19574 // Subtract the bias.
19575 // TODO: Are there any fast-math-flags to propagate here?
19576 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19577
19578 // Handle final rounding.
19579 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19580}
19581
19582 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19583 SelectionDAG &DAG,
19584 const X86Subtarget &Subtarget) {
19585 if (Op.getSimpleValueType() != MVT::v2f64)
19586 return SDValue();
19587
19588 bool IsStrict = Op->isStrictFPOpcode();
19589
19590 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19591 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19592
19593 if (Subtarget.hasAVX512()) {
19594 if (!Subtarget.hasVLX()) {
19595 // Let generic type legalization widen this.
19596 if (!IsStrict)
19597 return SDValue();
19598 // Otherwise pad the integer input with 0s and widen the operation.
19599 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19600 DAG.getConstant(0, DL, MVT::v2i32));
19601 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19602 {Op.getOperand(0), N0});
19603 SDValue Chain = Res.getValue(1);
19604 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19605 DAG.getIntPtrConstant(0, DL));
19606 return DAG.getMergeValues({Res, Chain}, DL);
19607 }
19608
19609 // Legalize to v4i32 type.
19610 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19611 DAG.getUNDEF(MVT::v2i32));
19612 if (IsStrict)
19613 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19614 {Op.getOperand(0), N0});
19615 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19616 }
19617
19618 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19619 // This gives us the floating point equivalent of 2^52 + the i32 integer
19620 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19621 // point leaving just our i32 integers in double format.
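  // E.g. for x = 7 the OR produces the double 2^52 + 7, and the FSUB below
  // yields exactly 7.0; every u32 fits in the 52-bit mantissa, so the result is
  // always exact.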
19622 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19623 SDValue VBias = DAG.getConstantFP(
19624 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19625 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19626 DAG.getBitcast(MVT::v2i64, VBias));
19627 Or = DAG.getBitcast(MVT::v2f64, Or);
19628
19629 if (IsStrict)
19630 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19631 {Op.getOperand(0), Or, VBias});
19632 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19633}
19634
19635 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19636 SelectionDAG &DAG,
19637 const X86Subtarget &Subtarget) {
19638 bool IsStrict = Op->isStrictFPOpcode();
19639 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19640 MVT VecIntVT = V.getSimpleValueType();
19641 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19642 "Unsupported custom type");
19643
19644 if (Subtarget.hasAVX512()) {
19645 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19646 assert(!Subtarget.hasVLX() && "Unexpected features");
19647 MVT VT = Op->getSimpleValueType(0);
19648
19649 // v8i32->v8f64 is legal with AVX512 so just return it.
19650 if (VT == MVT::v8f64)
19651 return Op;
19652
19653 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19654 "Unexpected VT!");
19655 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19656 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19657 // Need to concat with zero vector for strict fp to avoid spurious
19658 // exceptions.
19659 SDValue Tmp =
19660 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19661 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19662 DAG.getIntPtrConstant(0, DL));
19663 SDValue Res, Chain;
19664 if (IsStrict) {
19665 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19666 {Op->getOperand(0), V});
19667 Chain = Res.getValue(1);
19668 } else {
19669 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19670 }
19671
19672 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19673 DAG.getIntPtrConstant(0, DL));
19674
19675 if (IsStrict)
19676 return DAG.getMergeValues({Res, Chain}, DL);
19677 return Res;
19678 }
19679
19680 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19681 Op->getSimpleValueType(0) == MVT::v4f64) {
19682 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19683 Constant *Bias = ConstantFP::get(
19684 *DAG.getContext(),
19685 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19686 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19687 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19688 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19689 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19690 SDValue VBias = DAG.getMemIntrinsicNode(
19691 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19692 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
19693 Align(8), MachineMemOperand::MOLoad);
19694
19695 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19696 DAG.getBitcast(MVT::v4i64, VBias));
19697 Or = DAG.getBitcast(MVT::v4f64, Or);
19698
19699 if (IsStrict)
19700 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19701 {Op.getOperand(0), Or, VBias});
19702 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19703 }
19704
19705 // The algorithm is the following:
19706 // #ifdef __SSE4_1__
19707 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19708 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19709 // (uint4) 0x53000000, 0xaa);
19710 // #else
19711 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19712 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19713 // #endif
19714 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19715 // return (float4) lo + fhi;
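  // Why this works: lo reinterprets as 2^23 + (v & 0xffff) and hi as
  // 2^39 + (v >> 16) * 2^16, both exact, so hi - (2^39 + 2^23) + lo == v; the
  // only rounding happens in the final add, matching a direct u32 -> f32
  // conversion.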
19716
19717 bool Is128 = VecIntVT == MVT::v4i32;
19718 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19719 // If we convert to something other than the supported type, e.g., to v4f64,
19720 // abort early.
19721 if (VecFloatVT != Op->getSimpleValueType(0))
19722 return SDValue();
19723
19724 // In the #ifdef/#else code, we have in common:
19725 // - The vector of constants:
19726 // -- 0x4b000000
19727 // -- 0x53000000
19728 // - A shift:
19729 // -- v >> 16
19730
19731 // Create the splat vector for 0x4b000000.
19732 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19733 // Create the splat vector for 0x53000000.
19734 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19735
19736 // Create the right shift.
19737 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19738 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19739
19740 SDValue Low, High;
19741 if (Subtarget.hasSSE41()) {
19742 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19743 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19744 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19745 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19746 // Low will be bitcasted right away, so do not bother bitcasting back to its
19747 // original type.
19748 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19749 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19750 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19751 // (uint4) 0x53000000, 0xaa);
19752 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19753 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19754 // High will be bitcasted right away, so do not bother bitcasting back to
19755 // its original type.
19756 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19757 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19758 } else {
19759 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19760 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19761 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19762 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19763
19764 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19765 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19766 }
19767
19768 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19769 SDValue VecCstFSub = DAG.getConstantFP(
19770 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19771
19772 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19773 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19774 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19775 // enabled. See PR24512.
19776 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19777 // TODO: Are there any fast-math-flags to propagate here?
19778 // (float4) lo;
19779 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19780 // return (float4) lo + fhi;
19781 if (IsStrict) {
19782 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19783 {Op.getOperand(0), HighBitcast, VecCstFSub});
19784 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19785 {FHigh.getValue(1), LowBitcast, FHigh});
19786 }
19787
19788 SDValue FHigh =
19789 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19790 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19791}
19792
19793 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19794 const X86Subtarget &Subtarget) {
19795 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19796 SDValue N0 = Op.getOperand(OpNo);
19797 MVT SrcVT = N0.getSimpleValueType();
19798
19799 switch (SrcVT.SimpleTy) {
19800 default:
19801 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19802 case MVT::v2i32:
19803 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19804 case MVT::v4i32:
19805 case MVT::v8i32:
19806 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19807 case MVT::v2i64:
19808 case MVT::v4i64:
19809 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19810 }
19811}
19812
19813SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19814 SelectionDAG &DAG) const {
19815 bool IsStrict = Op->isStrictFPOpcode();
19816 unsigned OpNo = IsStrict ? 1 : 0;
19817 SDValue Src = Op.getOperand(OpNo);
19818 SDLoc dl(Op);
19819 auto PtrVT = getPointerTy(DAG.getDataLayout());
19820 MVT SrcVT = Src.getSimpleValueType();
19821 MVT DstVT = Op->getSimpleValueType(0);
19822 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19823
19824 // Bail out when we don't have native conversion instructions.
19825 if (DstVT == MVT::f128)
19826 return SDValue();
19827
19828 if (isSoftF16(DstVT, Subtarget))
19829 return promoteXINT_TO_FP(Op, dl, DAG);
19830 else if (isLegalConversion(SrcVT, false, Subtarget))
19831 return Op;
19832
19833 if (DstVT.isVector())
19834 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19835
19836 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19837 return LowerWin64_INT128_TO_FP(Op, DAG);
19838
19839 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19840 return Extract;
19841
19842 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19843 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19844 // Conversions from unsigned i32 to f32/f64 are legal,
19845 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19846 return Op;
19847 }
19848
19849 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19850 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19851 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19852 if (IsStrict)
19853 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19854 {Chain, Src});
19855 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19856 }
19857
19858 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19859 return V;
19860 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19861 return V;
19862
19863 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19864 // infinity. It produces -0.0, so disable under strictfp.
19865 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19866 !IsStrict)
19867 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19868 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19869 // negative infinity. So disable it under strictfp and use FILD instead.
19870 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19871 !IsStrict)
19872 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19873 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19874 (DstVT == MVT::f32 || DstVT == MVT::f64))
19875 return SDValue();
19876
19877 // Make a 64-bit buffer, and use it to build an FILD.
19878 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19879 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19880 Align SlotAlign(8);
19881 MachinePointerInfo MPI =
19882 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19883 if (SrcVT == MVT::i32) {
19884 SDValue OffsetSlot =
19885 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19886 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19887 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19888 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19889 std::pair<SDValue, SDValue> Tmp =
19890 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19891 if (IsStrict)
19892 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19893
19894 return Tmp.first;
19895 }
19896
19897 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19898 SDValue ValueToStore = Src;
19899 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19900 // Bitcasting to f64 here allows us to do a single 64-bit store from
19901 // an SSE register, avoiding the store forwarding penalty that would come
19902 // with two 32-bit stores.
19903 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19904 }
19905 SDValue Store =
19906 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19907 // For i64 source, we need to add the appropriate power of 2 if the input
19908 // was negative. We must be careful to do the computation in x87 extended
19909 // precision, not in SSE.
19910 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19911 SDValue Ops[] = {Store, StackSlot};
19912 SDValue Fild =
19913 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19914 SlotAlign, MachineMemOperand::MOLoad);
19915 Chain = Fild.getValue(1);
19916
19917 // Check whether the sign bit is set.
19918 SDValue SignSet = DAG.getSetCC(
19919 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19920 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19921
19922 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19923 APInt FF(64, 0x5F80000000000000ULL);
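  // 0x5F800000 is the IEEE-754 single-precision bit pattern of 2^64. When the
  // input's sign bit was set, the select below picks the offset that loads this
  // value (otherwise 0.0f is loaded), compensating for FILD having interpreted
  // the u64 as a signed value.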
19924 SDValue FudgePtr =
19925 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19926 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19927
19928 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19929 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19930 SDValue Four = DAG.getIntPtrConstant(4, dl);
19931 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19932 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19933
19934 // Load the value out, extending it from f32 to f80.
19935 SDValue Fudge = DAG.getExtLoad(
19936 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19937 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
19938 CPAlignment);
19939 Chain = Fudge.getValue(1);
19940 // Extend everything to 80 bits to force it to be done on x87.
19941 // TODO: Are there any fast-math-flags to propagate here?
19942 if (IsStrict) {
19943 unsigned Opc = ISD::STRICT_FADD;
19944 // Windows needs the precision control changed to 80bits around this add.
19945 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19946 Opc = X86ISD::STRICT_FP80_ADD;
19947
19948 SDValue Add =
19949 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19950 // STRICT_FP_ROUND can't handle equal types.
19951 if (DstVT == MVT::f80)
19952 return Add;
19953 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19954 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19955 }
19956 unsigned Opc = ISD::FADD;
19957 // Windows needs the precision control changed to 80bits around this add.
19958 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19959 Opc = X86ISD::FP80_ADD;
19960
19961 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19962 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19963 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19964}
19965
19966// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19967// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19968// just return an SDValue().
19969// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19970// to i16, i32 or i64, and we lower it to a legal sequence and return the
19971// result.
19972SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19973 bool IsSigned,
19974 SDValue &Chain) const {
19975 bool IsStrict = Op->isStrictFPOpcode();
19976 SDLoc DL(Op);
19977
19978 EVT DstTy = Op.getValueType();
19979 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19980 EVT TheVT = Value.getValueType();
19981 auto PtrVT = getPointerTy(DAG.getDataLayout());
19982
19983 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19984 // f16 must be promoted before using the lowering in this routine.
19985 // fp128 does not use this lowering.
19986 return SDValue();
19987 }
19988
19989 // If using FIST to compute an unsigned i64, we'll need some fixup
19990 // to handle values above the maximum signed i64. A FIST is always
19991 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19992 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19993
19994 // FIXME: This does not generate an invalid exception if the input does not
19995 // fit in i32. PR44019
19996 if (!IsSigned && DstTy != MVT::i64) {
19997 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19998 // The low 32 bits of the fist result will have the correct uint32 result.
19999 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20000 DstTy = MVT::i64;
20001 }
20002
20003 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20004 DstTy.getSimpleVT() >= MVT::i16 &&
20005 "Unknown FP_TO_INT to lower!");
20006
20007 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20008 // stack slot.
20009 MachineFunction &MF = DAG.getMachineFunction();
20010 unsigned MemSize = DstTy.getStoreSize();
20011 int SSFI =
20012 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20013 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20014
20015 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20016
20017 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20018
20019 if (UnsignedFixup) {
20020 //
20021 // Conversion to unsigned i64 is implemented with a select,
20022 // depending on whether the source value fits in the range
20023 // of a signed i64. Let Thresh be the FP equivalent of
20024 // 0x8000000000000000ULL.
20025 //
20026 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20027 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
20028 // FistSrc = (Value - FltOfs);
20029 // Fist-to-mem64 FistSrc
20030 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20031 // to XOR'ing the high 32 bits with Adjust.
20032 //
20033 // Being a power of 2, Thresh is exactly representable in all FP formats.
20034 // For X87 we'd like to use the smallest FP type for this constant, but
20035 // for DAG type consistency we have to match the FP operand type.
20036
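    // Thresh is 2^63 (0x5f000000 is its f32 bit pattern): values at or above it
    // cannot be converted by a signed FIST directly, so 2^63 is subtracted
    // first and the result's high bit is flipped back in afterwards via Adjust.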
20037 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20038 APFloat::opStatus Status = APFloat::opOK;
20039 bool LosesInfo = false;
20040 if (TheVT == MVT::f64)
20041 // The rounding mode is irrelevant as the conversion should be exact.
20042 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20043 &LosesInfo);
20044 else if (TheVT == MVT::f80)
20045 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20046 APFloat::rmNearestTiesToEven, &LosesInfo);
20047
20048 assert(Status == APFloat::opOK && !LosesInfo &&
20049 "FP conversion should have been exact");
20050
20051 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20052
20053 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20054 *DAG.getContext(), TheVT);
20055 SDValue Cmp;
20056 if (IsStrict) {
20057 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20058 /*IsSignaling*/ true);
20059 Chain = Cmp.getValue(1);
20060 } else {
20061 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20062 }
20063
20064 // Our preferred lowering of
20065 //
20066 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20067 //
20068 // is
20069 //
20070 // (Value >= Thresh) << 63
20071 //
20072 // but since we can get here after LegalOperations, DAGCombine might do the
20073 // wrong thing if we create a select. So, directly create the preferred
20074 // version.
20075 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20076 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20077 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20078
20079 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20080 DAG.getConstantFP(0.0, DL, TheVT));
20081
20082 if (IsStrict) {
20083 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20084 { Chain, Value, FltOfs });
20085 Chain = Value.getValue(1);
20086 } else
20087 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20088 }
20089
20090 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20091
20092 // FIXME This causes a redundant load/store if the SSE-class value is already
20093 // in memory, such as if it is on the callstack.
20094 if (isScalarFPTypeInSSEReg(TheVT)) {
20095 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20096 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20097 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20098 SDValue Ops[] = { Chain, StackSlot };
20099
20100 unsigned FLDSize = TheVT.getStoreSize();
20101 assert(FLDSize <= MemSize && "Stack slot not big enough");
20102 MachineMemOperand *MMO = MF.getMachineMemOperand(
20103 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20104 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20105 Chain = Value.getValue(1);
20106 }
20107
20108 // Build the FP_TO_INT*_IN_MEM
20109 MachineMemOperand *MMO = MF.getMachineMemOperand(
20110 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20111 SDValue Ops[] = { Chain, Value, StackSlot };
20112 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20113 DAG.getVTList(MVT::Other),
20114 Ops, DstTy, MMO);
20115
20116 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20117 Chain = Res.getValue(1);
20118
20119 // If we need an unsigned fixup, XOR the result with adjust.
20120 if (UnsignedFixup)
20121 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20122
20123 return Res;
20124}
20125
20126 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20127                               const X86Subtarget &Subtarget) {
20128 MVT VT = Op.getSimpleValueType();
20129 SDValue In = Op.getOperand(0);
20130 MVT InVT = In.getSimpleValueType();
20131 SDLoc dl(Op);
20132 unsigned Opc = Op.getOpcode();
20133
20134 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20135 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20136 "Unexpected extension opcode");
20137   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20138          "Expected same number of elements");
20139 assert((VT.getVectorElementType() == MVT::i16 ||
20140 VT.getVectorElementType() == MVT::i32 ||
20141 VT.getVectorElementType() == MVT::i64) &&
20142 "Unexpected element type");
20143 assert((InVT.getVectorElementType() == MVT::i8 ||
20144 InVT.getVectorElementType() == MVT::i16 ||
20145 InVT.getVectorElementType() == MVT::i32) &&
20146 "Unexpected element type");
20147
20148 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20149
20150 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20151 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20152 return splitVectorIntUnary(Op, DAG, dl);
20153 }
20154
20155 if (Subtarget.hasInt256())
20156 return Op;
20157
20158 // Optimize vectors in AVX mode:
20159 //
20160 // v8i16 -> v8i32
20161 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20162 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20163 // Concat upper and lower parts.
20164 //
20165 // v4i32 -> v4i64
20166 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20167 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20168 // Concat upper and lower parts.
20169 //
20170 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20171 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20172
20173 // Short-circuit if we can determine that each 128-bit half is the same value.
20174 // Otherwise, this is difficult to match and optimize.
20175 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20176 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20177 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20178
20179 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20180 SDValue Undef = DAG.getUNDEF(InVT);
20181 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20182 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20183 OpHi = DAG.getBitcast(HalfVT, OpHi);
20184
20185 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20186}
20187
20188// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20189static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20190 const SDLoc &dl, SelectionDAG &DAG) {
20191 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20192 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20193 DAG.getIntPtrConstant(0, dl));
20194 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20195 DAG.getIntPtrConstant(8, dl));
20196 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20197 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20198 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20199 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20200}
20201
20202 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20203                                      const X86Subtarget &Subtarget,
20204 SelectionDAG &DAG) {
20205 MVT VT = Op->getSimpleValueType(0);
20206 SDValue In = Op->getOperand(0);
20207 MVT InVT = In.getSimpleValueType();
20208 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20209 SDLoc DL(Op);
20210 unsigned NumElts = VT.getVectorNumElements();
20211
20212   // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20213 // avoids a constant pool load.
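  // For example, v8i1 -> v8i32: sign_extend yields 0 or -1 (all ones) in each
  // lane, and the logical shift right by 31 turns that into the desired 0 or 1
  // without loading a mask constant.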
20214 if (VT.getVectorElementType() != MVT::i8) {
20215 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20216 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20217 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20218 }
20219
20220 // Extend VT if BWI is not supported.
20221 MVT ExtVT = VT;
20222 if (!Subtarget.hasBWI()) {
20223 // If v16i32 is to be avoided, we'll need to split and concatenate.
20224 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20225 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20226
20227 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20228 }
20229
20230 // Widen to 512-bits if VLX is not supported.
20231 MVT WideVT = ExtVT;
20232 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20233 NumElts *= 512 / ExtVT.getSizeInBits();
20234 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20235 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20236 In, DAG.getIntPtrConstant(0, DL));
20237 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20238 NumElts);
20239 }
20240
20241 SDValue One = DAG.getConstant(1, DL, WideVT);
20242 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20243
20244 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20245
20246 // Truncate if we had to extend above.
20247 if (VT != ExtVT) {
20248 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20249 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20250 }
20251
20252 // Extract back to 128/256-bit if we widened.
20253 if (WideVT != VT)
20254 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20255 DAG.getIntPtrConstant(0, DL));
20256
20257 return SelectedVal;
20258}
20259
20260 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20261                                 SelectionDAG &DAG) {
20262 SDValue In = Op.getOperand(0);
20263 MVT SVT = In.getSimpleValueType();
20264
20265 if (SVT.getVectorElementType() == MVT::i1)
20266 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20267
20268 assert(Subtarget.hasAVX() && "Expected AVX support");
20269 return LowerAVXExtend(Op, DAG, Subtarget);
20270}
20271
20272/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20273/// It makes use of the fact that vectors with enough leading sign/zero bits
20274/// prevent the PACKSS/PACKUS from saturating the results.
20275/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20276/// within each 128-bit lane.
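/// For example, PACKSSDW packs two vXi32 inputs into one vXi16 result with
/// signed saturation; if every source element already has at least 17 sign
/// bits (i.e. fits in i16), no saturation can occur and the PACK behaves as a
/// plain per-element truncation.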
20277static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20278 const SDLoc &DL, SelectionDAG &DAG,
20279 const X86Subtarget &Subtarget) {
20280 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20281 "Unexpected PACK opcode");
20282 assert(DstVT.isVector() && "VT not a vector?");
20283
20284 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20285 if (!Subtarget.hasSSE2())
20286 return SDValue();
20287
20288 EVT SrcVT = In.getValueType();
20289
20290 // No truncation required, we might get here due to recursive calls.
20291 if (SrcVT == DstVT)
20292 return In;
20293
20294 unsigned NumElems = SrcVT.getVectorNumElements();
20295   if (NumElems < 2 || !isPowerOf2_32(NumElems))
20296 return SDValue();
20297
20298 unsigned DstSizeInBits = DstVT.getSizeInBits();
20299 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20300 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20301 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20302
20303 LLVMContext &Ctx = *DAG.getContext();
20304 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20305 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20306
20307 // Pack to the largest type possible:
20308 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20309 EVT InVT = MVT::i16, OutVT = MVT::i8;
20310 if (SrcVT.getScalarSizeInBits() > 16 &&
20311 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20312 InVT = MVT::i32;
20313 OutVT = MVT::i16;
20314 }
20315
20316 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20317 // On pre-AVX512, pack the src in both halves to help value tracking.
20318 if (SrcSizeInBits <= 128) {
20319 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20320 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20321 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20322 SDValue LHS = DAG.getBitcast(InVT, In);
20323 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20324 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20325 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20326 Res = DAG.getBitcast(PackedVT, Res);
20327 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20328 }
20329
20330 // Split lower/upper subvectors.
20331 SDValue Lo, Hi;
20332 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20333
20334 // If Hi is undef, then don't bother packing it and widen the result instead.
20335 if (Hi.isUndef()) {
20336 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20337 if (SDValue Res =
20338 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20339 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20340 }
20341
20342 unsigned SubSizeInBits = SrcSizeInBits / 2;
20343 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20344 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20345
20346 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20347 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20348 Lo = DAG.getBitcast(InVT, Lo);
20349 Hi = DAG.getBitcast(InVT, Hi);
20350 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20351 return DAG.getBitcast(DstVT, Res);
20352 }
20353
20354 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20355 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20356 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20357 Lo = DAG.getBitcast(InVT, Lo);
20358 Hi = DAG.getBitcast(InVT, Hi);
20359 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20360
20361 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20362 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20363 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
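    // e.g. for OutVT == v16i16, Scale == 4 and { 0, 2, 1, 3 } becomes
    // { 0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15 }.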
20364     SmallVector<int, 64> Mask;
20365     int Scale = 64 / OutVT.getScalarSizeInBits();
20366 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20367 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20368
20369 if (DstVT.is256BitVector())
20370 return DAG.getBitcast(DstVT, Res);
20371
20372 // If 512bit -> 128bit truncate another stage.
20373 Res = DAG.getBitcast(PackedVT, Res);
20374 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20375 }
20376
20377 // Recursively pack lower/upper subvectors, concat result and pack again.
20378 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20379
20380 if (PackedVT.is128BitVector()) {
20381 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20382 // type legalization.
20383 SDValue Res =
20384 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20385 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20386 }
20387
20388 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20389 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20390 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20391 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20392 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20393}
20394
20395/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20396/// e.g. trunc <8 x i32> X to <8 x i16> -->
20397/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20398/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20399 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20400                                         const X86Subtarget &Subtarget,
20401 SelectionDAG &DAG) {
20402 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20403 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20404}
20405
20406/// Truncate using inreg sign extension and X86ISD::PACKSS.
20407 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20408                                         const X86Subtarget &Subtarget,
20409 SelectionDAG &DAG) {
20410 EVT SrcVT = In.getValueType();
20411 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20412 DAG.getValueType(DstVT));
20413 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20414}
20415
20416/// Helper to determine if \p In truncated to \p DstVT has the necessary
20417/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20418/// possibly by converting a SRL node to SRA for sign extension.
20419static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20420 SDValue In, const SDLoc &DL,
20421 SelectionDAG &DAG,
20422 const X86Subtarget &Subtarget) {
20423 // Requires SSE2.
20424 if (!Subtarget.hasSSE2())
20425 return SDValue();
20426
20427 EVT SrcVT = In.getValueType();
20428 EVT DstSVT = DstVT.getVectorElementType();
20429 EVT SrcSVT = SrcVT.getVectorElementType();
20430 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20431 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20432
20433 // Check we have a truncation suited for PACKSS/PACKUS.
20434 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20435 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20436 return SDValue();
20437
20438 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20439 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20440
20441 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20442 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20443 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20444 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20445 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20446 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20447 return SDValue();
20448
20449 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20450 // split this for packing.
20451 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20452 !isFreeToSplitVector(In.getNode(), DAG) &&
20453 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20454 return SDValue();
20455
20456   // Don't truncate on AVX512 targets using multiple PACK node stages.
20457 if (Subtarget.hasAVX512() && NumStages > 1)
20458 return SDValue();
20459
20460 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20461 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20462
20463 // Truncate with PACKUS if we are truncating a vector with leading zero
20464 // bits that extend all the way to the packed/truncated value.
20465 // e.g. Masks, zext_in_reg, etc.
20466 // Pre-SSE41 we can only use PACKUSWB.
20467 KnownBits Known = DAG.computeKnownBits(In);
20468 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20469 PackOpcode = X86ISD::PACKUS;
20470 return In;
20471 }
20472
20473 // Truncate with PACKSS if we are truncating a vector with sign-bits
20474 // that extend all the way to the packed/truncated value.
20475 // e.g. Comparison result, sext_in_reg, etc.
20476 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20477
20478 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20479 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20480 // see through BITCASTs later on and combines/simplifications can't then use
20481 // it.
20482 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20483 !Subtarget.hasAVX512())
20484 return SDValue();
20485
20486 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20487 if (MinSignBits < NumSignBits) {
20488 PackOpcode = X86ISD::PACKSS;
20489 return In;
20490 }
20491
20492 // If we have a srl that only generates signbits that we will discard in
20493 // the truncation then we can use PACKSS by converting the srl to a sra.
20494 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
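  // For example, (trunc (srl X, 16)) from vXi32 to vXi16 discards the shifted-in
  // upper bits anyway, so (trunc (sra X, 16)) yields the same low 16 bits while
  // providing the sign-extended form that PACKSSDW relies on.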
20495 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20496 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(In)) {
20497 if (*ShAmt == MinSignBits) {
20498 PackOpcode = X86ISD::PACKSS;
20499 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20500 }
20501 }
20502
20503 return SDValue();
20504}
20505
20506 /// This function lowers a vector truncation of 'extended sign-bits' or
20507 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
20508 /// X86ISD::PACKSS/PACKUS operations.
20509 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20510                                                 const SDLoc &DL,
20511 const X86Subtarget &Subtarget,
20512 SelectionDAG &DAG) {
20513 MVT SrcVT = In.getSimpleValueType();
20514 MVT DstSVT = DstVT.getVectorElementType();
20515 MVT SrcSVT = SrcVT.getVectorElementType();
20516 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20517 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20518 return SDValue();
20519
20520 // If the upper half of the source is undef, then attempt to split and
20521 // only truncate the lower half.
20522 if (DstVT.getSizeInBits() >= 128) {
20523 SmallVector<SDValue> LowerOps;
20524 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20525 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20526 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20527 Subtarget, DAG))
20528 return widenSubVector(Res, false, Subtarget, DAG, DL,
20529 DstVT.getSizeInBits());
20530 }
20531 }
20532
20533 unsigned PackOpcode;
20534 if (SDValue Src =
20535 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20536 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20537
20538 return SDValue();
20539}
20540
20541/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20542/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20543 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20544                                     const X86Subtarget &Subtarget,
20545 SelectionDAG &DAG) {
20546 MVT SrcVT = In.getSimpleValueType();
20547 MVT DstSVT = DstVT.getVectorElementType();
20548 MVT SrcSVT = SrcVT.getVectorElementType();
20549 unsigned NumElems = DstVT.getVectorNumElements();
20550 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20551 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20552 NumElems >= 8))
20553 return SDValue();
20554
20555   // SSSE3's pshufb results in fewer instructions in the cases below.
20556 if (Subtarget.hasSSSE3() && NumElems == 8) {
20557 if (SrcSVT == MVT::i16)
20558 return SDValue();
20559 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20560 return SDValue();
20561 }
20562
20563 // If the upper half of the source is undef, then attempt to split and
20564 // only truncate the lower half.
20565 if (DstVT.getSizeInBits() >= 128) {
20566 SmallVector<SDValue> LowerOps;
20567 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20568 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20569 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20570 return widenSubVector(Res, false, Subtarget, DAG, DL,
20571 DstVT.getSizeInBits());
20572 }
20573 }
20574
20575 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20576 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20577 // truncate 2 x v4i32 to v8i16.
20578 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20579 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20580
20581 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20582 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20583
20584 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20585 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20586 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20587 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20588 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20589 }
20590
20591 return SDValue();
20592}
20593
20594 static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20595                                   SelectionDAG &DAG,
20596 const X86Subtarget &Subtarget) {
20597 MVT VT = Op.getSimpleValueType();
20598 SDValue In = Op.getOperand(0);
20599 MVT InVT = In.getSimpleValueType();
20600 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20601
20602 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
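  // For example, v16i8 -> v16i1 with BWI: shift each byte left by 7 so bit 0
  // becomes the sign bit, then the (0 > In) signed compare below is selected as
  // VPMOVB2M, which gathers the per-element sign bits into a mask register.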
20603 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20604 if (InVT.getScalarSizeInBits() <= 16) {
20605 if (Subtarget.hasBWI()) {
20606 // legal, will go to VPMOVB2M, VPMOVW2M
20607 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20608 // We need to shift to get the lsb into sign position.
20609 // Shift packed bytes not supported natively, bitcast to word
20610 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20611 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20612 DAG.getBitcast(ExtVT, In),
20613 DAG.getConstant(ShiftInx, DL, ExtVT));
20614 In = DAG.getBitcast(InVT, In);
20615 }
20616 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20617 In, ISD::SETGT);
20618 }
20619 // Use TESTD/Q, extended vector to packed dword/qword.
20620 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20621 "Unexpected vector type.");
20622 unsigned NumElts = InVT.getVectorNumElements();
20623 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20624 // We need to change to a wider element type that we have support for.
20625 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20626 // For 16 element vectors we extend to v16i32 unless we are explicitly
20627 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20628 // we need to split into two 8 element vectors which we can extend to v8i32,
20629 // truncate and concat the results. There's an additional complication if
20630 // the original type is v16i8. In that case we can't split the v16i8
20631 // directly, so we need to shuffle high elements to low and use
20632 // sign_extend_vector_inreg.
20633 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20634 SDValue Lo, Hi;
20635 if (InVT == MVT::v16i8) {
20636 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20637 Hi = DAG.getVectorShuffle(
20638 InVT, DL, In, In,
20639 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20640 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20641 } else {
20642 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20643 Lo = extract128BitVector(In, 0, DAG, DL);
20644 Hi = extract128BitVector(In, 8, DAG, DL);
20645 }
20646 // We're split now, just emit two truncates and a concat. The two
20647 // truncates will trigger legalization to come back to this function.
20648 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20649 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20650 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20651 }
20652 // We either have 8 elements or we're allowed to use 512-bit vectors.
20653 // If we have VLX, we want to use the narrowest vector that can get the
20654 // job done so we use vXi32.
20655 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20656 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20657 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20658 InVT = ExtVT;
20659 ShiftInx = InVT.getScalarSizeInBits() - 1;
20660 }
20661
20662 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20663 // We need to shift to get the lsb into sign position.
20664 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20665 DAG.getConstant(ShiftInx, DL, InVT));
20666 }
20667 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20668 if (Subtarget.hasDQI())
20669 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20670 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20671}
20672
20673SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20674 SDLoc DL(Op);
20675 MVT VT = Op.getSimpleValueType();
20676 SDValue In = Op.getOperand(0);
20677 MVT InVT = In.getSimpleValueType();
20678   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20679          "Invalid TRUNCATE operation");
20680
20681 // If we're called by the type legalizer, handle a few cases.
20682 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20683 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20684 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20685 VT.is128BitVector() && Subtarget.hasAVX512()) {
20686 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20687 "Unexpected subtarget!");
20688 // The default behavior is to truncate one step, concatenate, and then
20689 // truncate the remainder. We'd rather produce two 64-bit results and
20690 // concatenate those.
20691 SDValue Lo, Hi;
20692 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20693
20694 EVT LoVT, HiVT;
20695 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20696
20697 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20698 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20699 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20700 }
20701
20702 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20703 if (!Subtarget.hasAVX512() ||
20704 (InVT.is512BitVector() && VT.is256BitVector()))
20705 if (SDValue SignPack =
20706 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20707 return SignPack;
20708
20709 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20710 if (!Subtarget.hasAVX512())
20711 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20712
20713 // Otherwise let default legalization handle it.
20714 return SDValue();
20715 }
20716
20717 if (VT.getVectorElementType() == MVT::i1)
20718 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
20719
20720 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20721 // concat from subvectors to use VPTRUNC etc.
20722 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20723 if (SDValue SignPack =
20724 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20725 return SignPack;
20726
20727 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20728 if (Subtarget.hasAVX512()) {
20729 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20730 assert(VT == MVT::v32i8 && "Unexpected VT!");
20731 return splitVectorIntUnary(Op, DAG, DL);
20732 }
20733
20734     // word to byte only under BWI. Otherwise we have to promote to v16i32
20735 // and then truncate that. But we should only do that if we haven't been
20736 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20737 // handled by isel patterns.
20738 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20739 Subtarget.canExtendTo512DQ())
20740 return Op;
20741 }
20742
20743 // Handle truncation of V256 to V128 using shuffles.
20744 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20745
20746 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20747 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20748 if (Subtarget.hasInt256()) {
20749 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20750 In = DAG.getBitcast(MVT::v8i32, In);
20751 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20752 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20753 DAG.getIntPtrConstant(0, DL));
20754 }
20755
20756 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20757 DAG.getIntPtrConstant(0, DL));
20758 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20759 DAG.getIntPtrConstant(2, DL));
20760 static const int ShufMask[] = {0, 2, 4, 6};
20761 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20762 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20763 }
20764
20765 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20766 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20767 if (Subtarget.hasInt256()) {
20768 // The PSHUFB mask:
20769 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20770 -1, -1, -1, -1, -1, -1, -1, -1,
20771 16, 17, 20, 21, 24, 25, 28, 29,
20772 -1, -1, -1, -1, -1, -1, -1, -1 };
20773 In = DAG.getBitcast(MVT::v32i8, In);
20774 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20775 In = DAG.getBitcast(MVT::v4i64, In);
20776
20777 static const int ShufMask2[] = {0, 2, -1, -1};
20778 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20779 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20780 DAG.getIntPtrConstant(0, DL));
20781 return DAG.getBitcast(MVT::v8i16, In);
20782 }
20783
20784 return Subtarget.hasSSE41()
20785 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20786 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20787 }
20788
20789 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20790 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20791
20792 llvm_unreachable("All 256->128 cases should have been handled above!");
20793}
20794
20795// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20796// behaves on out of range inputs to generate optimized conversions.
20797 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20798                                     SelectionDAG &DAG,
20799 const X86Subtarget &Subtarget) {
20800 MVT SrcVT = Src.getSimpleValueType();
20801 unsigned DstBits = VT.getScalarSizeInBits();
20802 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20803
20804 // Calculate the converted result for values in the range 0 to
20805 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20806 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20807 SDValue Big =
20808 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20809 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20810 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20811
20812 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20813 // and only if the value was out of range. So we can use that
20814   // as our indicator that we should use "Big" instead of "Small".
20815 //
20816 // Use "Small" if "IsOverflown" has all bits cleared
20817 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
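  // Worked example for one f32 lane with value 3.0e9 (exactly representable):
  //   Small       = cvttps2dq(3.0e9)             = 0x80000000 (out of range)
  //   Big         = cvttps2dq(3.0e9 - 2^31)      = 0x32D05E00 (852516352)
  //   IsOverflown = Small >>s 31                 = 0xFFFFFFFF
  //   Small | (Big & IsOverflown)                = 0xB2D05E00 = 3000000000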
20818
20819 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20820 // use the slightly slower blendv select instead.
20821 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20822 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20823 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20824 }
20825
20826 SDValue IsOverflown =
20827 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20828 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20829 return DAG.getNode(ISD::OR, dl, VT, Small,
20830 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20831}
20832
20833SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20834 bool IsStrict = Op->isStrictFPOpcode();
20835 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20836 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20837 MVT VT = Op->getSimpleValueType(0);
20838 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20839 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20840 MVT SrcVT = Src.getSimpleValueType();
20841 SDLoc dl(Op);
20842
20843 SDValue Res;
20844 if (isSoftF16(SrcVT, Subtarget)) {
20845 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20846 if (IsStrict)
20847 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20848 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20849 {NVT, MVT::Other}, {Chain, Src})});
20850 return DAG.getNode(Op.getOpcode(), dl, VT,
20851 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20852 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20853 return Op;
20854 }
20855
20856 if (VT.isVector()) {
20857 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20858 MVT ResVT = MVT::v4i32;
20859 MVT TruncVT = MVT::v4i1;
20860 unsigned Opc;
20861       if (IsStrict)
20862         Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20863       else
20864 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20865
20866 if (!IsSigned && !Subtarget.hasVLX()) {
20867 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20868 // Widen to 512-bits.
20869 ResVT = MVT::v8i32;
20870 TruncVT = MVT::v8i1;
20871 Opc = Op.getOpcode();
20872 // Need to concat with zero vector for strict fp to avoid spurious
20873 // exceptions.
20874 // TODO: Should we just do this for non-strict as well?
20875 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20876 : DAG.getUNDEF(MVT::v8f64);
20877 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20878 DAG.getIntPtrConstant(0, dl));
20879 }
20880 if (IsStrict) {
20881 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20882 Chain = Res.getValue(1);
20883 } else {
20884 Res = DAG.getNode(Opc, dl, ResVT, Src);
20885 }
20886
20887 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20888 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20889 DAG.getIntPtrConstant(0, dl));
20890 if (IsStrict)
20891 return DAG.getMergeValues({Res, Chain}, dl);
20892 return Res;
20893 }
20894
20895 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20896 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20897 return Op;
20898
20899 MVT ResVT = VT;
20900 MVT EleVT = VT.getVectorElementType();
20901 if (EleVT != MVT::i64)
20902 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20903
20904 if (SrcVT != MVT::v8f16) {
20905 SDValue Tmp =
20906 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20907 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20908 Ops[0] = Src;
20909 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20910 }
20911
20912 if (IsStrict) {
20913         Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20914                                    : X86ISD::STRICT_CVTTP2UI,
20915                           dl, {ResVT, MVT::Other}, {Chain, Src});
20916 Chain = Res.getValue(1);
20917 } else {
20918 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20919 ResVT, Src);
20920 }
20921
20922 // TODO: Need to add exception check code for strict FP.
20923 if (EleVT.getSizeInBits() < 16) {
20924 ResVT = MVT::getVectorVT(EleVT, 8);
20925 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20926 }
20927
20928 if (ResVT != VT)
20929 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20930 DAG.getIntPtrConstant(0, dl));
20931
20932 if (IsStrict)
20933 return DAG.getMergeValues({Res, Chain}, dl);
20934 return Res;
20935 }
20936
20937 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20938 if (VT.getVectorElementType() == MVT::i16) {
20939 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20940 SrcVT.getVectorElementType() == MVT::f64) &&
20941 "Expected f32/f64 vector!");
20942 MVT NVT = VT.changeVectorElementType(MVT::i32);
20943 if (IsStrict) {
20944         Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20945                                    : ISD::STRICT_FP_TO_UINT,
20946                           dl, {NVT, MVT::Other}, {Chain, Src});
20947 Chain = Res.getValue(1);
20948 } else {
20949 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20950 NVT, Src);
20951 }
20952
20953 // TODO: Need to add exception check code for strict FP.
20954 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20955
20956 if (IsStrict)
20957 return DAG.getMergeValues({Res, Chain}, dl);
20958 return Res;
20959 }
20960
20961 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20962 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20963 assert(!IsSigned && "Expected unsigned conversion!");
20964 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20965 return Op;
20966 }
20967
20968 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20969 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20970 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20971 Subtarget.useAVX512Regs()) {
20972 assert(!IsSigned && "Expected unsigned conversion!");
20973 assert(!Subtarget.hasVLX() && "Unexpected features!");
20974 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20975 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20976 // Need to concat with zero vector for strict fp to avoid spurious
20977 // exceptions.
20978 // TODO: Should we just do this for non-strict as well?
20979 SDValue Tmp =
20980 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20981 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20982 DAG.getIntPtrConstant(0, dl));
20983
20984 if (IsStrict) {
20985 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20986 {Chain, Src});
20987 Chain = Res.getValue(1);
20988 } else {
20989 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20990 }
20991
20992 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20993 DAG.getIntPtrConstant(0, dl));
20994
20995 if (IsStrict)
20996 return DAG.getMergeValues({Res, Chain}, dl);
20997 return Res;
20998 }
20999
21000 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21001 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21002 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21003 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21004 assert(!Subtarget.hasVLX() && "Unexpected features!");
21005 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21006 // Need to concat with zero vector for strict fp to avoid spurious
21007 // exceptions.
21008 // TODO: Should we just do this for non-strict as well?
21009 SDValue Tmp =
21010 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21011 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21012 DAG.getIntPtrConstant(0, dl));
21013
21014 if (IsStrict) {
21015 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21016 {Chain, Src});
21017 Chain = Res.getValue(1);
21018 } else {
21019 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21020 }
21021
21022 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21023 DAG.getIntPtrConstant(0, dl));
21024
21025 if (IsStrict)
21026 return DAG.getMergeValues({Res, Chain}, dl);
21027 return Res;
21028 }
21029
21030 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21031 if (!Subtarget.hasVLX()) {
21032       // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21033 // legalizer and then widened again by vector op legalization.
21034 if (!IsStrict)
21035 return SDValue();
21036
21037 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21038 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21039 {Src, Zero, Zero, Zero});
21040 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21041 {Chain, Tmp});
21042 SDValue Chain = Tmp.getValue(1);
21043 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21044 DAG.getIntPtrConstant(0, dl));
21045 return DAG.getMergeValues({Tmp, Chain}, dl);
21046 }
21047
21048 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21049 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21050 DAG.getUNDEF(MVT::v2f32));
21051 if (IsStrict) {
21052       unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21053                               : X86ISD::STRICT_CVTTP2UI;
21054       return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21055 }
21056 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21057 return DAG.getNode(Opc, dl, VT, Tmp);
21058 }
21059
21060 // Generate optimized instructions for pre AVX512 unsigned conversions from
21061 // vXf32 to vXi32.
21062 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21063 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21064 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21065 assert(!IsSigned && "Expected unsigned conversion!");
21066 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21067 }
21068
21069 return SDValue();
21070 }
21071
21072 assert(!VT.isVector());
21073
21074 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21075
21076 if (!IsSigned && UseSSEReg) {
21077 // Conversions from f32/f64 with AVX512 should be legal.
21078 if (Subtarget.hasAVX512())
21079 return Op;
21080
21081 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21082 // behaves on out of range inputs to generate optimized conversions.
21083 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21084 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21085 unsigned DstBits = VT.getScalarSizeInBits();
21086 APInt UIntLimit = APInt::getSignMask(DstBits);
21087 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21088 DAG.getConstant(UIntLimit, dl, VT));
21089 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21090
21091 // Calculate the converted result for values in the range:
21092 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21093 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21094 SDValue Small =
21095 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21096 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21097 SDValue Big = DAG.getNode(
21098 X86ISD::CVTTS2SI, dl, VT,
21099 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21100 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21101
21102 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21103 // and only if the value was out of range. So we can use that
21104     // as our indicator that we should use "Big" instead of "Small".
21105 //
21106 // Use "Small" if "IsOverflown" has all bits cleared
21107 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21108 SDValue IsOverflown = DAG.getNode(
21109 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21110 return DAG.getNode(ISD::OR, dl, VT, Small,
21111 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21112 }
21113
21114 // Use default expansion for i64.
21115 if (VT == MVT::i64)
21116 return SDValue();
21117
21118 assert(VT == MVT::i32 && "Unexpected VT!");
21119
21120 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21121 // FIXME: This does not generate an invalid exception if the input does not
21122 // fit in i32. PR44019
21123 if (Subtarget.is64Bit()) {
21124 if (IsStrict) {
21125 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21126 {Chain, Src});
21127 Chain = Res.getValue(1);
21128 } else
21129 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21130
21131 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21132 if (IsStrict)
21133 return DAG.getMergeValues({Res, Chain}, dl);
21134 return Res;
21135 }
21136
21137 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21138 // use fisttp which will be handled later.
21139 if (!Subtarget.hasSSE3())
21140 return SDValue();
21141 }
21142
21143 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21144 // FIXME: This does not generate an invalid exception if the input does not
21145 // fit in i16. PR44019
21146 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21147 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21148 if (IsStrict) {
21149 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21150 {Chain, Src});
21151 Chain = Res.getValue(1);
21152 } else
21153 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21154
21155 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21156 if (IsStrict)
21157 return DAG.getMergeValues({Res, Chain}, dl);
21158 return Res;
21159 }
21160
21161 // If this is a FP_TO_SINT using SSEReg we're done.
21162 if (UseSSEReg && IsSigned)
21163 return Op;
21164
21165 // fp128 needs to use a libcall.
21166 if (SrcVT == MVT::f128) {
21167 RTLIB::Libcall LC;
21168 if (IsSigned)
21169 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21170 else
21171 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21172
21173 MakeLibCallOptions CallOptions;
21174 std::pair<SDValue, SDValue> Tmp =
21175 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21176
21177 if (IsStrict)
21178 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21179
21180 return Tmp.first;
21181 }
21182
21183 // Fall back to X87.
21184 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21185 if (IsStrict)
21186 return DAG.getMergeValues({V, Chain}, dl);
21187 return V;
21188 }
21189
21190 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21191}
21192
21193SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21194 SelectionDAG &DAG) const {
21195 SDValue Src = Op.getOperand(0);
21196 EVT DstVT = Op.getSimpleValueType();
21197 MVT SrcVT = Src.getSimpleValueType();
21198
21199 if (SrcVT.isVector())
21200 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21201
21202 if (SrcVT == MVT::f16)
21203 return SDValue();
21204
21205 // If the source is in an SSE register, the node is Legal.
21206 if (isScalarFPTypeInSSEReg(SrcVT))
21207 return Op;
21208
21209 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21210}
21211
21212SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21213 SelectionDAG &DAG) const {
21214 EVT DstVT = N->getValueType(0);
21215 SDValue Src = N->getOperand(0);
21216 EVT SrcVT = Src.getValueType();
21217
21218 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21219 // f16 must be promoted before using the lowering in this routine.
21220 // fp128 does not use this lowering.
21221 return SDValue();
21222 }
21223
21224 SDLoc DL(N);
21225 SDValue Chain = DAG.getEntryNode();
21226
21227 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21228
21229 // If we're converting from SSE, the stack slot needs to hold both types.
21230 // Otherwise it only needs to hold the DstVT.
21231 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21232 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21233 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21234   MachinePointerInfo MPI =
21235       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21236
21237 if (UseSSE) {
21238 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21239 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21240 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21241 SDValue Ops[] = { Chain, StackPtr };
21242
21243 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21244                                   /*Align*/ std::nullopt,
21245                                   MachineMemOperand::MOLoad);
21246     Chain = Src.getValue(1);
21247 }
21248
21249 SDValue StoreOps[] = { Chain, Src, StackPtr };
21250 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21251                                   StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21252                                   MachineMemOperand::MOStore);
21253
21254 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21255}
21256
21257SDValue
21258X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21259 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21260 // but making use of X86 specifics to produce better instruction sequences.
21261 SDNode *Node = Op.getNode();
21262 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21263 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21264 SDLoc dl(SDValue(Node, 0));
21265 SDValue Src = Node->getOperand(0);
21266
21267 // There are three types involved here: SrcVT is the source floating point
21268 // type, DstVT is the type of the result, and TmpVT is the result of the
21269 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21270 // DstVT).
21271 EVT SrcVT = Src.getValueType();
21272 EVT DstVT = Node->getValueType(0);
21273 EVT TmpVT = DstVT;
21274
21275 // This code is only for floats and doubles. Fall back to generic code for
21276 // anything else.
21277 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21278 return SDValue();
21279
21280 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21281 unsigned SatWidth = SatVT.getScalarSizeInBits();
21282 unsigned DstWidth = DstVT.getScalarSizeInBits();
21283 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21284 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21285 "Expected saturation width smaller than result width");
21286
21287 // Promote result of FP_TO_*INT to at least 32 bits.
21288 if (TmpWidth < 32) {
21289 TmpVT = MVT::i32;
21290 TmpWidth = 32;
21291 }
21292
21293 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21294 // us to use a native signed conversion instead.
21295 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21296 TmpVT = MVT::i64;
21297 TmpWidth = 64;
21298 }
21299
21300 // If the saturation width is smaller than the size of the temporary result,
21301 // we can always use signed conversion, which is native.
21302 if (SatWidth < TmpWidth)
21303 FpToIntOpcode = ISD::FP_TO_SINT;
21304
21305 // Determine minimum and maximum integer values and their corresponding
21306 // floating-point values.
21307 APInt MinInt, MaxInt;
21308 if (IsSigned) {
21309 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21310 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21311 } else {
21312 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21313 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21314 }
21315
21316 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21317 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21318
21319 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21320 MinInt, IsSigned, APFloat::rmTowardZero);
21321 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21322 MaxInt, IsSigned, APFloat::rmTowardZero);
21323 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21324 && !(MaxStatus & APFloat::opStatus::opInexact);
21325
21326 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21327 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21328
21329 // If the integer bounds are exactly representable as floats, emit a
21330 // min+max+fptoi sequence. Otherwise use comparisons and selects.
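  // For example, for fptosi.sat.i16.f32 the bounds -32768.0 and 32767.0 are
  // exact and TmpVT is promoted to i32, so the emitted sequence is roughly
  //   MinClamped  = FMAX(-32768.0, Src)          // NaN propagates
  //   BothClamped = FMIN(32767.0, MinClamped)
  //   Result      = trunc (cvttss2si BothClamped) to i16
  // and a NaN input becomes INDVAL (0x80000000), which truncates to 0.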
21331 if (AreExactFloatBounds) {
21332 if (DstVT != TmpVT) {
21333 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21334 SDValue MinClamped = DAG.getNode(
21335 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21336 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21337 SDValue BothClamped = DAG.getNode(
21338 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21339 // Convert clamped value to integer.
21340 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21341
21342 // NaN will become INDVAL, with the top bit set and the rest zero.
21343 // Truncation will discard the top bit, resulting in zero.
21344 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21345 }
21346
21347 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21348 SDValue MinClamped = DAG.getNode(
21349 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21350 // Clamp by MaxFloat from above. NaN cannot occur.
21351 SDValue BothClamped = DAG.getNode(
21352 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21353 // Convert clamped value to integer.
21354 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21355
21356 if (!IsSigned) {
21357 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21358 // which is zero.
21359 return FpToInt;
21360 }
21361
21362 // Otherwise, select zero if Src is NaN.
21363 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21364 return DAG.getSelectCC(
21365 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21366 }
21367
21368 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21369 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21370
21371 // Result of direct conversion, which may be selected away.
21372 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21373
21374 if (DstVT != TmpVT) {
21375 // NaN will become INDVAL, with the top bit set and the rest zero.
21376 // Truncation will discard the top bit, resulting in zero.
21377 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21378 }
21379
21380 SDValue Select = FpToInt;
21381 // For signed conversions where we saturate to the same size as the
21382 // result type of the fptoi instructions, INDVAL coincides with integer
21383 // minimum, so we don't need to explicitly check it.
21384 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21385 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21386 // MinInt if Src is NaN.
21387 Select = DAG.getSelectCC(
21388 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21389 }
21390
21391 // If Src OGT MaxFloat, select MaxInt.
21392 Select = DAG.getSelectCC(
21393 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21394
21395 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21396 // is already zero. The promoted case was already handled above.
21397 if (!IsSigned || DstVT != TmpVT) {
21398 return Select;
21399 }
21400
21401 // Otherwise, select 0 if Src is NaN.
21402 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21403 return DAG.getSelectCC(
21404 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21405}
21406
21407SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21408 bool IsStrict = Op->isStrictFPOpcode();
21409
21410 SDLoc DL(Op);
21411 MVT VT = Op.getSimpleValueType();
21412 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21413 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21414 MVT SVT = In.getSimpleValueType();
21415
21416 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21417 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21418 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21419 !Subtarget.getTargetTriple().isOSDarwin()))
21420 return SDValue();
21421
21422 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21423 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21424 return Op;
21425
21426 if (SVT == MVT::f16) {
21427 if (Subtarget.hasFP16())
21428 return Op;
21429
21430 if (VT != MVT::f32) {
21431 if (IsStrict)
21432 return DAG.getNode(
21433 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21434 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21435 {MVT::f32, MVT::Other}, {Chain, In})});
21436
21437 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21438 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21439 }
21440
21441 if (!Subtarget.hasF16C()) {
21442 if (!Subtarget.getTargetTriple().isOSDarwin())
21443 return SDValue();
21444
21445 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21446
21447       // Need a libcall, but ABI for f16 is soft-float on MacOS.
21448       TargetLowering::CallLoweringInfo CLI(DAG);
21449       Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21450
21451       In = DAG.getBitcast(MVT::i16, In);
21452       TargetLowering::ArgListTy Args;
21453       TargetLowering::ArgListEntry Entry;
21454       Entry.Node = In;
21455 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21456 Entry.IsSExt = false;
21457 Entry.IsZExt = true;
21458 Args.push_back(Entry);
21459
21460       SDValue Callee = DAG.getExternalSymbol(
21461           getLibcallName(RTLIB::FPEXT_F16_F32),
21462           getPointerTy(DAG.getDataLayout()));
21463       CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21464 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21465 std::move(Args));
21466
21467 SDValue Res;
21468 std::tie(Res,Chain) = LowerCallTo(CLI);
21469 if (IsStrict)
21470 Res = DAG.getMergeValues({Res, Chain}, DL);
21471
21472 return Res;
21473 }
21474
21475 In = DAG.getBitcast(MVT::i16, In);
21476 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21477 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21478 DAG.getIntPtrConstant(0, DL));
21479 SDValue Res;
21480 if (IsStrict) {
21481 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21482 {Chain, In});
21483 Chain = Res.getValue(1);
21484 } else {
21485 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21486 DAG.getTargetConstant(4, DL, MVT::i32));
21487 }
21488 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21489 DAG.getIntPtrConstant(0, DL));
21490 if (IsStrict)
21491 return DAG.getMergeValues({Res, Chain}, DL);
21492 return Res;
21493 }
21494
21495 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21496 return Op;
21497
21498 if (SVT.getVectorElementType() == MVT::f16) {
21499 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21500 return Op;
21501 assert(Subtarget.hasF16C() && "Unexpected features!");
21502 if (SVT == MVT::v2f16)
21503 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21504 DAG.getUNDEF(MVT::v2f16));
21505 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21506 DAG.getUNDEF(MVT::v4f16));
21507 if (IsStrict)
21508 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21509 {Op->getOperand(0), Res});
21510 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21511 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21512 return Op;
21513 }
21514
21515 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21516
21517 SDValue Res =
21518 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21519 if (IsStrict)
21520 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21521 {Op->getOperand(0), Res});
21522 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21523}
21524
21525SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21526 bool IsStrict = Op->isStrictFPOpcode();
21527
21528 SDLoc DL(Op);
21529 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21530 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21531 MVT VT = Op.getSimpleValueType();
21532 MVT SVT = In.getSimpleValueType();
21533
21534 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21535 return SDValue();
21536
21537 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21538 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21539 if (!Subtarget.getTargetTriple().isOSDarwin())
21540 return SDValue();
21541
21542 // We need a libcall but the ABI for f16 libcalls on macOS is soft.
21543 TargetLowering::CallLoweringInfo CLI(DAG);
21544 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21545
21546 TargetLowering::ArgListTy Args;
21547 TargetLowering::ArgListEntry Entry;
21548 Entry.Node = In;
21549 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21550 Entry.IsSExt = false;
21551 Entry.IsZExt = true;
21552 Args.push_back(Entry);
21553
21554 SDValue Callee = DAG.getExternalSymbol(
21555 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21556 : RTLIB::FPROUND_F32_F16),
21557 getPointerTy(DAG.getDataLayout()));
21558 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21559 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21560 std::move(Args));
21561
21562 SDValue Res;
21563 std::tie(Res, Chain) = LowerCallTo(CLI);
21564
21565 Res = DAG.getBitcast(MVT::f16, Res);
21566
21567 if (IsStrict)
21568 Res = DAG.getMergeValues({Res, Chain}, DL);
21569
21570 return Res;
21571 }
21572
21573 if (VT.getScalarType() == MVT::bf16) {
21574 if (SVT.getScalarType() == MVT::f32 &&
21575 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21576 Subtarget.hasAVXNECONVERT()))
21577 return Op;
21578 return SDValue();
21579 }
21580
21581 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21582 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21583 return SDValue();
21584
21585 if (VT.isVector())
21586 return Op;
21587
21588 SDValue Res;
21589 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21590 MVT::i32);
21591 if (IsStrict) {
21592 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21593 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21594 DAG.getIntPtrConstant(0, DL));
21595 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21596 {Chain, Res, Rnd});
21597 Chain = Res.getValue(1);
21598 } else {
21599 // FIXME: Should we use zeros for upper elements for non-strict?
21600 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21601 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21602 }
21603
21604 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21605 DAG.getIntPtrConstant(0, DL));
21606 Res = DAG.getBitcast(MVT::f16, Res);
21607
21608 if (IsStrict)
21609 return DAG.getMergeValues({Res, Chain}, DL);
21610
21611 return Res;
21612 }
21613
21614 return Op;
21615}
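// Note on the CVTPS2PH rounding immediate used above: setting bit 2 (value 4)
// requests the current MXCSR rounding mode rather than one of the four static
// modes encoded in bits 1:0, which matches the default FP_ROUND semantics
// (illustrative summary of the ISA encoding, not additional lowering logic).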
21616
21617 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21618 bool IsStrict = Op->isStrictFPOpcode();
21619 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21620 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21621 "Unexpected VT!");
21622
21623 SDLoc dl(Op);
21624 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21625 DAG.getConstant(0, dl, MVT::v8i16), Src,
21626 DAG.getIntPtrConstant(0, dl));
21627
21628 SDValue Chain;
21629 if (IsStrict) {
21630 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21631 {Op.getOperand(0), Res});
21632 Chain = Res.getValue(1);
21633 } else {
21634 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21635 }
21636
21637 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21638 DAG.getIntPtrConstant(0, dl));
21639
21640 if (IsStrict)
21641 return DAG.getMergeValues({Res, Chain}, dl);
21642
21643 return Res;
21644}
21645
21646 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21647 bool IsStrict = Op->isStrictFPOpcode();
21648 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21649 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21650 "Unexpected VT!");
21651
21652 SDLoc dl(Op);
21653 SDValue Res, Chain;
21654 if (IsStrict) {
21655 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21656 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21657 DAG.getIntPtrConstant(0, dl));
21658 Res = DAG.getNode(
21659 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21660 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21661 Chain = Res.getValue(1);
21662 } else {
21663 // FIXME: Should we use zeros for upper elements for non-strict?
21664 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21665 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21666 DAG.getTargetConstant(4, dl, MVT::i32));
21667 }
21668
21669 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21670 DAG.getIntPtrConstant(0, dl));
21671
21672 if (IsStrict)
21673 return DAG.getMergeValues({Res, Chain}, dl);
21674
21675 return Res;
21676}
21677
21678SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21679 SelectionDAG &DAG) const {
21680 SDLoc DL(Op);
21681
21682 MVT SVT = Op.getOperand(0).getSimpleValueType();
21683 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21684 Subtarget.hasAVXNECONVERT())) {
21685 SDValue Res;
21686 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21687 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21688 Res = DAG.getBitcast(MVT::v8i16, Res);
21689 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21690 DAG.getIntPtrConstant(0, DL));
21691 }
21692
21693 MakeLibCallOptions CallOptions;
21694 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21695 SDValue Res =
21696 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21697 return DAG.getBitcast(MVT::i16, Res);
21698}
21699
21700/// Depending on uarch and/or optimizing for size, we might prefer to use a
21701/// vector operation in place of the typical scalar operation.
21702 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21703 SelectionDAG &DAG,
21704 const X86Subtarget &Subtarget) {
21705 // If both operands have other uses, this is probably not profitable.
21706 SDValue LHS = Op.getOperand(0);
21707 SDValue RHS = Op.getOperand(1);
21708 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21709 return Op;
21710
21711 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21712 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21713 if (IsFP && !Subtarget.hasSSE3())
21714 return Op;
21715 if (!IsFP && !Subtarget.hasSSSE3())
21716 return Op;
21717
21718 // Extract from a common vector.
21719 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21720 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21721 LHS.getOperand(0) != RHS.getOperand(0) ||
21722 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21723 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21724 !shouldUseHorizontalOp(true, DAG, Subtarget))
21725 return Op;
21726
21727 // Allow commuted 'hadd' ops.
21728 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21729 unsigned HOpcode;
21730 switch (Op.getOpcode()) {
21731 // clang-format off
21732 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21733 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21734 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21735 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21736 default:
21737 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21738 // clang-format on
21739 }
21740 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21741 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21742 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21743 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21744 std::swap(LExtIndex, RExtIndex);
21745
21746 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21747 return Op;
21748
21749 SDValue X = LHS.getOperand(0);
21750 EVT VecVT = X.getValueType();
21751 unsigned BitWidth = VecVT.getSizeInBits();
21752 unsigned NumLanes = BitWidth / 128;
21753 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21754 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21755 "Not expecting illegal vector widths here");
21756
21757 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21758 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21759 if (BitWidth == 256 || BitWidth == 512) {
21760 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21761 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21762 LExtIndex %= NumEltsPerLane;
21763 }
21764
21765 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21766 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21767 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21768 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21769 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21770 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21771 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21772}
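// As a concrete illustration (assuming shouldUseHorizontalOp approves), scalar
// code such as
//   float r = v[0] + v[1];   // v already live as a <4 x float> register
// can be emitted as a single HADDPS of the vector with itself followed by an
// extract of lane 0, instead of two element extracts plus an ADDSS.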
21773
21774/// Depending on uarch and/or optimizing for size, we might prefer to use a
21775/// vector operation in place of the typical scalar operation.
21776SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21777 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21778 "Only expecting float/double");
21779 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21780}
21781
21782/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21783/// This mode isn't supported in hardware on X86. But as long as we aren't
21784/// compiling with trapping math, we can emulate this with
21785/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
21786 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21787 SDValue N0 = Op.getOperand(0);
21788 SDLoc dl(Op);
21789 MVT VT = Op.getSimpleValueType();
21790
21791 // N0 += copysign(nextafter(0.5, 0.0), N0)
21792 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21793 bool Ignored;
21794 APFloat Point5Pred = APFloat(0.5f);
21795 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21796 Point5Pred.next(/*nextDown*/true);
21797
21798 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21799 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21800 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21801
21802 // Truncate the result to remove fraction.
21803 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21804}
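// Why nextafter(0.5, 0.0) instead of 0.5: for the largest float below 0.5,
// adding exactly 0.5 would round up to 1.0 and FTRUNC would return 1.0 even
// though FROUND of that value is 0.0. Adding the predecessor of 0.5 keeps such
// inputs below 1.0, while exact halfway cases (0.5, 2.5, ...) still round up to
// the next integer inside the FADD, giving ties-away-from-zero behaviour.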
21805
21806/// The only differences between FABS and FNEG are the mask and the logic op.
21807/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21808 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21809 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21810 "Wrong opcode for lowering FABS or FNEG.");
21811
21812 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21813
21814 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21815 // into an FNABS. We'll lower the FABS after that if it is still in use.
21816 if (IsFABS)
21817 for (SDNode *User : Op->uses())
21818 if (User->getOpcode() == ISD::FNEG)
21819 return Op;
21820
21821 SDLoc dl(Op);
21822 MVT VT = Op.getSimpleValueType();
21823
21824 bool IsF128 = (VT == MVT::f128);
21825 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21826 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21827 "Unexpected type in LowerFABSorFNEG");
21828
21829 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21830 // decide if we should generate a 16-byte constant mask when we only need 4 or
21831 // 8 bytes for the scalar case.
21832
21833 // There are no scalar bitwise logical SSE/AVX instructions, so we
21834 // generate a 16-byte vector constant and logic op even for the scalar case.
21835 // Using a 16-byte mask allows folding the load of the mask with
21836 // the logic op, so it can save (~4 bytes) on code size.
21837 bool IsFakeVector = !VT.isVector() && !IsF128;
21838 MVT LogicVT = VT;
21839 if (IsFakeVector)
21840 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21841 : (VT == MVT::f32) ? MVT::v4f32
21842 : MVT::v8f16;
21843
21844 unsigned EltBits = VT.getScalarSizeInBits();
21845 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21846 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21847 APInt::getSignMask(EltBits);
21848 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21849 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21850
21851 SDValue Op0 = Op.getOperand(0);
21852 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21853 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21854 IsFNABS ? X86ISD::FOR :
21855 X86ISD::FXOR;
21856 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21857
21858 if (VT.isVector() || IsF128)
21859 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21860
21861 // For the scalar case extend to a 128-bit vector, perform the logic op,
21862 // and extract the scalar result back out.
21863 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21864 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21865 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21866 DAG.getIntPtrConstant(0, dl));
21867}
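// Illustrative effect of the lowering above for scalar f64 with SSE2:
//   fabs(x)  -> ANDPD x, <0x7FFFFFFFFFFFFFFF, ...>   (clear sign bit)
//   fneg(x)  -> XORPD x, <0x8000000000000000, ...>   (flip sign bit)
//   -fabs(x) -> ORPD  x, <0x8000000000000000, ...>   (FNABS fold: set sign bit)
// all using the 16-byte constant-pool mask discussed in the comments above.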
21868
21869 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21870 SDValue Mag = Op.getOperand(0);
21871 SDValue Sign = Op.getOperand(1);
21872 SDLoc dl(Op);
21873
21874 // If the sign operand is smaller, extend it first.
21875 MVT VT = Op.getSimpleValueType();
21876 if (Sign.getSimpleValueType().bitsLT(VT))
21877 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21878
21879 // And if it is bigger, shrink it first.
21880 if (Sign.getSimpleValueType().bitsGT(VT))
21881 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21882 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21883
21884 // At this point the operands and the result should have the same
21885 // type, and that won't be f80 since that is not custom lowered.
21886 bool IsF128 = (VT == MVT::f128);
21887 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21888 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21889 "Unexpected type in LowerFCOPYSIGN");
21890
21891 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21892
21893 // Perform all scalar logic operations as 16-byte vectors because there are no
21894 // scalar FP logic instructions in SSE.
21895 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21896 // unnecessary splats, but we might miss load folding opportunities. Should
21897 // this decision be based on OptimizeForSize?
21898 bool IsFakeVector = !VT.isVector() && !IsF128;
21899 MVT LogicVT = VT;
21900 if (IsFakeVector)
21901 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21902 : (VT == MVT::f32) ? MVT::v4f32
21903 : MVT::v8f16;
21904
21905 // The mask constants are automatically splatted for vector types.
21906 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21907 SDValue SignMask = DAG.getConstantFP(
21908 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21909 SDValue MagMask = DAG.getConstantFP(
21910 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21911
21912 // First, clear all bits but the sign bit from the second operand (sign).
21913 if (IsFakeVector)
21914 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21915 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21916
21917 // Next, clear the sign bit from the first operand (magnitude).
21918 // TODO: If we had general constant folding for FP logic ops, this check
21919 // wouldn't be necessary.
21920 SDValue MagBits;
21921 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21922 APFloat APF = Op0CN->getValueAPF();
21923 APF.clearSign();
21924 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21925 } else {
21926 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21927 if (IsFakeVector)
21928 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21929 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21930 }
21931
21932 // OR the magnitude value with the sign bit.
21933 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21934 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21935 DAG.getIntPtrConstant(0, dl));
21936}
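// Net effect of the sequence above, expressed on the raw bits:
//   copysign(Mag, Sign) = (Mag & ~SignMask) | (Sign & SignMask)
// e.g. for f32 SignMask is 0x80000000, so copysign(-1.25f, +2.0f) == +1.25f.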
21937
21938 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21939 SDValue N0 = Op.getOperand(0);
21940 SDLoc dl(Op);
21941 MVT VT = Op.getSimpleValueType();
21942
21943 MVT OpVT = N0.getSimpleValueType();
21944 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21945 "Unexpected type for FGETSIGN");
21946
21947 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21948 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21949 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21950 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21951 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21952 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21953 return Res;
21954}
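// For example, FGETSIGN of f64 -3.0 becomes MOVMSK of <-3.0, undef>, whose bit 0
// holds the sign of element 0; masking with 1 then yields 1 for negative inputs
// (including -0.0) and 0 for positive ones.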
21955
21956/// Helper for attempting to create a X86ISD::BT node.
21957static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21958 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21959 // instruction. Since the shift amount is in-range-or-undefined, we know
21960 // that doing a bittest on the i32 value is ok. We extend to i32 because
21961 // the encoding for the i16 version is larger than the i32 version.
21962 // Also promote i16 to i32 for performance / code size reasons.
21963 if (Src.getValueType().getScalarSizeInBits() < 32)
21964 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21965
21966 // No legal type found, give up.
21967 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21968 return SDValue();
21969
21970 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21971 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21972 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21973 // known to be zero.
21974 if (Src.getValueType() == MVT::i64 &&
21975 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21976 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21977
21978 // If the operand types disagree, extend the shift amount to match. Since
21979 // BT ignores high bits (like shifts) we can use anyextend.
21980 if (Src.getValueType() != BitNo.getValueType()) {
21981 // Peek through a mask/modulo operation.
21982 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21983 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21984 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21985 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21986 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21987 BitNo.getOperand(0)),
21988 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21989 BitNo.getOperand(1)));
21990 else
21991 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21992 }
21993
21994 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
21995}
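// Reminder of the BT semantics relied on above: for register operands, BT copies
// bit (BitNo mod 32) of a 32-bit source, or (BitNo mod 64) of a 64-bit source,
// into CF. That is why the i64 source can be truncated only when bit 5 of BitNo
// is known zero, and why the bit index may be any-extended freely.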
21996
21997 /// Helper for creating an X86ISD::SETCC node.
21998 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21999 SelectionDAG &DAG) {
22000 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
22001 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
22002}
22003
22004/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
22005/// recognizable memcmp expansion.
22006static bool isOrXorXorTree(SDValue X, bool Root = true) {
22007 if (X.getOpcode() == ISD::OR)
22008 return isOrXorXorTree(X.getOperand(0), false) &&
22009 isOrXorXorTree(X.getOperand(1), false);
22010 if (Root)
22011 return false;
22012 return X.getOpcode() == ISD::XOR;
22013}
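// The shape matched here is the memcmp-style expansion of an oversized integer
// equality, e.g.
//   (or (xor a, b), (xor c, d)) == 0   <=>   a == b && c == d
// with arbitrarily deep OR trees whose leaves are all XOR nodes.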
22014
22015/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22016/// expansion.
22017template <typename F>
22018 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22019 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22020 SDValue Op0 = X.getOperand(0);
22021 SDValue Op1 = X.getOperand(1);
22022 if (X.getOpcode() == ISD::OR) {
22023 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22024 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22025 if (VecVT != CmpVT)
22026 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22027 if (HasPT)
22028 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22029 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22030 }
22031 if (X.getOpcode() == ISD::XOR) {
22032 SDValue A = SToV(Op0);
22033 SDValue B = SToV(Op1);
22034 if (VecVT != CmpVT)
22035 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22036 if (HasPT)
22037 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22038 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22039 }
22040 llvm_unreachable("Impossible");
22041}
22042
22043/// Try to map a 128-bit or larger integer comparison to vector instructions
22044/// before type legalization splits it up into chunks.
22045 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22046 ISD::CondCode CC,
22047 const SDLoc &DL,
22048 SelectionDAG &DAG,
22049 const X86Subtarget &Subtarget) {
22050 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22051
22052 // We're looking for an oversized integer equality comparison.
22053 EVT OpVT = X.getValueType();
22054 unsigned OpSize = OpVT.getSizeInBits();
22055 if (!OpVT.isScalarInteger() || OpSize < 128)
22056 return SDValue();
22057
22058 // Ignore a comparison with zero because that gets special treatment in
22059 // EmitTest(). But make an exception for the special case of a pair of
22060 // logically-combined vector-sized operands compared to zero. This pattern may
22061 // be generated by the memcmp expansion pass with oversized integer compares
22062 // (see PR33325).
22063 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22064 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22065 return SDValue();
22066
22067 // Don't perform this combine if constructing the vector will be expensive.
22068 auto IsVectorBitCastCheap = [](SDValue X) {
22069 X = peekThroughBitcasts(X);
22070 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22071 X.getOpcode() == ISD::LOAD;
22072 };
22073 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22074 !IsOrXorXorTreeCCZero)
22075 return SDValue();
22076
22077 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22078 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22079 // Otherwise use PCMPEQ (plus AND) and mask testing.
22080 bool NoImplicitFloatOps =
22081 DAG.getMachineFunction().getFunction().hasFnAttribute(
22082 Attribute::NoImplicitFloat);
22083 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22084 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22085 (OpSize == 256 && Subtarget.hasAVX()) ||
22086 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22087 bool HasPT = Subtarget.hasSSE41();
22088
22089 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22090 // vector registers are essentially free. (Technically, widening registers
22091 // prevents load folding, but the tradeoff is worth it.)
22092 bool PreferKOT = Subtarget.preferMaskRegisters();
22093 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22094
22095 EVT VecVT = MVT::v16i8;
22096 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22097 if (OpSize == 256) {
22098 VecVT = MVT::v32i8;
22099 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22100 }
22101 EVT CastVT = VecVT;
22102 bool NeedsAVX512FCast = false;
22103 if (OpSize == 512 || NeedZExt) {
22104 if (Subtarget.hasBWI()) {
22105 VecVT = MVT::v64i8;
22106 CmpVT = MVT::v64i1;
22107 if (OpSize == 512)
22108 CastVT = VecVT;
22109 } else {
22110 VecVT = MVT::v16i32;
22111 CmpVT = MVT::v16i1;
22112 CastVT = OpSize == 512 ? VecVT
22113 : OpSize == 256 ? MVT::v8i32
22114 : MVT::v4i32;
22115 NeedsAVX512FCast = true;
22116 }
22117 }
22118
22119 auto ScalarToVector = [&](SDValue X) -> SDValue {
22120 bool TmpZext = false;
22121 EVT TmpCastVT = CastVT;
22122 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22123 SDValue OrigX = X.getOperand(0);
22124 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22125 if (OrigSize < OpSize) {
22126 if (OrigSize == 128) {
22127 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22128 X = OrigX;
22129 TmpZext = true;
22130 } else if (OrigSize == 256) {
22131 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22132 X = OrigX;
22133 TmpZext = true;
22134 }
22135 }
22136 }
22137 X = DAG.getBitcast(TmpCastVT, X);
22138 if (!NeedZExt && !TmpZext)
22139 return X;
22140 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22141 DAG.getConstant(0, DL, VecVT), X,
22142 DAG.getVectorIdxConstant(0, DL));
22143 };
22144
22145 SDValue Cmp;
22146 if (IsOrXorXorTreeCCZero) {
22147 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22148 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22149 // Use 2 vector equality compares and 'and' the results before doing a
22150 // MOVMSK.
22151 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22152 } else {
22153 SDValue VecX = ScalarToVector(X);
22154 SDValue VecY = ScalarToVector(Y);
22155 if (VecVT != CmpVT) {
22156 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22157 } else if (HasPT) {
22158 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22159 } else {
22160 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22161 }
22162 }
22163 // AVX512 should emit a setcc that will lower to kortest.
22164 if (VecVT != CmpVT) {
22165 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22166 : CmpVT == MVT::v32i1 ? MVT::i32
22167 : MVT::i16;
22168 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22169 DAG.getConstant(0, DL, KRegVT), CC);
22170 }
22171 if (HasPT) {
22172 SDValue BCCmp =
22173 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22174 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22175 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22176 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22177 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22178 }
22179 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22180 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22181 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22182 assert(Cmp.getValueType() == MVT::v16i8 &&
22183 "Non 128-bit vector on pre-SSE41 target");
22184 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22185 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22186 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22187 }
22188
22189 return SDValue();
22190}
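// Rough shape of the code produced by this combine for an i128 equality on an
// SSE4.1 target (illustrative; register allocation and addressing will differ):
//   movdqu (lhs), %xmm0
//   pxor   (rhs), %xmm0
//   ptest  %xmm0, %xmm0
//   sete   %al            ; or setne for SETNE
// Pre-SSE4.1 targets instead get PCMPEQB + PMOVMSKB + CMP $0xFFFF.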
22191
22192/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22193/// style scalarized (associative) reduction patterns. Partial reductions
22194/// are supported when the pointer SrcMask is non-null.
22195/// TODO - move this to SelectionDAG?
22196 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22197 SmallVectorImpl<SDValue> &SrcOps,
22198 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22199 SmallVector<SDValue, 8> Opnds;
22200 DenseMap<SDValue, APInt> SrcOpMap;
22201 EVT VT = MVT::Other;
22202
22203 // Recognize a special case where a vector is cast into a wide integer to
22204 // test all 0s.
22205 assert(Op.getOpcode() == unsigned(BinOp) &&
22206 "Unexpected bit reduction opcode");
22207 Opnds.push_back(Op.getOperand(0));
22208 Opnds.push_back(Op.getOperand(1));
22209
22210 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22211 SDValue I = Opnds[Slot];
22212 // BFS traverse all BinOp operands.
22213 if (I->getOpcode() == unsigned(BinOp)) {
22214 Opnds.push_back(I->getOperand(0));
22215 Opnds.push_back(I->getOperand(1));
22216 // Re-evaluate the number of nodes to be traversed.
22217 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22218 continue;
22219 }
22220
22221 // Quit if a non-EXTRACT_VECTOR_ELT
22222 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22223 return false;
22224
22225 // Quit if without a constant index.
22226 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22227 if (!Idx)
22228 return false;
22229
22230 SDValue Src = I->getOperand(0);
22231 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22232 if (M == SrcOpMap.end()) {
22233 VT = Src.getValueType();
22234 // Quit if not the same type.
22235 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22236 return false;
22237 unsigned NumElts = VT.getVectorNumElements();
22238 APInt EltCount = APInt::getZero(NumElts);
22239 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22240 SrcOps.push_back(Src);
22241 }
22242
22243 // Quit if element already used.
22244 unsigned CIdx = Idx->getZExtValue();
22245 if (M->second[CIdx])
22246 return false;
22247 M->second.setBit(CIdx);
22248 }
22249
22250 if (SrcMask) {
22251 // Collect the source partial masks.
22252 for (SDValue &SrcOp : SrcOps)
22253 SrcMask->push_back(SrcOpMap[SrcOp]);
22254 } else {
22255 // Quit if not all elements are used.
22256 for (const auto &I : SrcOpMap)
22257 if (!I.second.isAllOnes())
22258 return false;
22259 }
22260
22261 return true;
22262}
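// Example of a successful match: with X of type v4i32,
//   or (or (extractelt X, 0), (extractelt X, 1)),
//      (or (extractelt X, 2), (extractelt X, 3))
// records X in SrcOps with all four lanes used. If only some lanes appear, the
// match succeeds only when the caller passes SrcMask to accept a partial reduction.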
22263
22264// Helper function for comparing all bits of two vectors.
22265 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22266 ISD::CondCode CC, const APInt &OriginalMask,
22267 const X86Subtarget &Subtarget,
22268 SelectionDAG &DAG, X86::CondCode &X86CC) {
22269 EVT VT = LHS.getValueType();
22270 unsigned ScalarSize = VT.getScalarSizeInBits();
22271 if (OriginalMask.getBitWidth() != ScalarSize) {
22272 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22273 return SDValue();
22274 }
22275
22276 // Quit if not convertible to a legal scalar or 128/256-bit vector.
22277 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22278 return SDValue();
22279
22280 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22281 if (VT.isFloatingPoint())
22282 return SDValue();
22283
22284 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22285 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22286
22287 APInt Mask = OriginalMask;
22288
22289 auto MaskBits = [&](SDValue Src) {
22290 if (Mask.isAllOnes())
22291 return Src;
22292 EVT SrcVT = Src.getValueType();
22293 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22294 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22295 };
22296
22297 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22298 if (VT.getSizeInBits() < 128) {
22299 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22300 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22301 if (IntVT != MVT::i64)
22302 return SDValue();
22303 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22304 MVT::i32, MVT::i32);
22305 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22306 MVT::i32, MVT::i32);
22307 SDValue Lo =
22308 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22309 SDValue Hi =
22310 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22311 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22312 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22313 DAG.getConstant(0, DL, MVT::i32));
22314 }
22315 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22316 DAG.getBitcast(IntVT, MaskBits(LHS)),
22317 DAG.getBitcast(IntVT, MaskBits(RHS)));
22318 }
22319
22320 // Without PTEST, a masked v2i64 or-reduction is not faster than
22321 // scalarization.
22322 bool UseKORTEST = Subtarget.useAVX512Regs();
22323 bool UsePTEST = Subtarget.hasSSE41();
22324 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22325 return SDValue();
22326
22327 // Split down to 128/256/512-bit vector.
22328 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22329
22330 // If the input vector has vector elements wider than the target test size,
22331 // then cast to <X x i64> so it will safely split.
22332 if (ScalarSize > TestSize) {
22333 if (!Mask.isAllOnes())
22334 return SDValue();
22335 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22336 LHS = DAG.getBitcast(VT, LHS);
22337 RHS = DAG.getBitcast(VT, RHS);
22338 Mask = APInt::getAllOnes(64);
22339 }
22340
22341 if (VT.getSizeInBits() > TestSize) {
22342 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22343 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22344 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22345 while (VT.getSizeInBits() > TestSize) {
22346 auto Split = DAG.SplitVector(LHS, DL);
22347 VT = Split.first.getValueType();
22348 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22349 }
22350 RHS = DAG.getAllOnesConstant(DL, VT);
22351 } else if (!UsePTEST && !KnownRHS.isZero()) {
22352 // MOVMSK Special Case:
22353 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22354 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22355 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22356 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22357 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22358 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22359 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22360 V = DAG.getSExtOrTrunc(V, DL, VT);
22361 while (VT.getSizeInBits() > TestSize) {
22362 auto Split = DAG.SplitVector(V, DL);
22363 VT = Split.first.getValueType();
22364 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22365 }
22366 V = DAG.getNOT(DL, V, VT);
22367 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22368 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22369 DAG.getConstant(0, DL, MVT::i32));
22370 } else {
22371 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22372 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22373 while (VT.getSizeInBits() > TestSize) {
22374 auto Split = DAG.SplitVector(V, DL);
22375 VT = Split.first.getValueType();
22376 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22377 }
22378 LHS = V;
22379 RHS = DAG.getConstant(0, DL, VT);
22380 }
22381 }
22382
22383 if (UseKORTEST && VT.is512BitVector()) {
22384 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22385 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22386 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22387 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22388 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22389 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22390 }
22391
22392 if (UsePTEST) {
22393 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22394 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22395 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22396 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22397 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22398 }
22399
22400 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22401 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22402 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22403 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22404 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22405 V = DAG.getNOT(DL, V, MaskVT);
22406 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22407 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22408 DAG.getConstant(0, DL, MVT::i32));
22409}
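// Summary of the strategies chosen above for an all-bits-equal test:
//   sub-128-bit : compare as a (possibly split) scalar integer with CMP
//   SSE2 only   : PCMPEQ, invert, MOVMSK, then compare the mask against 0
//   SSE4.1/AVX  : XOR the operands and PTEST the result against itself
//   AVX512      : SETNE into a mask register and KORTEST it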
22410
22411 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall
22412 // back to CMP(MOVMSK(PCMPEQB(X,Y))).
22413 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22414 ISD::CondCode CC, const SDLoc &DL,
22415 const X86Subtarget &Subtarget,
22416 SelectionDAG &DAG,
22417 X86::CondCode &X86CC) {
22418 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22419
22420 bool CmpNull = isNullConstant(RHS);
22421 bool CmpAllOnes = isAllOnesConstant(RHS);
22422 if (!CmpNull && !CmpAllOnes)
22423 return SDValue();
22424
22425 SDValue Op = LHS;
22426 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22427 return SDValue();
22428
22429 // Check whether we're masking/truncating an OR-reduction result, in which
22430 // case track the masked bits.
22431 // TODO: Add CmpAllOnes support.
22432 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22433 if (CmpNull) {
22434 switch (Op.getOpcode()) {
22435 case ISD::TRUNCATE: {
22436 SDValue Src = Op.getOperand(0);
22437 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22438 Op.getScalarValueSizeInBits());
22439 Op = Src;
22440 break;
22441 }
22442 case ISD::AND: {
22443 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22444 Mask = Cst->getAPIntValue();
22445 Op = Op.getOperand(0);
22446 }
22447 break;
22448 }
22449 }
22450 }
22451
22452 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22453
22454 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22455 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22456 SmallVector<SDValue, 8> VecIns;
22457 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22458 EVT VT = VecIns[0].getValueType();
22459 assert(llvm::all_of(VecIns,
22460 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22461 "Reduction source vector mismatch");
22462
22463 // Quit if not splittable to scalar/128/256/512-bit vector.
22464 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22465 return SDValue();
22466
22467 // If more than one full vector is evaluated, AND/OR them first before
22468 // PTEST.
22469 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22470 Slot += 2, e += 1) {
22471 // Each iteration will AND/OR 2 nodes and append the result until there is
22472 // only 1 node left, i.e. the final value of all vectors.
22473 SDValue LHS = VecIns[Slot];
22474 SDValue RHS = VecIns[Slot + 1];
22475 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22476 }
22477
22478 return LowerVectorAllEqual(DL, VecIns.back(),
22479 CmpNull ? DAG.getConstant(0, DL, VT)
22480 : DAG.getAllOnesConstant(DL, VT),
22481 CC, Mask, Subtarget, DAG, X86CC);
22482 }
22483
22484 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22485 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22486 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22487 ISD::NodeType BinOp;
22488 if (SDValue Match =
22489 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22490 EVT MatchVT = Match.getValueType();
22491 return LowerVectorAllEqual(DL, Match,
22492 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22493 : DAG.getAllOnesConstant(DL, MatchVT),
22494 CC, Mask, Subtarget, DAG, X86CC);
22495 }
22496 }
22497
22498 if (Mask.isAllOnes()) {
22499 assert(!Op.getValueType().isVector() &&
22500 "Illegal vector type for reduction pattern");
22501 SDValue Src = peekThroughBitcasts(Op);
22502 if (Src.getValueType().isFixedLengthVector() &&
22503 Src.getValueType().getScalarType() == MVT::i1) {
22504 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22505 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22506 if (Src.getOpcode() == ISD::SETCC) {
22507 SDValue LHS = Src.getOperand(0);
22508 SDValue RHS = Src.getOperand(1);
22509 EVT LHSVT = LHS.getValueType();
22510 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22511 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22512 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22513 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22514 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22515 X86CC);
22516 }
22517 }
22518 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22519 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22520 // Peek through truncation, mask the LSB and compare against zero/LSB.
22521 if (Src.getOpcode() == ISD::TRUNCATE) {
22522 SDValue Inner = Src.getOperand(0);
22523 EVT InnerVT = Inner.getValueType();
22524 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22525 unsigned BW = InnerVT.getScalarSizeInBits();
22526 APInt SrcMask = APInt(BW, 1);
22527 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22528 return LowerVectorAllEqual(DL, Inner,
22529 DAG.getConstant(Cmp, DL, InnerVT), CC,
22530 SrcMask, Subtarget, DAG, X86CC);
22531 }
22532 }
22533 }
22534 }
22535
22536 return SDValue();
22537}
22538
22539/// return true if \c Op has a use that doesn't just read flags.
22540 static bool hasNonFlagsUse(SDValue Op) {
22541 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22542 ++UI) {
22543 SDNode *User = *UI;
22544 unsigned UOpNo = UI.getOperandNo();
22545 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22546 // Look past the truncate.
22547 UOpNo = User->use_begin().getOperandNo();
22548 User = *User->use_begin();
22549 }
22550
22551 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22552 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22553 return true;
22554 }
22555 return false;
22556}
22557
22558// Transform to an x86-specific ALU node with flags if there is a chance of
22559// using an RMW op or only the flags are used. Otherwise, leave
22560// the node alone and emit a 'cmp' or 'test' instruction.
22561 static bool isProfitableToUseFlagOp(SDValue Op) {
22562 for (SDNode *U : Op->uses())
22563 if (U->getOpcode() != ISD::CopyToReg &&
22564 U->getOpcode() != ISD::SETCC &&
22565 U->getOpcode() != ISD::STORE)
22566 return false;
22567
22568 return true;
22569}
22570
22571/// Emit nodes that will be selected as "test Op0,Op0", or something
22572/// equivalent.
22573static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22574 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22575 // CF and OF aren't always set the way we want. Determine which
22576 // of these we need.
22577 bool NeedCF = false;
22578 bool NeedOF = false;
22579 switch (X86CC) {
22580 default: break;
22581 case X86::COND_A: case X86::COND_AE:
22582 case X86::COND_B: case X86::COND_BE:
22583 NeedCF = true;
22584 break;
22585 case X86::COND_G: case X86::COND_GE:
22586 case X86::COND_L: case X86::COND_LE:
22587 case X86::COND_O: case X86::COND_NO: {
22588 // Check if we really need to set the
22589 // Overflow flag. If NoSignedWrap is present
22590 // that is not actually needed.
22591 switch (Op->getOpcode()) {
22592 case ISD::ADD:
22593 case ISD::SUB:
22594 case ISD::MUL:
22595 case ISD::SHL:
22596 if (Op.getNode()->getFlags().hasNoSignedWrap())
22597 break;
22598 [[fallthrough]];
22599 default:
22600 NeedOF = true;
22601 break;
22602 }
22603 break;
22604 }
22605 }
22606 // See if we can use the EFLAGS value from the operand instead of
22607 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22608 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22609 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22610 // Emit a CMP with 0, which is the TEST pattern.
22611 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22612 DAG.getConstant(0, dl, Op.getValueType()));
22613 }
22614 unsigned Opcode = 0;
22615 unsigned NumOperands = 0;
22616
22617 SDValue ArithOp = Op;
22618
22619 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22620 // which may be the result of a CAST. We use the variable 'Op', which is the
22621 // non-casted variable when we check for possible users.
22622 switch (ArithOp.getOpcode()) {
22623 case ISD::AND:
22624 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22625 // because a TEST instruction will be better.
22626 if (!hasNonFlagsUse(Op))
22627 break;
22628
22629 [[fallthrough]];
22630 case ISD::ADD:
22631 case ISD::SUB:
22632 case ISD::OR:
22633 case ISD::XOR:
22634 if (!isProfitableToUseFlagOp(Op))
22635 break;
22636
22637 // Otherwise use a regular EFLAGS-setting instruction.
22638 switch (ArithOp.getOpcode()) {
22639 // clang-format off
22640 default: llvm_unreachable("unexpected operator!");
22641 case ISD::ADD: Opcode = X86ISD::ADD; break;
22642 case ISD::SUB: Opcode = X86ISD::SUB; break;
22643 case ISD::XOR: Opcode = X86ISD::XOR; break;
22644 case ISD::AND: Opcode = X86ISD::AND; break;
22645 case ISD::OR: Opcode = X86ISD::OR; break;
22646 // clang-format on
22647 }
22648
22649 NumOperands = 2;
22650 break;
22651 case X86ISD::ADD:
22652 case X86ISD::SUB:
22653 case X86ISD::OR:
22654 case X86ISD::XOR:
22655 case X86ISD::AND:
22656 return SDValue(Op.getNode(), 1);
22657 case ISD::SSUBO:
22658 case ISD::USUBO: {
22659 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22660 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22661 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22662 Op->getOperand(1)).getValue(1);
22663 }
22664 default:
22665 break;
22666 }
22667
22668 if (Opcode == 0) {
22669 // Emit a CMP with 0, which is the TEST pattern.
22670 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22671 DAG.getConstant(0, dl, Op.getValueType()));
22672 }
22673 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22674 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22675
22676 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22677 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22678 return SDValue(New.getNode(), 1);
22679}
22680
22681/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22682/// equivalent.
22683static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22684 const SDLoc &dl, SelectionDAG &DAG,
22685 const X86Subtarget &Subtarget) {
22686 if (isNullConstant(Op1))
22687 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22688
22689 EVT CmpVT = Op0.getValueType();
22690
22691 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22692 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22693
22694 // Only promote the compare up to I32 if it is a 16 bit operation
22695 // with an immediate. 16 bit immediates are to be avoided unless the target
22696 // isn't slowed down by length-changing prefixes, we're optimizing for
22697 // code size, or the comparison is with a folded load.
22698 if (CmpVT == MVT::i16 && !Subtarget.hasFastImm16() &&
22699 !X86::mayFoldLoad(Op0, Subtarget) && !X86::mayFoldLoad(Op1, Subtarget) &&
22700 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22701 auto *COp0 = dyn_cast<ConstantSDNode>(Op0);
22702 auto *COp1 = dyn_cast<ConstantSDNode>(Op1);
22703 // Don't do this if the immediate can fit in 8-bits.
22704 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22705 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22706 unsigned ExtendOp =
22707 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22708 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22709 // For equality comparisons try to use SIGN_EXTEND if the input was
22710 // truncate from something with enough sign bits.
22711 if (Op0.getOpcode() == ISD::TRUNCATE) {
22712 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22713 ExtendOp = ISD::SIGN_EXTEND;
22714 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22715 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22716 ExtendOp = ISD::SIGN_EXTEND;
22717 }
22718 }
22719
22720 CmpVT = MVT::i32;
22721 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22722 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22723 }
22724 }
22725
22726 // Try to shrink i64 compares if the input has enough zero bits.
22727 // TODO: Add sign-bits equivalent for isX86CCSigned(X86CC)?
22728 if (CmpVT == MVT::i64 && !isX86CCSigned(X86CC) &&
22729 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22730 DAG.MaskedValueIsZero(Op1, APInt::getHighBitsSet(64, 32)) &&
22731 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22732 CmpVT = MVT::i32;
22733 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22734 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22735 }
22736
22737 // 0-x == y --> x+y == 0
22738 // 0-x != y --> x+y != 0
22739 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22740 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22741 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22742 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22743 return Add.getValue(1);
22744 }
22745
22746 // x == 0-y --> x+y == 0
22747 // x != 0-y --> x+y != 0
22748 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22749 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22750 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22751 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22752 return Add.getValue(1);
22753 }
22754
22755 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22756 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22757 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22758 return Sub.getValue(1);
22759}
22760
22761 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22762 EVT VT) const {
22763 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22764}
22765
22766bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22767 SDNode *N, SDValue, SDValue IntPow2) const {
22768 if (N->getOpcode() == ISD::FDIV)
22769 return true;
22770
22771 EVT FPVT = N->getValueType(0);
22772 EVT IntVT = IntPow2.getValueType();
22773
22774 // This indicates a non-free bitcast.
22775 // TODO: This is probably overly conservative as we will need to scale the
22776 // integer vector anyways for the int->fp cast.
22777 if (FPVT.isVector() &&
22778 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22779 return false;
22780
22781 return true;
22782}
22783
22784/// Check if replacement of SQRT with RSQRT should be disabled.
22785bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22786 EVT VT = Op.getValueType();
22787
22788 // We don't need to replace SQRT with RSQRT for half type.
22789 if (VT.getScalarType() == MVT::f16)
22790 return true;
22791
22792 // We never want to use both SQRT and RSQRT instructions for the same input.
22793 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22794 return false;
22795
22796 if (VT.isVector())
22797 return Subtarget.hasFastVectorFSQRT();
22798 return Subtarget.hasFastScalarFSQRT();
22799}
22800
22801/// The minimum architected relative accuracy is 2^-12. We need one
22802/// Newton-Raphson step to have a good float result (24 bits of precision).
22803SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22804 SelectionDAG &DAG, int Enabled,
22805 int &RefinementSteps,
22806 bool &UseOneConstNR,
22807 bool Reciprocal) const {
22808 SDLoc DL(Op);
22809 EVT VT = Op.getValueType();
22810
22811 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22812 // It is likely not profitable to do this for f64 because a double-precision
22813 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22814 // instructions: convert to single, rsqrtss, convert back to double, refine
22815 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22816 // along with FMA, this could be a throughput win.
22817 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22818 // after legalize types.
22819 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22820 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22821 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22822 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22823 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22824 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22825 RefinementSteps = 1;
22826
22827 UseOneConstNR = false;
22828 // There is no FSQRT for 512-bits, but there is RSQRT14.
22829 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22830 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22831 if (RefinementSteps == 0 && !Reciprocal)
22832 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22833 return Estimate;
22834 }
22835
22836 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22837 Subtarget.hasFP16()) {
22838 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22839 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22840 RefinementSteps = 0;
22841
22842 if (VT == MVT::f16) {
22843 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22844 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22845 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22846 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22847 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22848 }
22849
22850 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22851 }
22852 return SDValue();
22853}
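// The refinement itself is inserted by the generic reciprocal-estimate code,
// not here; one Newton-Raphson step is roughly
//   Est' = Est * (1.5 - 0.5 * Op * Est * Est)
// which lifts the ~2^-12 accuracy of RSQRTPS/RSQRT14 to about 23-24 bits for f32.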
22854
22855/// The minimum architected relative accuracy is 2^-12. We need one
22856/// Newton-Raphson step to have a good float result (24 bits of precision).
22857SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22858 int Enabled,
22859 int &RefinementSteps) const {
22860 SDLoc DL(Op);
22861 EVT VT = Op.getValueType();
22862
22863 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22864 // It is likely not profitable to do this for f64 because a double-precision
22865 // reciprocal estimate with refinement on x86 prior to FMA requires
22866 // 15 instructions: convert to single, rcpss, convert back to double, refine
22867 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22868 // along with FMA, this could be a throughput win.
22869
22870 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22871 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22872 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22873 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22874 // Enable estimate codegen with 1 refinement step for vector division.
22875 // Scalar division estimates are disabled because they break too much
22876 // real-world code. These defaults are intended to match GCC behavior.
22877 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22878 return SDValue();
22879
22880 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22881 RefinementSteps = 1;
22882
22883 // There is no FSQRT for 512-bits, but there is RCP14.
22884 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22885 return DAG.getNode(Opcode, DL, VT, Op);
22886 }
22887
22888 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22889 Subtarget.hasFP16()) {
22890 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22891 RefinementSteps = 0;
22892
22893 if (VT == MVT::f16) {
22894 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22895 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22896 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22897 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22898 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22899 }
22900
22901 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22902 }
22903 return SDValue();
22904}
22905
22906/// If we have at least two divisions that use the same divisor, convert to
22907/// multiplication by a reciprocal. This may need to be adjusted for a given
22908/// CPU if a division's cost is not at least twice the cost of a multiplication.
22909/// This is because we still need one division to calculate the reciprocal and
22910/// then we need two multiplies by that reciprocal as replacements for the
22911/// original divisions.
22912unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22913 return 2;
22914}
22915
22916SDValue
22917X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22918 SelectionDAG &DAG,
22919 SmallVectorImpl<SDNode *> &Created) const {
22920 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22921 if (isIntDivCheap(N->getValueType(0), Attr))
22922 return SDValue(N,0); // Lower SDIV as SDIV
22923
22924 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22925 "Unexpected divisor!");
22926
22927 // Only perform this transform if CMOV is supported otherwise the select
22928 // below will become a branch.
22929 if (!Subtarget.canUseCMOV())
22930 return SDValue();
22931
22932 // fold (sdiv X, pow2)
22933 EVT VT = N->getValueType(0);
22934 // FIXME: Support i8.
22935 if (VT != MVT::i16 && VT != MVT::i32 &&
22936 !(Subtarget.is64Bit() && VT == MVT::i64))
22937 return SDValue();
22938
22939 // If the divisor is 2 or -2, the default expansion is better.
22940 if (Divisor == 2 ||
22941 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22942 return SDValue();
22943
22944 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22945}
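// Rough shape of the CMOV-based expansion requested above, e.g. for x / 4 (i32):
//   lea    tmp, [x + 3]     ; x + (|divisor| - 1)
//   test   x, x
//   cmovns tmp, x           ; keep x itself when it is non-negative
//   sar    tmp, 2           ; plus a negate afterwards for a divisor of -4
// (illustrative pseudo-assembly; the exact sequence comes from
// TargetLowering::buildSDIVPow2WithCMov).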
22946
22947/// Result of 'and' is compared against zero. Change to a BT node if possible.
22948/// Returns the BT node and the condition code needed to use it.
22949 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22950 SelectionDAG &DAG, X86::CondCode &X86CC) {
22951 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22952 SDValue Op0 = And.getOperand(0);
22953 SDValue Op1 = And.getOperand(1);
22954 if (Op0.getOpcode() == ISD::TRUNCATE)
22955 Op0 = Op0.getOperand(0);
22956 if (Op1.getOpcode() == ISD::TRUNCATE)
22957 Op1 = Op1.getOperand(0);
22958
22959 SDValue Src, BitNo;
22960 if (Op1.getOpcode() == ISD::SHL)
22961 std::swap(Op0, Op1);
22962 if (Op0.getOpcode() == ISD::SHL) {
22963 if (isOneConstant(Op0.getOperand(0))) {
22964 // If we looked past a truncate, check that it's only truncating away
22965 // known zeros.
22966 unsigned BitWidth = Op0.getValueSizeInBits();
22967 unsigned AndBitWidth = And.getValueSizeInBits();
22968 if (BitWidth > AndBitWidth) {
22969 KnownBits Known = DAG.computeKnownBits(Op0);
22970 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22971 return SDValue();
22972 }
22973 Src = Op1;
22974 BitNo = Op0.getOperand(1);
22975 }
22976 } else if (Op1.getOpcode() == ISD::Constant) {
22977 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22978 uint64_t AndRHSVal = AndRHS->getZExtValue();
22979 SDValue AndLHS = Op0;
22980
22981 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22982 Src = AndLHS.getOperand(0);
22983 BitNo = AndLHS.getOperand(1);
22984 } else {
22985 // Use BT if the immediate can't be encoded in a TEST instruction or we
22986 // are optimizing for size and the immediate won't fit in a byte.
22987 bool OptForSize = DAG.shouldOptForSize();
22988 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22989 isPowerOf2_64(AndRHSVal)) {
22990 Src = AndLHS;
22991 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22992 Src.getValueType());
22993 }
22994 }
22995 }
22996
22997 // No patterns found, give up.
22998 if (!Src.getNode())
22999 return SDValue();
23000
23001 // Remove any bit flip.
23002 if (isBitwiseNot(Src)) {
23003 Src = Src.getOperand(0);
23004 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
23005 }
23006
23007 // Attempt to create the X86ISD::BT node.
23008 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
23009 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23010 return BT;
23011 }
23012
23013 return SDValue();
23014}
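// For instance, "(x >> n) & 1" or "x & (1 << n)" tested against zero becomes
// BT x, n with COND_B (CF set, bit is one) for SETNE and COND_AE (CF clear,
// bit is zero) for SETEQ, avoiding a variable shift plus TEST.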
23015
23016// Check if pre-AVX condcode can be performed by a single FCMP op.
23017static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23018 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23019}
23020
23021/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23022/// CMPs.
23023static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23024 SDValue &Op1, bool &IsAlwaysSignaling) {
23025 unsigned SSECC;
23026 bool Swap = false;
23027
23028 // SSE Condition code mapping:
23029 // 0 - EQ
23030 // 1 - LT
23031 // 2 - LE
23032 // 3 - UNORD
23033 // 4 - NEQ
23034 // 5 - NLT
23035 // 6 - NLE
23036 // 7 - ORD
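 // For example, SETOLT(a, b) maps directly to SSECC 1 (LT), while SETOGT(a, b)
 // has no direct pre-AVX encoding, so the operands are swapped and LT(b, a) is
 // used instead; the Swap flag below records that adjustment.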
23037 switch (SetCCOpcode) {
23038 // clang-format off
23039 default: llvm_unreachable("Unexpected SETCC condition");
23040 case ISD::SETOEQ:
23041 case ISD::SETEQ: SSECC = 0; break;
23042 case ISD::SETOGT:
23043 case ISD::SETGT: Swap = true; [[fallthrough]];
23044 case ISD::SETLT:
23045 case ISD::SETOLT: SSECC = 1; break;
23046 case ISD::SETOGE:
23047 case ISD::SETGE: Swap = true; [[fallthrough]];
23048 case ISD::SETLE:
23049 case ISD::SETOLE: SSECC = 2; break;
23050 case ISD::SETUO: SSECC = 3; break;
23051 case ISD::SETUNE:
23052 case ISD::SETNE: SSECC = 4; break;
23053 case ISD::SETULE: Swap = true; [[fallthrough]];
23054 case ISD::SETUGE: SSECC = 5; break;
23055 case ISD::SETULT: Swap = true; [[fallthrough]];
23056 case ISD::SETUGT: SSECC = 6; break;
23057 case ISD::SETO: SSECC = 7; break;
23058 case ISD::SETUEQ: SSECC = 8; break;
23059 case ISD::SETONE: SSECC = 12; break;
23060 // clang-format on
23061 }
23062 if (Swap)
23063 std::swap(Op0, Op1);
23064
23065 switch (SetCCOpcode) {
23066 default:
23067 IsAlwaysSignaling = true;
23068 break;
23069 case ISD::SETEQ:
23070 case ISD::SETOEQ:
23071 case ISD::SETUEQ:
23072 case ISD::SETNE:
23073 case ISD::SETONE:
23074 case ISD::SETUNE:
23075 case ISD::SETO:
23076 case ISD::SETUO:
23077 IsAlwaysSignaling = false;
23078 break;
23079 }
23080
23081 return SSECC;
23082}
23083
23084/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23085/// concatenate the result back.
23086static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23087 ISD::CondCode Cond, SelectionDAG &DAG,
23088 const SDLoc &dl) {
23089 assert(VT.isInteger() && VT == LHS.getValueType() &&
23090 VT == RHS.getValueType() && "Unsupported VTs!");
23091
23092 SDValue CC = DAG.getCondCode(Cond);
23093
23094 // Extract the LHS Lo/Hi vectors
23095 SDValue LHS1, LHS2;
23096 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23097
23098 // Extract the RHS Lo/Hi vectors
23099 SDValue RHS1, RHS2;
23100 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23101
23102 // Issue the operation on the smaller types and concatenate the result back
23103 EVT LoVT, HiVT;
23104 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23105 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23106 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23107 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23108}
23109
23110static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl,
23111 SelectionDAG &DAG) {
23112 SDValue Op0 = Op.getOperand(0);
23113 SDValue Op1 = Op.getOperand(1);
23114 SDValue CC = Op.getOperand(2);
23115 MVT VT = Op.getSimpleValueType();
23116 assert(VT.getVectorElementType() == MVT::i1 &&
23117 "Cannot set masked compare for this operation");
23118
23119 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23120
23121 // Prefer SETGT over SETLT.
23122 if (SetCCOpcode == ISD::SETLT) {
23123 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23124 std::swap(Op0, Op1);
23125 }
23126
23127 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23128}
23129
23130/// Given a buildvector constant, return a new vector constant with each element
23131/// incremented or decremented. If incrementing or decrementing would result in
23132/// unsigned overflow or underflow or this is not a simple vector constant,
23133/// return an empty value.
23134static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23135 bool NSW) {
23136 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23137 if (!BV || !V.getValueType().isSimple())
23138 return SDValue();
23139
23140 MVT VT = V.getSimpleValueType();
23141 MVT EltVT = VT.getVectorElementType();
23142 unsigned NumElts = VT.getVectorNumElements();
23143 SmallVector<SDValue, 8> NewVecC;
23144 SDLoc DL(V);
23145 for (unsigned i = 0; i < NumElts; ++i) {
23146 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23147 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23148 return SDValue();
23149
23150 // Avoid overflow/underflow.
23151 const APInt &EltC = Elt->getAPIntValue();
23152 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23153 return SDValue();
23154 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23155 (!IsInc && EltC.isMinSignedValue())))
23156 return SDValue();
23157
23158 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23159 }
23160
23161 return DAG.getBuildVector(VT, DL, NewVecC);
23162}
23163
23164/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23165/// Op0 u<= Op1:
23166/// t = psubus Op0, Op1
23167/// pcmpeq t, <0..0>
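/// For example, for v16i8 with Op1 a splat of 42, "Op0 u<= 42" becomes
/// psubusb Op0, <42..42>, which is zero exactly in the lanes where Op0 <= 42,
/// followed by pcmpeqb against zero to produce the mask.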
23168static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23169 ISD::CondCode Cond, const SDLoc &dl,
23170 const X86Subtarget &Subtarget,
23171 SelectionDAG &DAG) {
23172 if (!Subtarget.hasSSE2())
23173 return SDValue();
23174
23175 MVT VET = VT.getVectorElementType();
23176 if (VET != MVT::i8 && VET != MVT::i16)
23177 return SDValue();
23178
23179 switch (Cond) {
23180 default:
23181 return SDValue();
23182 case ISD::SETULT: {
23183 // If the comparison is against a constant, we can turn this into a
23184 // setule. With psubus, setule does not require a swap. This is
23185 // beneficial because the constant in the register is no longer
23186 // clobbered as the destination, so it can be hoisted out of a loop.
23187 // Only do this pre-AVX, since AVX's vpcmp* is not destructive.
23188 if (Subtarget.hasAVX())
23189 return SDValue();
23190 SDValue ULEOp1 =
23191 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23192 if (!ULEOp1)
23193 return SDValue();
23194 Op1 = ULEOp1;
23195 break;
23196 }
23197 case ISD::SETUGT: {
23198 // If the comparison is against a constant, we can turn this into a setuge.
23199 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23200 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23201 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23202 SDValue UGEOp1 =
23203 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23204 if (!UGEOp1)
23205 return SDValue();
23206 Op1 = Op0;
23207 Op0 = UGEOp1;
23208 break;
23209 }
23210 // Psubus is better than flip-sign because it requires no inversion.
23211 case ISD::SETUGE:
23212 std::swap(Op0, Op1);
23213 break;
23214 case ISD::SETULE:
23215 break;
23216 }
23217
23218 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23219 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23220 DAG.getConstant(0, dl, VT));
23221}
23222
23223static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23224 SelectionDAG &DAG) {
23225 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23226 Op.getOpcode() == ISD::STRICT_FSETCCS;
23227 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23228 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23229 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23230 MVT VT = Op->getSimpleValueType(0);
23231 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23232 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23233 SDLoc dl(Op);
23234
23235 if (isFP) {
23236 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23237 assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23238 if (isSoftF16(EltVT, Subtarget))
23239 return SDValue();
23240
23241 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23242 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23243
23244 // If we have a strict compare with a vXi1 result and the input is 128/256
23245 // bits we can't use a masked compare unless we have VLX. If we use a wider
23246 // compare like we do for non-strict, we might trigger spurious exceptions
23247 // from the upper elements. Instead emit an AVX compare and convert to mask.
23248 unsigned Opc;
23249 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23250 (!IsStrict || Subtarget.hasVLX() ||
23251 Op0.getSimpleValueType().is512BitVector())) {
23252#ifndef NDEBUG
23253 unsigned Num = VT.getVectorNumElements();
23254 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23255#endif
23256 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23257 } else {
23258 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23259 // The SSE/AVX packed FP comparison nodes are defined with a
23260 // floating-point vector result that matches the operand type. This allows
23261 // them to work with an SSE1 target (integer vector types are not legal).
23262 VT = Op0.getSimpleValueType();
23263 }
23264
23265 SDValue Cmp;
23266 bool IsAlwaysSignaling;
23267 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23268 if (!Subtarget.hasAVX()) {
23269 // TODO: We could use the following steps to handle a quiet compare with
23270 // signaling encodings.
23271 // 1. Get ordered masks from a quiet ISD::SETO
23272 // 2. Use the masks to mask potential unordered elements in operand A, B
23273 // 3. Get the compare results of masked A, B
23274 // 4. Calculate the final result using the mask and the result from step 3
23275 // But currently, we just fall back to scalar operations.
23276 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23277 return SDValue();
23278
23279 // Insert an extra signaling instruction to raise exception.
23280 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23281 SDValue SignalCmp = DAG.getNode(
23282 Opc, dl, {VT, MVT::Other},
23283 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23284 // FIXME: It seems we need to update the flags of all new strict nodes.
23285 // Otherwise, mayRaiseFPException in MI will return false due to
23286 // NoFPExcept = false by default. However, I didn't find it in other
23287 // patches.
23288 SignalCmp->setFlags(Op->getFlags());
23289 Chain = SignalCmp.getValue(1);
23290 }
23291
23292 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23293 // emit two comparisons and a logic op to tie them together.
23294 if (!cheapX86FSETCC_SSE(Cond)) {
23295 // LLVM predicate is SETUEQ or SETONE.
23296 unsigned CC0, CC1;
23297 unsigned CombineOpc;
23298 if (Cond == ISD::SETUEQ) {
23299 CC0 = 3; // UNORD
23300 CC1 = 0; // EQ
23301 CombineOpc = X86ISD::FOR;
23302 } else {
23303 assert(Cond == ISD::SETONE);
23304 CC0 = 7; // ORD
23305 CC1 = 4; // NEQ
23306 CombineOpc = X86ISD::FAND;
23307 }
23308
23309 SDValue Cmp0, Cmp1;
23310 if (IsStrict) {
23311 Cmp0 = DAG.getNode(
23312 Opc, dl, {VT, MVT::Other},
23313 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23314 Cmp1 = DAG.getNode(
23315 Opc, dl, {VT, MVT::Other},
23316 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23317 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23318 Cmp1.getValue(1));
23319 } else {
23320 Cmp0 = DAG.getNode(
23321 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23322 Cmp1 = DAG.getNode(
23323 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23324 }
23325 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23326 } else {
23327 if (IsStrict) {
23328 Cmp = DAG.getNode(
23329 Opc, dl, {VT, MVT::Other},
23330 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23331 Chain = Cmp.getValue(1);
23332 } else
23333 Cmp = DAG.getNode(
23334 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23335 }
23336 } else {
23337 // Handle all other FP comparisons here.
23338 if (IsStrict) {
23339 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
23340 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23341 Cmp = DAG.getNode(
23342 Opc, dl, {VT, MVT::Other},
23343 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23344 Chain = Cmp.getValue(1);
23345 } else
23346 Cmp = DAG.getNode(
23347 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23348 }
23349
23350 if (VT.getFixedSizeInBits() >
23351 Op.getSimpleValueType().getFixedSizeInBits()) {
23352 // We emitted a compare with an XMM/YMM result. Finish converting to a
23353 // mask register using a vptestm.
23354 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
23355 Cmp = DAG.getBitcast(CastVT, Cmp);
23356 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23357 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23358 } else {
23359 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23360 // the result type of SETCC. The bitcast is expected to be optimized
23361 // away during combining/isel.
23362 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23363 }
23364
23365 if (IsStrict)
23366 return DAG.getMergeValues({Cmp, Chain}, dl);
23367
23368 return Cmp;
23369 }
23370
23371 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23372
23373 MVT VTOp0 = Op0.getSimpleValueType();
23374 (void)VTOp0;
23375 assert(VTOp0 == Op1.getSimpleValueType() &&
23376 "Expected operands with same type!");
23378 "Invalid number of packed elements for source and destination!");
23379
23380 // The non-AVX512 code below works under the assumption that source and
23381 // destination types are the same.
23382 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23383 "Value types for source and destination must be the same!");
23384
23385 // The result is boolean, but operands are int/float
23386 if (VT.getVectorElementType() == MVT::i1) {
23387 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23388 // but there is no compare instruction for i8 and i16 elements in KNL.
23389 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23390 "Unexpected operand type");
23391 return LowerIntVSETCC_AVX512(Op, dl, DAG);
23392 }
23393
23394 // Lower using XOP integer comparisons.
23395 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23396 // Translate compare code to XOP PCOM compare mode.
23397 unsigned CmpMode = 0;
23398 switch (Cond) {
23399 // clang-format off
23400 default: llvm_unreachable("Unexpected SETCC condition");
23401 case ISD::SETULT:
23402 case ISD::SETLT: CmpMode = 0x00; break;
23403 case ISD::SETULE:
23404 case ISD::SETLE: CmpMode = 0x01; break;
23405 case ISD::SETUGT:
23406 case ISD::SETGT: CmpMode = 0x02; break;
23407 case ISD::SETUGE:
23408 case ISD::SETGE: CmpMode = 0x03; break;
23409 case ISD::SETEQ: CmpMode = 0x04; break;
23410 case ISD::SETNE: CmpMode = 0x05; break;
23411 // clang-format on
23412 }
23413
23414 // Are we comparing unsigned or signed integers?
23415 unsigned Opc =
23416 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23417
23418 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23419 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23420 }
23421
23422 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23423 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23424 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23425 SDValue BC0 = peekThroughBitcasts(Op0);
23426 if (BC0.getOpcode() == ISD::AND) {
23427 APInt UndefElts;
23428 SmallVector<APInt, 64> EltBits;
23429 if (getTargetConstantBitsFromNode(
23430 BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23431 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23432 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23433 Cond = ISD::SETEQ;
23434 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23435 }
23436 }
23437 }
23438 }
23439
23440 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
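 // For example, with i32 lanes and C == 16 (bit 4): ShiftAmt = 32 - 4 - 1 = 27,
 // so SHL by 27 moves bit 4 into the sign bit, and SRA by 31 broadcasts it,
 // giving all-ones when the bit is set and zero otherwise.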
23441 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23442 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23443 auto *C1 = isConstOrConstSplat(Op1);
23444 if (C1 && C1->getAPIntValue().isPowerOf2()) {
23445 unsigned BitWidth = VT.getScalarSizeInBits();
23446 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23447
23448 SDValue Result = Op0.getOperand(0);
23449 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23450 DAG.getConstant(ShiftAmt, dl, VT));
23451 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23452 DAG.getConstant(BitWidth - 1, dl, VT));
23453 return Result;
23454 }
23455 }
23456
23457 // Break 256-bit integer vector compare into smaller ones.
23458 if (VT.is256BitVector() && !Subtarget.hasInt256())
23459 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23460
23461 // Break 512-bit integer vector compare into smaller ones.
23462 // TODO: Try harder to use VPCMPx + VPMOV2x?
23463 if (VT.is512BitVector())
23464 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23465
23466 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23467 // not-of-PCMPEQ:
23468 // X != INT_MIN --> X >s INT_MIN
23469 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23470 // +X != 0 --> +X >s 0
23471 APInt ConstValue;
23472 if (Cond == ISD::SETNE &&
23473 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23474 if (ConstValue.isMinSignedValue())
23475 Cond = ISD::SETGT;
23476 else if (ConstValue.isMaxSignedValue())
23477 Cond = ISD::SETLT;
23478 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23479 Cond = ISD::SETGT;
23480 }
23481
23482 // If both operands are known non-negative, then an unsigned compare is the
23483 // same as a signed compare and there's no need to flip signbits.
23484 // TODO: We could check for more general simplifications here since we're
23485 // computing known bits.
23486 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23487 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23488
23489 // Special case: Use min/max operations for unsigned compares.
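 // The identity used below is X u<= Y <=> umin(X, Y) == X (and likewise
 // X u>= Y <=> umax(X, Y) == X), so a UMIN/UMAX plus PCMPEQ replaces the
 // unsigned compare; SETULT/SETUGT are handled as the inverted forms.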
23490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23491 if (ISD::isUnsignedIntSetCC(Cond) &&
23492 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23493 TLI.isOperationLegal(ISD::UMIN, VT)) {
23494 // If we have a constant operand, increment/decrement it and change the
23495 // condition to avoid an invert.
23496 if (Cond == ISD::SETUGT) {
23497 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23498 if (SDValue UGTOp1 =
23499 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23500 Op1 = UGTOp1;
23501 Cond = ISD::SETUGE;
23502 }
23503 }
23504 if (Cond == ISD::SETULT) {
23505 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23506 if (SDValue ULTOp1 =
23507 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23508 Op1 = ULTOp1;
23509 Cond = ISD::SETULE;
23510 }
23511 }
23512 bool Invert = false;
23513 unsigned Opc;
23514 switch (Cond) {
23515 // clang-format off
23516 default: llvm_unreachable("Unexpected condition code");
23517 case ISD::SETUGT: Invert = true; [[fallthrough]];
23518 case ISD::SETULE: Opc = ISD::UMIN; break;
23519 case ISD::SETULT: Invert = true; [[fallthrough]];
23520 case ISD::SETUGE: Opc = ISD::UMAX; break;
23521 // clang-format on
23522 }
23523
23524 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23525 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23526
23527 // If the logical-not of the result is required, perform that now.
23528 if (Invert)
23529 Result = DAG.getNOT(dl, Result, VT);
23530
23531 return Result;
23532 }
23533
23534 // Try to use SUBUS and PCMPEQ.
23535 if (FlipSigns)
23536 if (SDValue V =
23537 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23538 return V;
23539
23540 // We are handling one of the integer comparisons here. Since SSE only has
23541 // GT and EQ comparisons for integers, swapping operands and multiple
23542 // operations may be required for some comparisons.
23543 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23544 : X86ISD::PCMPGT;
23545 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23546 Cond == ISD::SETGE || Cond == ISD::SETUGE;
23547 bool Invert = Cond == ISD::SETNE ||
23548 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23549
23550 if (Swap)
23551 std::swap(Op0, Op1);
23552
23553 // Check that the operation in question is available (most are plain SSE2,
23554 // but PCMPGTQ and PCMPEQQ have different requirements).
23555 if (VT == MVT::v2i64) {
23556 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23557 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23558
23559 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23560 // the odd elements over the even elements.
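 // 0 >s X on each 32-bit half is all-ones exactly when that half is negative,
 // so the high (odd) halves hold the i64 sign-bit test; the {1, 1, 3, 3}
 // shuffle then copies each high-half result over both halves of its element.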
23561 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23562 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23563 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23564
23565 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23566 static const int MaskHi[] = { 1, 1, 3, 3 };
23567 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23568
23569 return DAG.getBitcast(VT, Result);
23570 }
23571
23572 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23573 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23574 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23575
23576 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23577 static const int MaskHi[] = { 1, 1, 3, 3 };
23578 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23579
23580 return DAG.getBitcast(VT, Result);
23581 }
23582
23583 // If the i64 elements are sign-extended enough to be representable as i32
23584 // then we can compare the lower i32 bits and splat.
23585 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23586 DAG.ComputeNumSignBits(Op1) > 32) {
23587 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23588 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23589
23590 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23591 static const int MaskLo[] = {0, 0, 2, 2};
23592 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23593
23594 return DAG.getBitcast(VT, Result);
23595 }
23596
23597 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23598 // bits of the inputs before performing those operations. The lower
23599 // compare is always unsigned.
23600 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23601 : 0x0000000080000000ULL,
23602 dl, MVT::v2i64);
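 // Viewed as v4i32 lanes this constant is {0x80000000, 0, 0x80000000, 0}, or
 // all four lanes 0x80000000 when FlipSigns: the low 32-bit halves are always
 // sign-flipped because PCMPGTD is signed but the low-half compare must be
 // unsigned, while the high halves are flipped only for an unsigned i64
 // compare.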
23603
23604 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23605 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23606
23607 // Cast everything to the right type.
23608 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23609 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23610
23611 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23612 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23613 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23614
23615 // Create masks for only the low parts/high parts of the 64-bit integers.
23616 static const int MaskHi[] = { 1, 1, 3, 3 };
23617 static const int MaskLo[] = { 0, 0, 2, 2 };
23618 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23619 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23620 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23621
23622 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23623 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23624
23625 if (Invert)
23626 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23627
23628 return DAG.getBitcast(VT, Result);
23629 }
23630
23631 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23632 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
23633 // pcmpeqd + pshufd + pand.
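 // A 64-bit lane is equal iff both of its 32-bit halves compare equal; the
 // {1, 0, 3, 2} shuffle swaps the halves so the AND combines each half's
 // PCMPEQD result with its partner's.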
23634 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23635
23636 // First cast everything to the right type.
23637 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23638 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23639
23640 // Do the compare.
23641 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23642
23643 // Make sure the lower and upper halves are both all-ones.
23644 static const int Mask[] = { 1, 0, 3, 2 };
23645 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23646 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23647
23648 if (Invert)
23649 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23650
23651 return DAG.getBitcast(VT, Result);
23652 }
23653 }
23654
23655 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23656 // bits of the inputs before performing those operations.
23657 if (FlipSigns) {
23658 MVT EltVT = VT.getVectorElementType();
23659 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23660 VT);
23661 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23662 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23663 }
23664
23665 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23666
23667 // If the logical-not of the result is required, perform that now.
23668 if (Invert)
23669 Result = DAG.getNOT(dl, Result, VT);
23670
23671 return Result;
23672}
23673
23674// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
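// KORTEST sets ZF when the OR of its mask operands is all zeros and CF when it
// is all ones, which is why "mask == 0" maps to COND_E/COND_NE and "mask == -1"
// maps to COND_B/COND_AE below. KTEST sets ZF when the AND of its operands is
// zero, so it is only formed for the compare-against-zero case.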
23675static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23676 const SDLoc &dl, SelectionDAG &DAG,
23677 const X86Subtarget &Subtarget,
23678 SDValue &X86CC) {
23679 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23680
23681 // Must be a bitcast from vXi1.
23682 if (Op0.getOpcode() != ISD::BITCAST)
23683 return SDValue();
23684
23685 Op0 = Op0.getOperand(0);
23686 MVT VT = Op0.getSimpleValueType();
23687 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23688 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23689 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23690 return SDValue();
23691
23692 X86::CondCode X86Cond;
23693 if (isNullConstant(Op1)) {
23694 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23695 } else if (isAllOnesConstant(Op1)) {
23696 // C flag is set for all ones.
23697 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23698 } else
23699 return SDValue();
23700
23701 // If the input is an AND, we can combine its operands into the KTEST.
23702 bool KTestable = false;
23703 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23704 KTestable = true;
23705 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23706 KTestable = true;
23707 if (!isNullConstant(Op1))
23708 KTestable = false;
23709 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23710 SDValue LHS = Op0.getOperand(0);
23711 SDValue RHS = Op0.getOperand(1);
23712 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23713 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23714 }
23715
23716 // If the input is an OR, we can combine its operands into the KORTEST.
23717 SDValue LHS = Op0;
23718 SDValue RHS = Op0;
23719 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23720 LHS = Op0.getOperand(0);
23721 RHS = Op0.getOperand(1);
23722 }
23723
23724 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23725 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23726}
23727
23728/// Emit flags for the given setcc condition and operands. Also returns the
23729/// corresponding X86 condition code constant in X86CC.
23730SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23731 ISD::CondCode CC, const SDLoc &dl,
23732 SelectionDAG &DAG,
23733 SDValue &X86CC) const {
23734 // Equality Combines.
23735 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23736 X86::CondCode X86CondCode;
23737
23738 // Optimize to BT if possible.
23739 // Lower (X & (1 << N)) == 0 to BT(X, N).
23740 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23741 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23742 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23743 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23744 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23745 return BT;
23746 }
23747 }
23748
23749 // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
23750 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23751 X86CondCode)) {
23752 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23753 return CmpZ;
23754 }
23755
23756 // Try to lower using KORTEST or KTEST.
23757 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23758 return Test;
23759
23760 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23761 // of these.
23762 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23763 // If the input is a setcc, then reuse the input setcc or use a new one
23764 // with the inverted condition.
23765 if (Op0.getOpcode() == X86ISD::SETCC) {
23766 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23767
23768 X86CC = Op0.getOperand(0);
23769 if (Invert) {
23770 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23771 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23772 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23773 }
23774
23775 return Op0.getOperand(1);
23776 }
23777 }
23778
23779 // Try to use the carry flag from the add in place of a separate CMP for:
23780 // (seteq (add X, -1), -1). Similar for setne.
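 // Adding -1 to X produces a carry exactly when X != 0, so CF == 0 tests
 // X == 0; hence SETEQ maps to COND_AE and SETNE to COND_B below.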
23781 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23782 Op0.getOperand(1) == Op1) {
23783 if (isProfitableToUseFlagOp(Op0)) {
23784 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23785
23786 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23787 Op0.getOperand(1));
23788 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23789 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23790 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23791 return SDValue(New.getNode(), 1);
23792 }
23793 }
23794 }
23795
23796 X86::CondCode CondCode =
23797 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23798 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23799
23800 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23801 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23802 return EFLAGS;
23803}
23804
23805SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23806
23807 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23808 Op.getOpcode() == ISD::STRICT_FSETCCS;
23809 MVT VT = Op->getSimpleValueType(0);
23810
23811 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23812
23813 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23814 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23815 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23816 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23817 SDLoc dl(Op);
23818 ISD::CondCode CC =
23819 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23820
23821 if (isSoftF16(Op0.getValueType(), Subtarget))
23822 return SDValue();
23823
23824 // Handle f128 first, since one possible outcome is a normal integer
23825 // comparison which gets handled by emitFlagsForSetcc.
23826 if (Op0.getValueType() == MVT::f128) {
23827 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23828 Op.getOpcode() == ISD::STRICT_FSETCCS);
23829
23830 // If softenSetCCOperands returned a scalar, use it.
23831 if (!Op1.getNode()) {
23832 assert(Op0.getValueType() == Op.getValueType() &&
23833 "Unexpected setcc expansion!");
23834 if (IsStrict)
23835 return DAG.getMergeValues({Op0, Chain}, dl);
23836 return Op0;
23837 }
23838 }
23839
23840 if (Op0.getSimpleValueType().isInteger()) {
23842 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
23843 // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
23844 // this may translate to fewer uops depending on the uarch implementation. The
23845 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23846 // canonicalize to that CondCode.
23847 // NOTE: Only do this if incrementing the constant doesn't increase the bit
23848 // encoding size - so it must either already be an i8 or i32 immediate, or it
23849 // shrinks down to that. We don't do this for any i64's to avoid additional
23849 // constant materializations.
23850 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
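 // For example, "x >s 9" becomes "x >=s 10" and "x >u 9" becomes "x >=u 10",
 // so the resulting SETGE/SETAE reads fewer EFLAGS bits than SETGT/SETA.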
23851 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23852 const APInt &Op1Val = Op1C->getAPIntValue();
23853 if (!Op1Val.isZero()) {
23854 // Ensure the constant+1 doesn't overflow.
23855 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23856 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23857 APInt Op1ValPlusOne = Op1Val + 1;
23858 if (Op1ValPlusOne.isSignedIntN(32) &&
23859 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23860 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23861 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
23862 : ISD::CondCode::SETUGE;
23863 }
23864 }
23865 }
23866 }
23867
23868 SDValue X86CC;
23869 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23870 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23871 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23872 }
23873
23874 // Handle floating point.
23875 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23876 if (CondCode == X86::COND_INVALID)
23877 return SDValue();
23878
23879 SDValue EFLAGS;
23880 if (IsStrict) {
23881 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23882 EFLAGS =
23883 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23884 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23885 Chain = EFLAGS.getValue(1);
23886 } else {
23887 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23888 }
23889
23890 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23891 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23892 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23893}
23894
23895SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23896 SDValue LHS = Op.getOperand(0);
23897 SDValue RHS = Op.getOperand(1);
23898 SDValue Carry = Op.getOperand(2);
23899 SDValue Cond = Op.getOperand(3);
23900 SDLoc DL(Op);
23901
23902 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23903 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23904
23905 // Recreate the carry if needed.
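 // Adding all-ones to the incoming carry value sets CF exactly when that value
 // is nonzero, so the SBB below computes LHS - RHS - CF.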
23906 EVT CarryVT = Carry.getValueType();
23907 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23908 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23909
23910 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23911 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23912 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23913}
23914
23915// This function returns three things: the arithmetic computation itself
23916// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
23917// flag and the condition code define the case in which the arithmetic
23918// computation overflows.
23919static std::pair<SDValue, SDValue>
23920getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23921 assert(Op.getResNo() == 0 && "Unexpected result number!");
23922 SDValue Value, Overflow;
23923 SDValue LHS = Op.getOperand(0);
23924 SDValue RHS = Op.getOperand(1);
23925 unsigned BaseOp = 0;
23926 SDLoc DL(Op);
23927 switch (Op.getOpcode()) {
23928 default: llvm_unreachable("Unknown ovf instruction!");
23929 case ISD::SADDO:
23930 BaseOp = X86ISD::ADD;
23931 Cond = X86::COND_O;
23932 break;
23933 case ISD::UADDO:
23934 BaseOp = X86ISD::ADD;
23935 Cond = X86::COND_B;
23936 break;
23937 case ISD::SSUBO:
23938 BaseOp = X86ISD::SUB;
23939 Cond = X86::COND_O;
23940 break;
23941 case ISD::USUBO:
23942 BaseOp = X86ISD::SUB;
23943 Cond = X86::COND_B;
23944 break;
23945 case ISD::SMULO:
23946 BaseOp = X86ISD::SMUL;
23947 Cond = X86::COND_O;
23948 break;
23949 case ISD::UMULO:
23950 BaseOp = X86ISD::UMUL;
23951 Cond = X86::COND_O;
23952 break;
23953 }
23954
23955 if (BaseOp) {
23956 // Also sets EFLAGS.
23957 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23958 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23959 Overflow = Value.getValue(1);
23960 }
23961
23962 return std::make_pair(Value, Overflow);
23963}
23964
23965static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23966 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23967 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23968 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23969 // has only one use.
23970 SDLoc DL(Op);
23971 X86::CondCode Cond;
23972 SDValue Value, Overflow;
23973 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23974
23975 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23976 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23977 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23978}
23979
23980/// Return true if the opcode is an X86 logical comparison.
23981static bool isX86LogicalCmp(SDValue Op) {
23982 unsigned Opc = Op.getOpcode();
23983 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23984 Opc == X86ISD::FCMP)
23985 return true;
23986 if (Op.getResNo() == 1 &&
23987 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23988 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23989 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23990 return true;
23991
23992 return false;
23993}
23994
23995static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23996 if (V.getOpcode() != ISD::TRUNCATE)
23997 return false;
23998
23999 SDValue VOp0 = V.getOperand(0);
24000 unsigned InBits = VOp0.getValueSizeInBits();
24001 unsigned Bits = V.getValueSizeInBits();
24002 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
24003}
24004
24005SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
24006 bool AddTest = true;
24007 SDValue Cond = Op.getOperand(0);
24008 SDValue Op1 = Op.getOperand(1);
24009 SDValue Op2 = Op.getOperand(2);
24010 SDLoc DL(Op);
24011 MVT VT = Op1.getSimpleValueType();
24012 SDValue CC;
24013
24014 if (isSoftF16(VT, Subtarget)) {
24015 MVT NVT = VT.changeTypeToInteger();
24016 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24017 DAG.getBitcast(NVT, Op1),
24018 DAG.getBitcast(NVT, Op2)));
24019 }
24020
24021 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24022 // are available or VBLENDV if AVX is available.
24023 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24024 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24025 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24026 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24027 bool IsAlwaysSignaling;
24028 unsigned SSECC =
24029 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24030 CondOp0, CondOp1, IsAlwaysSignaling);
24031
24032 if (Subtarget.hasAVX512()) {
24033 SDValue Cmp =
24034 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24035 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24036 assert(!VT.isVector() && "Not a scalar type?");
24037 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24038 }
24039
24040 if (SSECC < 8 || Subtarget.hasAVX()) {
24041 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24042 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24043
24044 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24045 // of 3 logic instructions for size savings and potentially speed.
24046 // Unfortunately, there is no scalar form of VBLENDV.
24047
24048 // If either operand is a +0.0 constant, don't try this. We can expect to
24049 // optimize away at least one of the logic instructions later in that
24050 // case, so that sequence would be faster than a variable blend.
24051
24052 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24053 // uses XMM0 as the selection register. That may need just as many
24054 // instructions as the AND/ANDN/OR sequence due to register moves, so
24055 // don't bother.
24056 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24057 !isNullFPConstant(Op2)) {
24058 // Convert to vectors, do a VSELECT, and convert back to scalar.
24059 // All of the conversions should be optimized away.
24060 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24061 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24062 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24063 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24064
24065 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24066 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24067
24068 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24069
24070 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24071 VSel, DAG.getIntPtrConstant(0, DL));
24072 }
24073 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24074 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24075 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24076 }
24077 }
24078
24079 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24080 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24081 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24082 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24083 }
24084
24085 if (Cond.getOpcode() == ISD::SETCC &&
24086 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24087 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24088 Cond = NewCond;
24089 // If the condition was updated, it's possible that the operands of the
24090 // select were also updated (for example, EmitTest has a RAUW). Refresh
24091 // the local references to the select operands in case they got stale.
24092 Op1 = Op.getOperand(1);
24093 Op2 = Op.getOperand(2);
24094 }
24095 }
24096
24097 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24098 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24099 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24100 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24101 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24102 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24103 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24104 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24105 if (Cond.getOpcode() == X86ISD::SETCC &&
24106 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24107 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24108 SDValue Cmp = Cond.getOperand(1);
24109 SDValue CmpOp0 = Cmp.getOperand(0);
24110 unsigned CondCode = Cond.getConstantOperandVal(0);
24111
24112 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24113 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24114 // handling to keep the CMP with 0. This should be removed by
24115 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24116 // cttz_zero_undef.
24117 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24118 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24119 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24120 };
24121 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24122 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24123 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24124 // Keep Cmp.
24125 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24126 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24127 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24128 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24129
24130 // 'X - 1' sets the carry flag if X == 0.
24131 // '0 - X' sets the carry flag if X != 0.
24132 // Convert the carry flag to a -1/0 mask with sbb:
24133 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24134 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24135 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24136 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
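 // For example, "select (X != 0), -1, Y" is emitted roughly as:
 //   sub 0, X      (CF is set iff X != 0)
 //   sbb R, R      (R = CF ? -1 : 0, via SETCC_CARRY)
 //   or  R, Y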
24137 SDValue Sub;
24138 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24139 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24140 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24141 } else {
24142 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24143 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24144 }
24145 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24146 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24147 Sub.getValue(1));
24148 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24149 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24150 CmpOp0.getOpcode() == ISD::AND &&
24151 isOneConstant(CmpOp0.getOperand(1))) {
24152 SDValue Src1, Src2;
24153 // True if Op2 is an XOR or OR operator and one of its operands
24154 // is equal to Op1:
24155 // (a, a op b) || (b, a op b)
24156 auto isOrXorPattern = [&]() {
24157 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24158 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24159 Src1 =
24160 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24161 Src2 = Op1;
24162 return true;
24163 }
24164 return false;
24165 };
24166
24167 if (isOrXorPattern()) {
24168 SDValue Neg;
24169 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24170 // We need a mask of all zeros or all ones with the same size as the
24171 // other operands.
24172 if (CmpSz > VT.getSizeInBits())
24173 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24174 else if (CmpSz < VT.getSizeInBits())
24175 Neg = DAG.getNode(ISD::AND, DL, VT,
24176 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24177 DAG.getConstant(1, DL, VT));
24178 else
24179 Neg = CmpOp0;
24180 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24181 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24182 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24183 }
24184 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24185 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24186 ((CondCode == X86::COND_S) || // smin(x, 0)
24187 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24188 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24189 //
24190 // If the comparison is testing for a positive value, we have to invert
24191 // the sign bit mask, so only do that transform if the target has a
24192 // bitwise 'and not' instruction (the invert is free).
24193 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24194 unsigned ShCt = VT.getSizeInBits() - 1;
24195 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24196 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24197 if (CondCode == X86::COND_G)
24198 Shift = DAG.getNOT(DL, Shift, VT);
24199 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24200 }
24201 }
24202
24203 // Look past (and (setcc_carry (cmp ...)), 1).
24204 if (Cond.getOpcode() == ISD::AND &&
24205 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24206 isOneConstant(Cond.getOperand(1)))
24207 Cond = Cond.getOperand(0);
24208
24209 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24210 // setting operand in place of the X86ISD::SETCC.
24211 unsigned CondOpcode = Cond.getOpcode();
24212 if (CondOpcode == X86ISD::SETCC ||
24213 CondOpcode == X86ISD::SETCC_CARRY) {
24214 CC = Cond.getOperand(0);
24215
24216 SDValue Cmp = Cond.getOperand(1);
24217 bool IllegalFPCMov = false;
24218 if (VT.isFloatingPoint() && !VT.isVector() &&
24219 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24220 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24221
24222 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24223 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24224 Cond = Cmp;
24225 AddTest = false;
24226 }
24227 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24228 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24229 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24230 SDValue Value;
24231 X86::CondCode X86Cond;
24232 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24233
24234 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24235 AddTest = false;
24236 }
24237
24238 if (AddTest) {
24239 // Look past the truncate if the high bits are known zero.
24240 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24241 Cond = Cond.getOperand(0);
24242
24243 // We know the result of AND is compared against zero. Try to match
24244 // it to BT.
24245 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24246 X86::CondCode X86CondCode;
24247 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24248 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24249 Cond = BT;
24250 AddTest = false;
24251 }
24252 }
24253 }
24254
24255 if (AddTest) {
24256 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24257 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24258 }
24259
24260 // a < b ? -1 : 0 -> RES = ~setcc_carry
24261 // a < b ? 0 : -1 -> RES = setcc_carry
24262 // a >= b ? -1 : 0 -> RES = setcc_carry
24263 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24264 if (Cond.getOpcode() == X86ISD::SUB) {
24265 unsigned CondCode = CC->getAsZExtVal();
24266
24267 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24268 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24269 (isNullConstant(Op1) || isNullConstant(Op2))) {
24270 SDValue Res =
24271 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24272 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24273 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24274 return DAG.getNOT(DL, Res, Res.getValueType());
24275 return Res;
24276 }
24277 }
24278
24279 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24280 // widen the cmov and push the truncate through. This avoids introducing a new
24281 // branch during isel and doesn't add any extensions.
24282 if (Op.getValueType() == MVT::i8 &&
24283 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24284 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24285 if (T1.getValueType() == T2.getValueType() &&
24286 // Exclude CopyFromReg to avoid partial register stalls.
24287 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24288 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24289 CC, Cond);
24290 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24291 }
24292 }
24293
24294 // Or finally, promote i8 cmovs if we have CMOV,
24295 // or i16 cmovs if it won't prevent folding a load.
24296 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
24297 // legal, but EmitLoweredSelect() cannot deal with these extensions
24298 // being inserted between two CMOV's. (in i16 case too TBN)
24299 // https://bugs.llvm.org/show_bug.cgi?id=40974
24300 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24301 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24302 !X86::mayFoldLoad(Op2, Subtarget))) {
24303 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24304 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24305 SDValue Ops[] = { Op2, Op1, CC, Cond };
24306 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24307 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24308 }
24309
24310 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24311 // condition is true.
24312 SDValue Ops[] = { Op2, Op1, CC, Cond };
24313 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24314}
24315
24316static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24317 const X86Subtarget &Subtarget,
24318 SelectionDAG &DAG) {
24319 MVT VT = Op->getSimpleValueType(0);
24320 SDValue In = Op->getOperand(0);
24321 MVT InVT = In.getSimpleValueType();
24322 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24323 MVT VTElt = VT.getVectorElementType();
24324 SDLoc dl(Op);
24325
24326 unsigned NumElts = VT.getVectorNumElements();
24327
24328 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24329 MVT ExtVT = VT;
24330 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24331 // If v16i32 is to be avoided, we'll need to split and concatenate.
24332 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24333 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24334
24335 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24336 }
24337
24338 // Widen to 512-bits if VLX is not supported.
24339 MVT WideVT = ExtVT;
24340 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24341 NumElts *= 512 / ExtVT.getSizeInBits();
24342 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24343 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24344 In, DAG.getIntPtrConstant(0, dl));
24345 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24346 }
24347
24348 SDValue V;
24349 MVT WideEltVT = WideVT.getVectorElementType();
24350 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24351 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24352 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24353 } else {
24354 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24355 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24356 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24357 }
24358
24359 // Truncate if we had to extend i16/i8 above.
24360 if (VT != ExtVT) {
24361 WideVT = MVT::getVectorVT(VTElt, NumElts);
24362 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24363 }
24364
24365 // Extract back to 128/256-bit if we widened.
24366 if (WideVT != VT)
24367 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24368 DAG.getIntPtrConstant(0, dl));
24369
24370 return V;
24371}
24372
24373static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24374 SelectionDAG &DAG) {
24375 SDValue In = Op->getOperand(0);
24376 MVT InVT = In.getSimpleValueType();
24377
24378 if (InVT.getVectorElementType() == MVT::i1)
24379 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24380
24381 assert(Subtarget.hasAVX() && "Expected AVX support");
24382 return LowerAVXExtend(Op, DAG, Subtarget);
24383}
24384
24385// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24386// For sign extend this needs to handle all vector sizes and SSE4.1 and
24387// non-SSE4.1 targets. For zero extend this should only handle inputs of
24388// MVT::v64i8 when BWI is not supported, but AVX512 is.
24389static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24390 const X86Subtarget &Subtarget,
24391 SelectionDAG &DAG) {
24392 SDValue In = Op->getOperand(0);
24393 MVT VT = Op->getSimpleValueType(0);
24394 MVT InVT = In.getSimpleValueType();
24395
24396 MVT SVT = VT.getVectorElementType();
24397 MVT InSVT = InVT.getVectorElementType();
24399
24400 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24401 return SDValue();
24402 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24403 return SDValue();
24404 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24405 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24406 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24407 return SDValue();
24408
24409 SDLoc dl(Op);
24410 unsigned Opc = Op.getOpcode();
24411 unsigned NumElts = VT.getVectorNumElements();
24412
24413 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24414 // For 512-bit vectors, we need 128-bits or 256-bits.
24415 if (InVT.getSizeInBits() > 128) {
24416 // Input needs to be at least the same number of elements as output, and
24417 // at least 128-bits.
24418 int InSize = InSVT.getSizeInBits() * NumElts;
24419 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24420 InVT = In.getSimpleValueType();
24421 }
24422
24423 // SSE4.1 targets can use the pmov[sz]x* instructions directly for 128-bit results,
24424 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
24425 // need to be handled here for 256/512-bit results.
24426 if (Subtarget.hasInt256()) {
24427 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24428
24429 if (InVT.getVectorNumElements() != NumElts)
24430 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24431
24432 // FIXME: Apparently we create inreg operations that could be regular
24433 // extends.
24434 unsigned ExtOpc =
24435 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24436 : ISD::ZERO_EXTEND;
24437 return DAG.getNode(ExtOpc, dl, VT, In);
24438 }
24439
24440 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24441 if (Subtarget.hasAVX()) {
24442 assert(VT.is256BitVector() && "256-bit vector expected");
24443 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24444 int HalfNumElts = HalfVT.getVectorNumElements();
24445
24446 unsigned NumSrcElts = InVT.getVectorNumElements();
24447 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24448 for (int i = 0; i != HalfNumElts; ++i)
24449 HiMask[i] = HalfNumElts + i;
24450
24451 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24452 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24453 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24454 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24455 }
24456
24457 // We should only get here for sign extend.
24458 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24459 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24460 unsigned InNumElts = InVT.getVectorNumElements();
24461
24462 // If the source elements are already all-signbits, we don't need to extend,
24463 // just splat the elements.
24464 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24465 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24466 unsigned Scale = InNumElts / NumElts;
24467 SmallVector<int, 16> ShuffleMask;
24468 for (unsigned I = 0; I != NumElts; ++I)
24469 ShuffleMask.append(Scale, I);
24470 return DAG.getBitcast(VT,
24471 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24472 }
24473
24474 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24475 SDValue Curr = In;
24476 SDValue SignExt = Curr;
24477
24478 // As SRAI is only available on i16/i32 types, we expand only up to i32
24479 // and handle i64 separately.
24480 if (InVT != MVT::v4i32) {
24481 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24482
24483 unsigned DestWidth = DestVT.getScalarSizeInBits();
24484 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24485 unsigned DestElts = DestVT.getVectorNumElements();
24486
24487 // Build a shuffle mask that takes each input element and places it in the
24488 // MSBs of the new element size.
24489 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24490 for (unsigned i = 0; i != DestElts; ++i)
24491 Mask[i * Scale + (Scale - 1)] = i;
24492
24493 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24494 Curr = DAG.getBitcast(DestVT, Curr);
24495
24496 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24497 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24498 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24499 }
24500
24501 if (VT == MVT::v2i64) {
24502 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24503 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24504 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24505 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24506 SignExt = DAG.getBitcast(VT, SignExt);
24507 }
24508
24509 return SignExt;
24510}
24511
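As a standalone illustration of the trick the pre-SSE4.1 path above relies on (plain C++, my own sketch, not part of this file): putting the narrow value into the most significant bits of the wider lane and arithmetic-shifting it back down reproduces sign extension, which is exactly what the shuffle-into-MSBs plus VSRAI sequence does lane by lane.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Move the 8-bit value into bits 31..24, then shift arithmetically back
    // down. The vector lowering does the same per lane: shuffle into the
    // MSBs, then X86ISD::VSRAI.
    int32_t signExtend8To32(uint8_t In) {
      uint32_t Shifted = static_cast<uint32_t>(In) << 24;
      int32_t Wide;
      std::memcpy(&Wide, &Shifted, sizeof(Wide)); // reinterpret the bits as signed
      return Wide >> 24; // arithmetic shift on mainstream targets (guaranteed since C++20)
    }

    int main() {
      assert(signExtend8To32(0x7F) == 127);
      assert(signExtend8To32(0x80) == -128);
      assert(signExtend8To32(0xFF) == -1);
      return 0;
    }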
24512static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24513 SelectionDAG &DAG) {
24514 MVT VT = Op->getSimpleValueType(0);
24515 SDValue In = Op->getOperand(0);
24516 MVT InVT = In.getSimpleValueType();
24517 SDLoc dl(Op);
24518
24519 if (InVT.getVectorElementType() == MVT::i1)
24520 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24521
24522 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24524 "Expected same number of elements");
24525 assert((VT.getVectorElementType() == MVT::i16 ||
24526 VT.getVectorElementType() == MVT::i32 ||
24527 VT.getVectorElementType() == MVT::i64) &&
24528 "Unexpected element type");
24529 assert((InVT.getVectorElementType() == MVT::i8 ||
24530 InVT.getVectorElementType() == MVT::i16 ||
24531 InVT.getVectorElementType() == MVT::i32) &&
24532 "Unexpected element type");
24533
24534 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24535 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24536 return splitVectorIntUnary(Op, DAG, dl);
24537 }
24538
24539 if (Subtarget.hasInt256())
24540 return Op;
24541
24542 // Optimize vectors in AVX mode:
24543 // Sign-extend v8i16 to v8i32 and
24544 // v4i32 to v4i64.
24545 //
24546 // Divide the input vector into two parts;
24547 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24548 // Use the vpmovsx instruction to extend v4i32 -> v2i64 / v8i16 -> v4i32,
24549 // then concat the vectors back to the original VT.
24550 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24551 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24552
24553 unsigned NumElems = InVT.getVectorNumElements();
24554 SmallVector<int,8> ShufMask(NumElems, -1);
24555 for (unsigned i = 0; i != NumElems/2; ++i)
24556 ShufMask[i] = i + NumElems/2;
24557
24558 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24559 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24560
24561 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24562}
24563
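For the pre-AVX2 split performed above, a rough user-level equivalent can be written with standard SSE4.1 intrinsics. This is only a sketch (the function and parameter names are mine, and it assumes a target compiled with -msse4.1): extend the low pair directly, shuffle the high pair down, extend it, and store both halves.

    #include <smmintrin.h> // SSE4.1 intrinsics

    // Lower half: PMOVSXDQ on lanes 0..1. Upper half: move lanes 2..3 down
    // with a shuffle, then PMOVSXDQ again - the same lo/hi split as above.
    void sextV4i32ToV4i64(const int *Src, long long *Dst) {
      __m128i In = _mm_loadu_si128(reinterpret_cast<const __m128i *>(Src));
      __m128i Lo = _mm_cvtepi32_epi64(In);
      __m128i HiSrc = _mm_shuffle_epi32(In, _MM_SHUFFLE(1, 0, 3, 2)); // lanes {2,3,0,1}
      __m128i Hi = _mm_cvtepi32_epi64(HiSrc);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(Dst), Lo);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(Dst + 2), Hi);
    }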
24564/// Change a vector store into a pair of half-size vector stores.
24565static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24566 SDValue StoredVal = Store->getValue();
24567 assert((StoredVal.getValueType().is256BitVector() ||
24568 StoredVal.getValueType().is512BitVector()) &&
24569 "Expecting 256/512-bit op");
24570
24571 // Splitting volatile memory ops is not allowed unless the operation was not
24572 // legal to begin with. Assume the input store is legal (this transform is
24573 // only used for targets with AVX). Note: It is possible that we have an
24574 // illegal type like v2i128, and so we could allow splitting a volatile store
24575 // in that case if that is important.
24576 if (!Store->isSimple())
24577 return SDValue();
24578
24579 SDLoc DL(Store);
24580 SDValue Value0, Value1;
24581 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24582 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24583 SDValue Ptr0 = Store->getBasePtr();
24584 SDValue Ptr1 =
24585 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24586 SDValue Ch0 =
24587 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24588 Store->getOriginalAlign(),
24589 Store->getMemOperand()->getFlags());
24590 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24591 Store->getPointerInfo().getWithOffset(HalfOffset),
24592 Store->getOriginalAlign(),
24593 Store->getMemOperand()->getFlags());
24594 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24595}
24596
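A user-level picture of what splitVectorStore emits for a 256-bit value, as an illustrative sketch only (the helper name is mine; the AVX2 extract intrinsic is noted as an assumption, with the AVX1 alternative in the comment):

    #include <immintrin.h>

    // Store the low 128 bits at Ptr and the high 128 bits at Ptr + 16,
    // mirroring the Ptr0/Ptr1 pair built by splitVectorStore.
    void store256AsTwoHalves(void *Ptr, __m256i V) {
      __m128i Lo = _mm256_castsi256_si128(V);
      __m128i Hi = _mm256_extracti128_si256(V, 1); // use _mm256_extractf128_si256 on AVX1
      _mm_storeu_si128(reinterpret_cast<__m128i *>(Ptr), Lo);
      _mm_storeu_si128(reinterpret_cast<__m128i *>(static_cast<char *>(Ptr) + 16), Hi);
    }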
24597/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
24598/// type.
24599static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24600 SelectionDAG &DAG) {
24601 SDValue StoredVal = Store->getValue();
24602 assert(StoreVT.is128BitVector() &&
24603 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24604 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24605
24606 // Splitting volatile memory ops is not allowed unless the operation was not
24607 // legal to begin with. We are assuming the input op is legal (this transform
24608 // is only used for targets with AVX).
24609 if (!Store->isSimple())
24610 return SDValue();
24611
24612 MVT StoreSVT = StoreVT.getScalarType();
24613 unsigned NumElems = StoreVT.getVectorNumElements();
24614 unsigned ScalarSize = StoreSVT.getStoreSize();
24615
24616 SDLoc DL(Store);
24617 SmallVector<SDValue, 4> Stores;
24618 for (unsigned i = 0; i != NumElems; ++i) {
24619 unsigned Offset = i * ScalarSize;
24620 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24621 TypeSize::getFixed(Offset), DL);
24622 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24623 DAG.getIntPtrConstant(i, DL));
24624 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24625 Store->getPointerInfo().getWithOffset(Offset),
24626 Store->getOriginalAlign(),
24627 Store->getMemOperand()->getFlags());
24628 Stores.push_back(Ch);
24629 }
24630 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24631}
24632
24633static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24634 SelectionDAG &DAG) {
24635 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24636 SDLoc dl(St);
24637 SDValue StoredVal = St->getValue();
24638
24639 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24640 if (StoredVal.getValueType().isVector() &&
24641 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24642 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24643 assert(NumElts <= 8 && "Unexpected VT");
24644 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24645 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24646 "Expected AVX512F without AVX512DQI");
24647
24648 // We must pad with zeros to ensure we store zeroes to any unused bits.
24649 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24650 DAG.getUNDEF(MVT::v16i1), StoredVal,
24651 DAG.getIntPtrConstant(0, dl));
24652 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24653 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24654 // Make sure we store zeros in the extra bits.
24655 if (NumElts < 8)
24656 StoredVal = DAG.getZeroExtendInReg(
24657 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24658
24659 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24660 St->getPointerInfo(), St->getOriginalAlign(),
24661 St->getMemOperand()->getFlags());
24662 }
24663
24664 if (St->isTruncatingStore())
24665 return SDValue();
24666
24667 // If this is a 256-bit store of concatenated ops, we are better off splitting
24668 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24669 // and each half can execute independently. Some cores would split the op into
24670 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24671 MVT StoreVT = StoredVal.getSimpleValueType();
24672 if (StoreVT.is256BitVector() ||
24673 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24674 !Subtarget.hasBWI())) {
24675 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24676 return splitVectorStore(St, DAG);
24677 return SDValue();
24678 }
24679
24680 if (StoreVT.is32BitVector())
24681 return SDValue();
24682
24683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24684 assert(StoreVT.is64BitVector() && "Unexpected VT");
24685 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24687 "Unexpected type action!");
24688
24689 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24690 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24691 DAG.getUNDEF(StoreVT));
24692
24693 if (Subtarget.hasSSE2()) {
24694 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24695 // and store it.
24696 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24697 MVT CastVT = MVT::getVectorVT(StVT, 2);
24698 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24699 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24700 DAG.getIntPtrConstant(0, dl));
24701
24702 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24703 St->getPointerInfo(), St->getOriginalAlign(),
24704 St->getMemOperand()->getFlags());
24705 }
24706 assert(Subtarget.hasSSE1() && "Expected SSE");
24707 SDVTList Tys = DAG.getVTList(MVT::Other);
24708 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24709 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24710 St->getMemOperand());
24711}
24712
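The i1-vector store path above can be pictured in scalar terms; the sketch below (my own, not from this file) mirrors the truncate-to-i8 plus zero-extend-in-reg steps for NumElts < 8, so the unused high bits of the stored byte are guaranteed to be zero.

    #include <cstdint>

    // Keep the low NumElts mask bits and store a single byte; the padding
    // bits come out as zero, matching the vector path above.
    uint8_t packMaskByte(uint16_t MaskBits, unsigned NumElts) {
      uint8_t Byte = static_cast<uint8_t>(MaskBits); // truncate to i8
      if (NumElts < 8)
        Byte &= static_cast<uint8_t>((1u << NumElts) - 1); // zero the unused bits
      return Byte;
    }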
24713// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24714// may emit an illegal shuffle but the expansion is still better than scalar
24715 // code. We generate sext/sext_invec for SEXTLOADs if it is available; otherwise
24716 // we'll emit a shuffle and an arithmetic shift.
24717// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24718// TODO: It is possible to support ZExt by zeroing the undef values during
24719// the shuffle phase or after the shuffle.
24720static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24721 SelectionDAG &DAG) {
24722 MVT RegVT = Op.getSimpleValueType();
24723 assert(RegVT.isVector() && "We only custom lower vector loads.");
24724 assert(RegVT.isInteger() &&
24725 "We only custom lower integer vector loads.");
24726
24727 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24728 SDLoc dl(Ld);
24729
24730 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24731 if (RegVT.getVectorElementType() == MVT::i1) {
24732 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24733 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24734 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24735 "Expected AVX512F without AVX512DQI");
24736
24737 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24738 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24739 Ld->getMemOperand()->getFlags());
24740
24741 // Replace chain users with the new chain.
24742 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24743
24744 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24745 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24746 DAG.getBitcast(MVT::v16i1, Val),
24747 DAG.getIntPtrConstant(0, dl));
24748 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24749 }
24750
24751 return SDValue();
24752}
24753
24754/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24755/// each of which has no other use apart from the AND / OR.
24756static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24757 Opc = Op.getOpcode();
24758 if (Opc != ISD::OR && Opc != ISD::AND)
24759 return false;
24760 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24761 Op.getOperand(0).hasOneUse() &&
24762 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24763 Op.getOperand(1).hasOneUse());
24764}
24765
24766SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24767 SDValue Chain = Op.getOperand(0);
24768 SDValue Cond = Op.getOperand(1);
24769 SDValue Dest = Op.getOperand(2);
24770 SDLoc dl(Op);
24771
24772 // Bail out when we don't have native compare instructions.
24773 if (Cond.getOpcode() == ISD::SETCC &&
24774 Cond.getOperand(0).getValueType() != MVT::f128 &&
24775 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24776 SDValue LHS = Cond.getOperand(0);
24777 SDValue RHS = Cond.getOperand(1);
24778 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24779
24780 // Special case for
24781 // setcc([su]{add,sub,mul}o == 0)
24782 // setcc([su]{add,sub,mul}o != 1)
24783 if (ISD::isOverflowIntrOpRes(LHS) &&
24784 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24785 (isNullConstant(RHS) || isOneConstant(RHS))) {
24786 SDValue Value, Overflow;
24787 X86::CondCode X86Cond;
24788 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24789
24790 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24791 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24792
24793 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24794 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24795 Overflow);
24796 }
24797
24798 if (LHS.getSimpleValueType().isInteger()) {
24799 SDValue CCVal;
24800 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24801 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24802 EFLAGS);
24803 }
24804
24805 if (CC == ISD::SETOEQ) {
24806 // For FCMP_OEQ, we can emit
24807 // two branches instead of an explicit AND instruction with a
24808 // separate test. However, we only do this if this block doesn't
24809 // have a fall-through edge, because this requires an explicit
24810 // jmp when the condition is false.
24811 if (Op.getNode()->hasOneUse()) {
24812 SDNode *User = *Op.getNode()->use_begin();
24813 // Look for an unconditional branch following this conditional branch.
24814 // We need this because we need to reverse the successors in order
24815 // to implement FCMP_OEQ.
24816 if (User->getOpcode() == ISD::BR) {
24817 SDValue FalseBB = User->getOperand(1);
24818 SDNode *NewBR =
24819 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24820 assert(NewBR == User);
24821 (void)NewBR;
24822 Dest = FalseBB;
24823
24824 SDValue Cmp =
24825 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24826 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24827 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24828 CCVal, Cmp);
24829 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24830 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24831 Cmp);
24832 }
24833 }
24834 } else if (CC == ISD::SETUNE) {
24835 // For FCMP_UNE, we can emit
24836 // two branches instead of an explicit OR instruction with a
24837 // separate test.
24838 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24839 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24840 Chain =
24841 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24842 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24843 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24844 Cmp);
24845 } else {
24846 X86::CondCode X86Cond =
24847 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24848 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24849 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24850 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24851 Cmp);
24852 }
24853 }
24854
24855 if (ISD::isOverflowIntrOpRes(Cond)) {
24856 SDValue Value, Overflow;
24857 X86::CondCode X86Cond;
24858 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24859
24860 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24861 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24862 Overflow);
24863 }
24864
24865 // Look past the truncate if the high bits are known zero.
24866 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24867 Cond = Cond.getOperand(0);
24868
24869 EVT CondVT = Cond.getValueType();
24870
24871 // Add an AND with 1 if we don't already have one.
24872 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24873 Cond =
24874 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24875
24876 SDValue LHS = Cond;
24877 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24878
24879 SDValue CCVal;
24880 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24881 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24882 EFLAGS);
24883}
24884
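A scalar model of the SETOEQ branch split above (my own sketch of the UCOMISD flag behaviour, not code from this file): unordered operands set ZF=1 and PF=1, so a single "not equal" branch cannot reject NaNs, which is why the lowering branches to the false block on COND_NE and again on COND_P.

    #include <cmath>

    bool orderedEqual(double LHS, double RHS) {
      bool Unordered = std::isunordered(LHS, RHS); // PF after UCOMISD
      bool ZeroFlag = Unordered || (LHS == RHS);   // ZF after UCOMISD
      if (!ZeroFlag)
        return false; // COND_NE taken -> false block
      if (Unordered)
        return false; // COND_P taken -> false block
      return true;    // fall through to the true block
    }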
24885// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24886// Calls to _alloca are needed to probe the stack when allocating more than 4k
24887// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24888// that the guard pages used by the OS virtual memory manager are allocated in
24889// the correct sequence.
24890SDValue
24891X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24892 SelectionDAG &DAG) const {
24893 MachineFunction &MF = DAG.getMachineFunction();
24894 bool SplitStack = MF.shouldSplitStack();
24895 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24896 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24897 SplitStack || EmitStackProbeCall;
24898 SDLoc dl(Op);
24899
24900 // Get the inputs.
24901 SDNode *Node = Op.getNode();
24902 SDValue Chain = Op.getOperand(0);
24903 SDValue Size = Op.getOperand(1);
24904 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24905 EVT VT = Node->getValueType(0);
24906
24907 // Chain the dynamic stack allocation so that it doesn't modify the stack
24908 // pointer when other instructions are using the stack.
24909 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24910
24911 bool Is64Bit = Subtarget.is64Bit();
24912 MVT SPTy = getPointerTy(DAG.getDataLayout());
24913
24914 SDValue Result;
24915 if (!Lower) {
24916 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24917 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24918 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24919 " not tell us which reg is the stack pointer!");
24920
24921 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24922 const Align StackAlign = TFI.getStackAlign();
24923 if (hasInlineStackProbe(MF)) {
24924 MachineRegisterInfo &MRI = MF.getRegInfo();
24925
24926 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24927 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24928 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24929 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24930 DAG.getRegister(Vreg, SPTy));
24931 } else {
24932 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24933 Chain = SP.getValue(1);
24934 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24935 }
24936 if (Alignment && *Alignment > StackAlign)
24937 Result =
24938 DAG.getNode(ISD::AND, dl, VT, Result,
24939 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24940 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24941 } else if (SplitStack) {
24942 MachineRegisterInfo &MRI = MF.getRegInfo();
24943
24944 if (Is64Bit) {
24945 // The 64-bit implementation of segmented stacks needs to clobber both r10
24946 // and r11. This makes it impossible to use it along with nested parameters.
24947 const Function &F = MF.getFunction();
24948 for (const auto &A : F.args()) {
24949 if (A.hasNestAttr())
24950 report_fatal_error("Cannot use segmented stacks with functions that "
24951 "have nested arguments.");
24952 }
24953 }
24954
24955 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24956 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24957 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24958 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24959 DAG.getRegister(Vreg, SPTy));
24960 } else {
24961 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24962 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24963 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24964
24965 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24966 Register SPReg = RegInfo->getStackRegister();
24967 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24968 Chain = SP.getValue(1);
24969
24970 if (Alignment) {
24971 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24972 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24973 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24974 }
24975
24976 Result = SP;
24977 }
24978
24979 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24980
24981 SDValue Ops[2] = {Result, Chain};
24982 return DAG.getMergeValues(Ops, dl);
24983}
24984
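The over-alignment step used in both branches above is a plain align-down of the new stack pointer. A minimal sketch (mine, not from this file), assuming a power-of-two alignment as the code does:

    #include <cassert>
    #include <cstdint>

    // Clear the low bits of the new stack pointer; because the stack grows
    // down, rounding down keeps the allocation inside the reserved region.
    uint64_t alignStackDown(uint64_t SP, uint64_t Alignment) {
      assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
             "alignment must be a power of two");
      return SP & ~(Alignment - 1);
    }
    // e.g. alignStackDown(0x7fffffffe48c, 64) == 0x7fffffffe480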
24985SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24986 MachineFunction &MF = DAG.getMachineFunction();
24987 auto PtrVT = getPointerTy(MF.getDataLayout());
24988 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24989
24990 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24991 SDLoc DL(Op);
24992
24993 if (!Subtarget.is64Bit() ||
24994 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24995 // vastart just stores the address of the VarArgsFrameIndex slot into the
24996 // memory location argument.
24997 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24998 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24999 MachinePointerInfo(SV));
25000 }
25001
25002 // __va_list_tag:
25003 // gp_offset (0 - 6 * 8)
25004 // fp_offset (48 - 48 + 8 * 16)
25005 // overflow_arg_area (points to parameters passed in memory).
25006 // reg_save_area
25007 SmallVector<SDValue, 8> MemOps;
25008 SDValue FIN = Op.getOperand(1);
25009 // Store gp_offset
25010 SDValue Store = DAG.getStore(
25011 Op.getOperand(0), DL,
25012 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
25013 MachinePointerInfo(SV));
25014 MemOps.push_back(Store);
25015
25016 // Store fp_offset
25017 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25018 Store = DAG.getStore(
25019 Op.getOperand(0), DL,
25020 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25021 MachinePointerInfo(SV, 4));
25022 MemOps.push_back(Store);
25023
25024 // Store ptr to overflow_arg_area
25025 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25026 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25027 Store =
25028 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25029 MemOps.push_back(Store);
25030
25031 // Store ptr to reg_save_area.
25032 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25033 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25034 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25035 Store = DAG.getStore(
25036 Op.getOperand(0), DL, RSFIN, FIN,
25037 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25038 MemOps.push_back(Store);
25039 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25040}
25041
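For reference, a C++ sketch of the SysV x86-64 __va_list_tag record those four stores initialize. The field names here are descriptive rather than the ABI's exact identifiers, and the layout shown assumes an LP64 target; offsets are 0, 4, 8 and 16 (0, 4, 8 and 12 for the X32 variant), matching the store offsets above.

    #include <cstdint>

    struct VaListTag {
      uint32_t GpOffset;      // 0 .. 6 * 8: next general-purpose register slot
      uint32_t FpOffset;      // 48 .. 48 + 8 * 16: next XMM register slot
      void *OverflowArgArea;  // arguments passed on the stack
      void *RegSaveArea;      // spilled register arguments
    };
    static_assert(sizeof(VaListTag) == 24,
                  "24-byte record copied by va_copy on LP64");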
25042SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25043 assert(Subtarget.is64Bit() &&
25044 "LowerVAARG only handles 64-bit va_arg!");
25045 assert(Op.getNumOperands() == 4);
25046
25047 MachineFunction &MF = DAG.getMachineFunction();
25048 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25049 // The Win64 ABI uses char* instead of a structure.
25050 return DAG.expandVAArg(Op.getNode());
25051
25052 SDValue Chain = Op.getOperand(0);
25053 SDValue SrcPtr = Op.getOperand(1);
25054 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25055 unsigned Align = Op.getConstantOperandVal(3);
25056 SDLoc dl(Op);
25057
25058 EVT ArgVT = Op.getNode()->getValueType(0);
25059 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25060 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25061 uint8_t ArgMode;
25062
25063 // Decide which area this value should be read from.
25064 // TODO: Implement the AMD64 ABI in its entirety. This simple
25065 // selection mechanism works only for the basic types.
25066 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25067 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25068 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25069 } else {
25070 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25071 "Unhandled argument type in LowerVAARG");
25072 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25073 }
25074
25075 if (ArgMode == 2) {
25076 // Make sure using fp_offset makes sense.
25077 assert(!Subtarget.useSoftFloat() &&
25078 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25079 Subtarget.hasSSE1());
25080 }
25081
25082 // Insert VAARG node into the DAG
25083 // VAARG returns two values: Variable Argument Address, Chain
25084 SDValue InstOps[] = {Chain, SrcPtr,
25085 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25086 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25087 DAG.getTargetConstant(Align, dl, MVT::i32)};
25088 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25089 SDValue VAARG = DAG.getMemIntrinsicNode(
25090 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25091 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25092 /*Alignment=*/std::nullopt,
25093 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25094 Chain = VAARG.getValue(1);
25095
25096 // Load the next argument and return it
25097 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25098}
25099
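The ArgMode decision above, isolated as a tiny sketch (mine, not part of this file): SSE-class values up to 16 bytes are read from the XMM save area via fp_offset, while integer values up to 32 bytes come from the GPR save area via gp_offset, using the same 1/2 encoding as the code.

    // 2 = read via fp_offset (XMM save area), 1 = read via gp_offset (GPR save area).
    unsigned classifyVAArgArea(bool IsFloatingPoint, unsigned ArgSizeInBytes) {
      if (IsFloatingPoint && ArgSizeInBytes <= 16)
        return 2; // fp_offset
      return 1;   // gp_offset (integer types up to 32 bytes)
    }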
25100static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25101 SelectionDAG &DAG) {
25102 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25103 // where a va_list is still an i8*.
25104 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25105 if (Subtarget.isCallingConvWin64(
25106 DAG.getMachineFunction().getFunction().getCallingConv()))
25107 // Probably a Win64 va_copy.
25108 return DAG.expandVACopy(Op.getNode());
25109
25110 SDValue Chain = Op.getOperand(0);
25111 SDValue DstPtr = Op.getOperand(1);
25112 SDValue SrcPtr = Op.getOperand(2);
25113 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25114 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25115 SDLoc DL(Op);
25116
25117 return DAG.getMemcpy(
25118 Chain, DL, DstPtr, SrcPtr,
25119 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25120 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25121 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25122}
25123
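In scalar terms the non-Windows path above is just a fixed-size copy of the whole __va_list_tag; a minimal sketch (mine), assuming the LP64 layout shown earlier:

    #include <cstring>

    // 24 bytes for LP64, 16 for the X32 ABI - the memcpy emitted above.
    void vaCopyLP64(void *Dst, const void *Src) {
      std::memcpy(Dst, Src, 24);
    }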
25124// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25125static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25126 switch (Opc) {
25127 case ISD::SHL:
25128 case X86ISD::VSHL:
25129 case X86ISD::VSHLI:
25130 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25131 case ISD::SRL:
25132 case X86ISD::VSRL:
25133 case X86ISD::VSRLI:
25134 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25135 case ISD::SRA:
25136 case X86ISD::VSRA:
25137 case X86ISD::VSRAI:
25138 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25139 }
25140 llvm_unreachable("Unknown target vector shift node");
25141}
25142
25143/// Handle vector element shifts where the shift amount is a constant.
25144/// Takes immediate version of shift as input.
25145static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25146 SDValue SrcOp, uint64_t ShiftAmt,
25147 SelectionDAG &DAG) {
25148 MVT ElementType = VT.getVectorElementType();
25149
25150 // Bitcast the source vector to the output type, this is mainly necessary for
25151 // vXi8/vXi64 shifts.
25152 if (VT != SrcOp.getSimpleValueType())
25153 SrcOp = DAG.getBitcast(VT, SrcOp);
25154
25155 // Fold this packed shift into its first operand if ShiftAmt is 0.
25156 if (ShiftAmt == 0)
25157 return SrcOp;
25158
25159 // Check for ShiftAmt >= element width
25160 if (ShiftAmt >= ElementType.getSizeInBits()) {
25161 if (Opc == X86ISD::VSRAI)
25162 ShiftAmt = ElementType.getSizeInBits() - 1;
25163 else
25164 return DAG.getConstant(0, dl, VT);
25165 }
25166
25167 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25168 && "Unknown target vector shift-by-constant node");
25169
25170 // Fold this packed vector shift into a build vector if SrcOp is a
25171 // vector of Constants or UNDEFs.
25172 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25173 unsigned ShiftOpc;
25174 switch (Opc) {
25175 default: llvm_unreachable("Unknown opcode!");
25176 case X86ISD::VSHLI:
25177 ShiftOpc = ISD::SHL;
25178 break;
25179 case X86ISD::VSRLI:
25180 ShiftOpc = ISD::SRL;
25181 break;
25182 case X86ISD::VSRAI:
25183 ShiftOpc = ISD::SRA;
25184 break;
25185 }
25186
25187 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25188 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25189 return C;
25190 }
25191
25192 return DAG.getNode(Opc, dl, VT, SrcOp,
25193 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25194}
25195
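The out-of-range handling above has a simple scalar analogue, sketched below (my own code, not from this file): an arithmetic right shift is clamped to width - 1 so the sign bit still fills the lane, while logical shifts by the element width or more simply produce zero.

    #include <cstdint>

    int32_t sraiLane(int32_t X, uint64_t ShiftAmt) {
      if (ShiftAmt >= 32)
        ShiftAmt = 31;       // VSRAI clamps instead of going to zero
      return X >> ShiftAmt;  // arithmetic shift on signed types
    }

    uint32_t srliLane(uint32_t X, uint64_t ShiftAmt) {
      return ShiftAmt >= 32 ? 0u : X >> ShiftAmt; // VSRLI/VSHLI go to zero
    }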
25196/// Handle vector element shifts by a splat shift amount
25197static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25198 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25199 const X86Subtarget &Subtarget,
25200 SelectionDAG &DAG) {
25201 MVT AmtVT = ShAmt.getSimpleValueType();
25202 assert(AmtVT.isVector() && "Vector shift type mismatch");
25203 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25204 "Illegal vector splat index");
25205
25206 // Move the splat element to the bottom element.
25207 if (ShAmtIdx != 0) {
25208 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25209 Mask[0] = ShAmtIdx;
25210 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25211 }
25212
25213 // Peek through any zext node if we can get back to a 128-bit source.
25214 if (AmtVT.getScalarSizeInBits() == 64 &&
25215 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25216 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25217 ShAmt.getOperand(0).getValueType().isSimple() &&
25218 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25219 ShAmt = ShAmt.getOperand(0);
25220 AmtVT = ShAmt.getSimpleValueType();
25221 }
25222
25223 // See if we can mask off the upper elements using the existing source node.
25224 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25225 // do this for vXi64 types.
25226 bool IsMasked = false;
25227 if (AmtVT.getScalarSizeInBits() < 64) {
25228 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25229 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25230 // If the shift amount has come from a scalar, then zero-extend the scalar
25231 // before moving to the vector.
25232 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25233 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25234 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25235 AmtVT = MVT::v4i32;
25236 IsMasked = true;
25237 } else if (ShAmt.getOpcode() == ISD::AND) {
25238 // See if the shift amount is already masked (e.g. for rotation modulo),
25239 // then we can zero-extend it by setting all the other mask elements to
25240 // zero.
25241 SmallVector<SDValue> MaskElts(
25242 AmtVT.getVectorNumElements(),
25243 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25244 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25245 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25246 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25247 {ShAmt.getOperand(1), Mask}))) {
25248 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25249 IsMasked = true;
25250 }
25251 }
25252 }
25253
25254 // Extract if the shift amount vector is larger than 128-bits.
25255 if (AmtVT.getSizeInBits() > 128) {
25256 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25257 AmtVT = ShAmt.getSimpleValueType();
25258 }
25259
25260 // Zero-extend bottom element to v2i64 vector type, either by extension or
25261 // shuffle masking.
25262 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25263 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25264 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25265 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25266 } else if (Subtarget.hasSSE41()) {
25267 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25268 MVT::v2i64, ShAmt);
25269 } else {
25270 SDValue ByteShift = DAG.getTargetConstant(
25271 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25272 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25273 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25274 ByteShift);
25275 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25276 ByteShift);
25277 }
25278 }
25279
25280 // Change opcode to non-immediate version.
25281 Opc = getTargetVShiftUniformOpcode(Opc, true);
25282
25283 // The return type has to be a 128-bit type with the same element
25284 // type as the input type.
25285 MVT EltVT = VT.getVectorElementType();
25286 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25287
25288 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25289 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25290}
25291
25292/// Return Mask with the necessary casting or extending
25293/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25294static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25295 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25296 const SDLoc &dl) {
25297
25298 if (isAllOnesConstant(Mask))
25299 return DAG.getConstant(1, dl, MaskVT);
25300 if (X86::isZeroNode(Mask))
25301 return DAG.getConstant(0, dl, MaskVT);
25302
25303 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25304
25305 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25306 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25307 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25308 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
25309 SDValue Lo, Hi;
25310 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25311 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25312 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25313 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25314 } else {
25315 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25316 Mask.getSimpleValueType().getSizeInBits());
25317 // In the case where MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
25318 // are extracted by EXTRACT_SUBVECTOR.
25319 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25320 DAG.getBitcast(BitcastVT, Mask),
25321 DAG.getIntPtrConstant(0, dl));
25322 }
25323}
25324
25325/// Return (and \p Op, \p Mask) for compare instructions or
25326/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25327/// necessary casting or extending for \p Mask when lowering masking intrinsics
25328static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25329 SDValue PreservedSrc,
25330 const X86Subtarget &Subtarget,
25331 SelectionDAG &DAG) {
25332 MVT VT = Op.getSimpleValueType();
25333 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25334 unsigned OpcodeSelect = ISD::VSELECT;
25335 SDLoc dl(Op);
25336
25337 if (isAllOnesConstant(Mask))
25338 return Op;
25339
25340 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25341
25342 if (PreservedSrc.isUndef())
25343 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25344 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25345}
25346
25347/// Creates an SDNode for a predicated scalar operation.
25348/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25349/// The mask is coming as MVT::i8 and it should be transformed
25350/// to MVT::v1i1 while lowering masking intrinsics.
25351/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25352/// "X86select" instead of "vselect". We just can't create the "vselect" node
25353/// for a scalar instruction.
25354static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25355 SDValue PreservedSrc,
25356 const X86Subtarget &Subtarget,
25357 SelectionDAG &DAG) {
25358
25359 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25360 if (MaskConst->getZExtValue() & 0x1)
25361 return Op;
25362
25363 MVT VT = Op.getSimpleValueType();
25364 SDLoc dl(Op);
25365
25366 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25367 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25368 DAG.getBitcast(MVT::v8i1, Mask),
25369 DAG.getIntPtrConstant(0, dl));
25370 if (Op.getOpcode() == X86ISD::FSETCCM ||
25371 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25372 Op.getOpcode() == X86ISD::VFPCLASSS)
25373 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25374
25375 if (PreservedSrc.isUndef())
25376 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25377 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25378}
25379
25380static int getSEHRegistrationNodeSize(const Function *Fn) {
25381 if (!Fn->hasPersonalityFn())
25382 report_fatal_error(
25383 "querying registration node size for function without personality");
25384 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25385 // WinEHStatePass for the full struct definition.
25386 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25387 case EHPersonality::MSVC_X86SEH: return 24;
25388 case EHPersonality::MSVC_CXX: return 16;
25389 default: break;
25390 }
25392 "can only recover FP for 32-bit MSVC EH personality functions");
25393}
25394
25395/// When the MSVC runtime transfers control to us, either to an outlined
25396/// function or when returning to a parent frame after catching an exception, we
25397/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25398/// Here's the math:
25399/// RegNodeBase = EntryEBP - RegNodeSize
25400/// ParentFP = RegNodeBase - ParentFrameOffset
25401/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25402/// subtracting the offset (negative on x86) takes us back to the parent FP.
25403static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25404 SDValue EntryEBP) {
25405 MachineFunction &MF = DAG.getMachineFunction();
25406 SDLoc dl;
25407
25408 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25409 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25410
25411 // It's possible that the parent function no longer has a personality function
25412 // if the exceptional code was optimized away, in which case we just return
25413 // the incoming EBP.
25414 if (!Fn->hasPersonalityFn())
25415 return EntryEBP;
25416
25417 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25418 // registration, or the .set_setframe offset.
25419 MCSymbol *OffsetSym =
25422 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25423 SDValue ParentFrameOffset =
25424 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25425
25426 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25427 // prologue to RBP in the parent function.
25428 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25429 if (Subtarget.is64Bit())
25430 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25431
25432 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25433 // RegNodeBase = EntryEBP - RegNodeSize
25434 // ParentFP = RegNodeBase - ParentFrameOffset
25435 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25436 DAG.getConstant(RegNodeSize, dl, PtrVT));
25437 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25438}
25439
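The x86-32 recovery math from the comment above, written out as a scalar sketch (mine, not part of this file): RegNodeBase = EntryEBP - RegNodeSize, then ParentFP = RegNodeBase - ParentFrameOffset, where RegNodeSize is 24 for MSVC SEH and 16 for MSVC C++ EH and ParentFrameOffset is the value produced by ISD::LOCAL_RECOVER (typically negative on x86).

    #include <cstdint>

    uint32_t recoverParentFP(uint32_t EntryEBP, uint32_t RegNodeSize,
                             int32_t ParentFrameOffset) {
      uint32_t RegNodeBase = EntryEBP - RegNodeSize;
      return RegNodeBase - static_cast<uint32_t>(ParentFrameOffset);
    }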
25440SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25441 SelectionDAG &DAG) const {
25442 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25443 auto isRoundModeCurDirection = [](SDValue Rnd) {
25444 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25445 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25446
25447 return false;
25448 };
25449 auto isRoundModeSAE = [](SDValue Rnd) {
25450 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25451 unsigned RC = C->getZExtValue();
25452 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25453 // Clear the NO_EXC bit and check remaining bits.
25454 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25455 // As a convenience we allow no other bits or explicitly
25456 // current direction.
25457 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25458 }
25459 }
25460
25461 return false;
25462 };
25463 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25464 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25465 RC = C->getZExtValue();
25466 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25467 // Clear the NO_EXC bit and check remaining bits.
25468 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25469 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25470 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25471 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25472 RC == X86::STATIC_ROUNDING::TO_ZERO;
25473 }
25474 }
25475
25476 return false;
25477 };
25478
25479 SDLoc dl(Op);
25480 unsigned IntNo = Op.getConstantOperandVal(0);
25481 MVT VT = Op.getSimpleValueType();
25482 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25483
25484 // Propagate flags from original node to transformed node(s).
25485 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25486
25487 if (IntrData) {
25488 switch(IntrData->Type) {
25489 case INTR_TYPE_1OP: {
25490 // We specify 2 possible opcodes for intrinsics with rounding modes.
25491 // First, we check if the intrinsic may have non-default rounding mode,
25492 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25493 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25494 if (IntrWithRoundingModeOpcode != 0) {
25495 SDValue Rnd = Op.getOperand(2);
25496 unsigned RC = 0;
25497 if (isRoundModeSAEToX(Rnd, RC))
25498 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25499 Op.getOperand(1),
25500 DAG.getTargetConstant(RC, dl, MVT::i32));
25501 if (!isRoundModeCurDirection(Rnd))
25502 return SDValue();
25503 }
25504 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25505 Op.getOperand(1));
25506 }
25507 case INTR_TYPE_1OP_SAE: {
25508 SDValue Sae = Op.getOperand(2);
25509
25510 unsigned Opc;
25511 if (isRoundModeCurDirection(Sae))
25512 Opc = IntrData->Opc0;
25513 else if (isRoundModeSAE(Sae))
25514 Opc = IntrData->Opc1;
25515 else
25516 return SDValue();
25517
25518 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25519 }
25520 case INTR_TYPE_2OP: {
25521 SDValue Src2 = Op.getOperand(2);
25522
25523 // We specify 2 possible opcodes for intrinsics with rounding modes.
25524 // First, we check if the intrinsic may have non-default rounding mode,
25525 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25526 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25527 if (IntrWithRoundingModeOpcode != 0) {
25528 SDValue Rnd = Op.getOperand(3);
25529 unsigned RC = 0;
25530 if (isRoundModeSAEToX(Rnd, RC))
25531 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25532 Op.getOperand(1), Src2,
25533 DAG.getTargetConstant(RC, dl, MVT::i32));
25534 if (!isRoundModeCurDirection(Rnd))
25535 return SDValue();
25536 }
25537
25538 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25539 Op.getOperand(1), Src2);
25540 }
25541 case INTR_TYPE_2OP_SAE: {
25542 SDValue Sae = Op.getOperand(3);
25543
25544 unsigned Opc;
25545 if (isRoundModeCurDirection(Sae))
25546 Opc = IntrData->Opc0;
25547 else if (isRoundModeSAE(Sae))
25548 Opc = IntrData->Opc1;
25549 else
25550 return SDValue();
25551
25552 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25553 Op.getOperand(2));
25554 }
25555 case INTR_TYPE_3OP:
25556 case INTR_TYPE_3OP_IMM8: {
25557 SDValue Src1 = Op.getOperand(1);
25558 SDValue Src2 = Op.getOperand(2);
25559 SDValue Src3 = Op.getOperand(3);
25560
25561 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25562 Src3.getValueType() != MVT::i8) {
25563 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25564 }
25565
25566 // We specify 2 possible opcodes for intrinsics with rounding modes.
25567 // First, we check if the intrinsic may have non-default rounding mode,
25568 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25569 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25570 if (IntrWithRoundingModeOpcode != 0) {
25571 SDValue Rnd = Op.getOperand(4);
25572 unsigned RC = 0;
25573 if (isRoundModeSAEToX(Rnd, RC))
25574 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25575 Src1, Src2, Src3,
25576 DAG.getTargetConstant(RC, dl, MVT::i32));
25577 if (!isRoundModeCurDirection(Rnd))
25578 return SDValue();
25579 }
25580
25581 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25582 {Src1, Src2, Src3});
25583 }
25584 case INTR_TYPE_4OP_IMM8: {
25585 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25586 SDValue Src4 = Op.getOperand(4);
25587 if (Src4.getValueType() != MVT::i8) {
25588 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25589 }
25590
25591 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25592 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25593 Src4);
25594 }
25595 case INTR_TYPE_1OP_MASK: {
25596 SDValue Src = Op.getOperand(1);
25597 SDValue PassThru = Op.getOperand(2);
25598 SDValue Mask = Op.getOperand(3);
25599 // We add rounding mode to the Node when
25600 // - RC Opcode is specified and
25601 // - RC is not "current direction".
25602 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25603 if (IntrWithRoundingModeOpcode != 0) {
25604 SDValue Rnd = Op.getOperand(4);
25605 unsigned RC = 0;
25606 if (isRoundModeSAEToX(Rnd, RC))
25607 return getVectorMaskingNode(
25608 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25609 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25610 Mask, PassThru, Subtarget, DAG);
25611 if (!isRoundModeCurDirection(Rnd))
25612 return SDValue();
25613 }
25614 return getVectorMaskingNode(
25615 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25616 Subtarget, DAG);
25617 }
25618 case INTR_TYPE_1OP_MASK_SAE: {
25619 SDValue Src = Op.getOperand(1);
25620 SDValue PassThru = Op.getOperand(2);
25621 SDValue Mask = Op.getOperand(3);
25622 SDValue Rnd = Op.getOperand(4);
25623
25624 unsigned Opc;
25625 if (isRoundModeCurDirection(Rnd))
25626 Opc = IntrData->Opc0;
25627 else if (isRoundModeSAE(Rnd))
25628 Opc = IntrData->Opc1;
25629 else
25630 return SDValue();
25631
25632 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25633 Subtarget, DAG);
25634 }
25635 case INTR_TYPE_SCALAR_MASK: {
25636 SDValue Src1 = Op.getOperand(1);
25637 SDValue Src2 = Op.getOperand(2);
25638 SDValue passThru = Op.getOperand(3);
25639 SDValue Mask = Op.getOperand(4);
25640 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25641 // There are 2 kinds of intrinsics in this group:
25642 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25643 // (2) With rounding mode and sae - 7 operands.
25644 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25645 if (Op.getNumOperands() == (5U + HasRounding)) {
25646 if (HasRounding) {
25647 SDValue Rnd = Op.getOperand(5);
25648 unsigned RC = 0;
25649 if (isRoundModeSAEToX(Rnd, RC))
25650 return getScalarMaskingNode(
25651 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25652 DAG.getTargetConstant(RC, dl, MVT::i32)),
25653 Mask, passThru, Subtarget, DAG);
25654 if (!isRoundModeCurDirection(Rnd))
25655 return SDValue();
25656 }
25657 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25658 Src2),
25659 Mask, passThru, Subtarget, DAG);
25660 }
25661
25662 assert(Op.getNumOperands() == (6U + HasRounding) &&
25663 "Unexpected intrinsic form");
25664 SDValue RoundingMode = Op.getOperand(5);
25665 unsigned Opc = IntrData->Opc0;
25666 if (HasRounding) {
25667 SDValue Sae = Op.getOperand(6);
25668 if (isRoundModeSAE(Sae))
25669 Opc = IntrWithRoundingModeOpcode;
25670 else if (!isRoundModeCurDirection(Sae))
25671 return SDValue();
25672 }
25673 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25674 Src2, RoundingMode),
25675 Mask, passThru, Subtarget, DAG);
25676 }
25677 case INTR_TYPE_SCALAR_MASK_RND: {
25678 SDValue Src1 = Op.getOperand(1);
25679 SDValue Src2 = Op.getOperand(2);
25680 SDValue passThru = Op.getOperand(3);
25681 SDValue Mask = Op.getOperand(4);
25682 SDValue Rnd = Op.getOperand(5);
25683
25684 SDValue NewOp;
25685 unsigned RC = 0;
25686 if (isRoundModeCurDirection(Rnd))
25687 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25688 else if (isRoundModeSAEToX(Rnd, RC))
25689 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25690 DAG.getTargetConstant(RC, dl, MVT::i32));
25691 else
25692 return SDValue();
25693
25694 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25695 }
25696 case INTR_TYPE_SCALAR_MASK_SAE: {
25697 SDValue Src1 = Op.getOperand(1);
25698 SDValue Src2 = Op.getOperand(2);
25699 SDValue passThru = Op.getOperand(3);
25700 SDValue Mask = Op.getOperand(4);
25701 SDValue Sae = Op.getOperand(5);
25702 unsigned Opc;
25703 if (isRoundModeCurDirection(Sae))
25704 Opc = IntrData->Opc0;
25705 else if (isRoundModeSAE(Sae))
25706 Opc = IntrData->Opc1;
25707 else
25708 return SDValue();
25709
25710 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25711 Mask, passThru, Subtarget, DAG);
25712 }
25713 case INTR_TYPE_2OP_MASK: {
25714 SDValue Src1 = Op.getOperand(1);
25715 SDValue Src2 = Op.getOperand(2);
25716 SDValue PassThru = Op.getOperand(3);
25717 SDValue Mask = Op.getOperand(4);
25718 SDValue NewOp;
25719 if (IntrData->Opc1 != 0) {
25720 SDValue Rnd = Op.getOperand(5);
25721 unsigned RC = 0;
25722 if (isRoundModeSAEToX(Rnd, RC))
25723 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25724 DAG.getTargetConstant(RC, dl, MVT::i32));
25725 else if (!isRoundModeCurDirection(Rnd))
25726 return SDValue();
25727 }
25728 if (!NewOp)
25729 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25730 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25731 }
25732 case INTR_TYPE_2OP_MASK_SAE: {
25733 SDValue Src1 = Op.getOperand(1);
25734 SDValue Src2 = Op.getOperand(2);
25735 SDValue PassThru = Op.getOperand(3);
25736 SDValue Mask = Op.getOperand(4);
25737
25738 unsigned Opc = IntrData->Opc0;
25739 if (IntrData->Opc1 != 0) {
25740 SDValue Sae = Op.getOperand(5);
25741 if (isRoundModeSAE(Sae))
25742 Opc = IntrData->Opc1;
25743 else if (!isRoundModeCurDirection(Sae))
25744 return SDValue();
25745 }
25746
25747 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25748 Mask, PassThru, Subtarget, DAG);
25749 }
25750 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25751 SDValue Src1 = Op.getOperand(1);
25752 SDValue Src2 = Op.getOperand(2);
25753 SDValue Src3 = Op.getOperand(3);
25754 SDValue PassThru = Op.getOperand(4);
25755 SDValue Mask = Op.getOperand(5);
25756 SDValue Sae = Op.getOperand(6);
25757 unsigned Opc;
25758 if (isRoundModeCurDirection(Sae))
25759 Opc = IntrData->Opc0;
25760 else if (isRoundModeSAE(Sae))
25761 Opc = IntrData->Opc1;
25762 else
25763 return SDValue();
25764
25765 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25766 Mask, PassThru, Subtarget, DAG);
25767 }
25768 case INTR_TYPE_3OP_MASK_SAE: {
25769 SDValue Src1 = Op.getOperand(1);
25770 SDValue Src2 = Op.getOperand(2);
25771 SDValue Src3 = Op.getOperand(3);
25772 SDValue PassThru = Op.getOperand(4);
25773 SDValue Mask = Op.getOperand(5);
25774
25775 unsigned Opc = IntrData->Opc0;
25776 if (IntrData->Opc1 != 0) {
25777 SDValue Sae = Op.getOperand(6);
25778 if (isRoundModeSAE(Sae))
25779 Opc = IntrData->Opc1;
25780 else if (!isRoundModeCurDirection(Sae))
25781 return SDValue();
25782 }
25783 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25784 Mask, PassThru, Subtarget, DAG);
25785 }
25786 case BLENDV: {
25787 SDValue Src1 = Op.getOperand(1);
25788 SDValue Src2 = Op.getOperand(2);
25789 SDValue Src3 = Op.getOperand(3);
25790
25791 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25792 Src3 = DAG.getBitcast(MaskVT, Src3);
25793
25794 // Reverse the operands to match VSELECT order.
25795 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25796 }
25797 case VPERM_2OP : {
25798 SDValue Src1 = Op.getOperand(1);
25799 SDValue Src2 = Op.getOperand(2);
25800
25801 // Swap Src1 and Src2 in the node creation
25802 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25803 }
25804 case CFMA_OP_MASKZ:
25805 case CFMA_OP_MASK: {
25806 SDValue Src1 = Op.getOperand(1);
25807 SDValue Src2 = Op.getOperand(2);
25808 SDValue Src3 = Op.getOperand(3);
25809 SDValue Mask = Op.getOperand(4);
25810 MVT VT = Op.getSimpleValueType();
25811
25812 SDValue PassThru = Src3;
25813 if (IntrData->Type == CFMA_OP_MASKZ)
25814 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25815
25816 // We add rounding mode to the Node when
25817 // - RC Opcode is specified and
25818 // - RC is not "current direction".
25819 SDValue NewOp;
25820 if (IntrData->Opc1 != 0) {
25821 SDValue Rnd = Op.getOperand(5);
25822 unsigned RC = 0;
25823 if (isRoundModeSAEToX(Rnd, RC))
25824 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25825 DAG.getTargetConstant(RC, dl, MVT::i32));
25826 else if (!isRoundModeCurDirection(Rnd))
25827 return SDValue();
25828 }
25829 if (!NewOp)
25830 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25831 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25832 }
25833 case IFMA_OP:
25834 // NOTE: We need to swizzle the operands to pass the multiply operands
25835 // first.
25836 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25837 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25838 case FPCLASSS: {
25839 SDValue Src1 = Op.getOperand(1);
25840 SDValue Imm = Op.getOperand(2);
25841 SDValue Mask = Op.getOperand(3);
25842 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25843 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25844 Subtarget, DAG);
25845 // Need to fill with zeros to ensure the bitcast will produce zeroes
25846 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25847 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25848 DAG.getConstant(0, dl, MVT::v8i1),
25849 FPclassMask, DAG.getIntPtrConstant(0, dl));
25850 return DAG.getBitcast(MVT::i8, Ins);
25851 }
25852
25853 case CMP_MASK_CC: {
25854 MVT MaskVT = Op.getSimpleValueType();
25855 SDValue CC = Op.getOperand(3);
25856 SDValue Mask = Op.getOperand(4);
25857 // We specify 2 possible opcodes for intrinsics with rounding modes.
25858 // First, we check if the intrinsic may have non-default rounding mode,
25859 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25860 if (IntrData->Opc1 != 0) {
25861 SDValue Sae = Op.getOperand(5);
25862 if (isRoundModeSAE(Sae))
25863 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25864 Op.getOperand(2), CC, Mask, Sae);
25865 if (!isRoundModeCurDirection(Sae))
25866 return SDValue();
25867 }
25868 //default rounding mode
25869 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25870 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25871 }
25872 case CMP_MASK_SCALAR_CC: {
25873 SDValue Src1 = Op.getOperand(1);
25874 SDValue Src2 = Op.getOperand(2);
25875 SDValue CC = Op.getOperand(3);
25876 SDValue Mask = Op.getOperand(4);
25877
25878 SDValue Cmp;
25879 if (IntrData->Opc1 != 0) {
25880 SDValue Sae = Op.getOperand(5);
25881 if (isRoundModeSAE(Sae))
25882 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25883 else if (!isRoundModeCurDirection(Sae))
25884 return SDValue();
25885 }
25886 //default rounding mode
25887 if (!Cmp.getNode())
25888 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25889
25890 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25891 Subtarget, DAG);
25892 // Need to fill with zeros to ensure the bitcast will produce zeroes
25893 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25894 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25895 DAG.getConstant(0, dl, MVT::v8i1),
25896 CmpMask, DAG.getIntPtrConstant(0, dl));
25897 return DAG.getBitcast(MVT::i8, Ins);
25898 }
25899 case COMI: { // Comparison intrinsics
25900 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25901 SDValue LHS = Op.getOperand(1);
25902 SDValue RHS = Op.getOperand(2);
25903 // Some conditions require the operands to be swapped.
25904 if (CC == ISD::SETLT || CC == ISD::SETLE)
25905 std::swap(LHS, RHS);
25906
25907 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25908 SDValue SetCC;
25909 switch (CC) {
25910 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25911 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25912 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25913 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25914 break;
25915 }
25916 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25917 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25918 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25919 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25920 break;
25921 }
25922 case ISD::SETGT: // (CF = 0 and ZF = 0)
25923 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25924 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25925 break;
25926 }
25927 case ISD::SETGE: // CF = 0
25928 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25929 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25930 break;
25931 default:
25932 llvm_unreachable("Unexpected illegal condition!");
25933 }
25934 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25935 }
25936 case COMI_RM: { // Comparison intrinsics with Sae
25937 SDValue LHS = Op.getOperand(1);
25938 SDValue RHS = Op.getOperand(2);
25939 unsigned CondVal = Op.getConstantOperandVal(3);
25940 SDValue Sae = Op.getOperand(4);
25941
25942 SDValue FCmp;
25943 if (isRoundModeCurDirection(Sae))
25944 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25945 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25946 else if (isRoundModeSAE(Sae))
25947 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25948 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25949 else
25950 return SDValue();
25951 // Need to fill with zeros to ensure the bitcast will produce zeroes
25952 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25953 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25954 DAG.getConstant(0, dl, MVT::v16i1),
25955 FCmp, DAG.getIntPtrConstant(0, dl));
25956 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25957 DAG.getBitcast(MVT::i16, Ins));
25958 }
25959 case VSHIFT: {
25960 SDValue SrcOp = Op.getOperand(1);
25961 SDValue ShAmt = Op.getOperand(2);
25962 assert(ShAmt.getValueType() == MVT::i32 &&
25963 "Unexpected VSHIFT amount type");
25964
25965 // Catch shift-by-constant.
25966 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25967 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25968 Op.getSimpleValueType(), SrcOp,
25969 CShAmt->getZExtValue(), DAG);
25970
25971 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25972 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25973 SrcOp, ShAmt, 0, Subtarget, DAG);
25974 }
25975 case COMPRESS_EXPAND_IN_REG: {
25976 SDValue Mask = Op.getOperand(3);
25977 SDValue DataToCompress = Op.getOperand(1);
25978 SDValue PassThru = Op.getOperand(2);
25979 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25980 return Op.getOperand(1);
25981
25982 // Avoid false dependency.
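// An undef pass-through would leave whatever previously occupied the
// destination register live across the instruction, so materialize an
// explicit zero instead.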
25983 if (PassThru.isUndef())
25984 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25985
25986 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25987 Mask);
25988 }
25989 case FIXUPIMM:
25990 case FIXUPIMM_MASKZ: {
25991 SDValue Src1 = Op.getOperand(1);
25992 SDValue Src2 = Op.getOperand(2);
25993 SDValue Src3 = Op.getOperand(3);
25994 SDValue Imm = Op.getOperand(4);
25995 SDValue Mask = Op.getOperand(5);
25996 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25997 ? Src1
25998 : getZeroVector(VT, Subtarget, DAG, dl);
25999
26000 unsigned Opc = IntrData->Opc0;
26001 if (IntrData->Opc1 != 0) {
26002 SDValue Sae = Op.getOperand(6);
26003 if (isRoundModeSAE(Sae))
26004 Opc = IntrData->Opc1;
26005 else if (!isRoundModeCurDirection(Sae))
26006 return SDValue();
26007 }
26008
26009 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
26010
26011 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
26012 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26013
26014 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26015 }
26016 case ROUNDP: {
26017 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26018 // Clear the upper bits of the rounding immediate so that the legacy
26019 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
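// In the VRNDSCALE immediate, bits 7:4 select how many fraction bits to keep
// (the scale); the legacy ROUND* intrinsics only define bits 3:0.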
26020 uint64_t Round = Op.getConstantOperandVal(2);
26021 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26022 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26023 Op.getOperand(1), RoundingMode);
26024 }
26025 case ROUNDS: {
26026 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26027 // Clear the upper bits of the rounding immediate so that the legacy
26028 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26029 uint64_t Round = Op.getConstantOperandVal(3);
26030 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26031 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26032 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26033 }
26034 case BEXTRI: {
26035 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26036
26037 uint64_t Imm = Op.getConstantOperandVal(2);
26038 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26039 Op.getValueType());
26040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26041 Op.getOperand(1), Control);
26042 }
26043 // ADC/SBB
26044 case ADX: {
26045 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26046 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26047
26048 SDValue Res;
26049 // If the carry in is zero, then we should just use ADD/SUB instead of
26050 // ADC/SBB.
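// Otherwise, convert the nonzero carry-in back into CF by adding 255 to its
// low byte: the addition carries out exactly when that byte is nonzero.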
26051 if (isNullConstant(Op.getOperand(1))) {
26052 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26053 Op.getOperand(3));
26054 } else {
26055 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26056 DAG.getConstant(-1, dl, MVT::i8));
26057 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26058 Op.getOperand(3), GenCF.getValue(1));
26059 }
26060 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26061 SDValue Results[] = { SetCC, Res };
26062 return DAG.getMergeValues(Results, dl);
26063 }
26064 case CVTPD2PS_MASK:
26065 case CVTPD2DQ_MASK:
26066 case CVTQQ2PS_MASK:
26067 case TRUNCATE_TO_REG: {
26068 SDValue Src = Op.getOperand(1);
26069 SDValue PassThru = Op.getOperand(2);
26070 SDValue Mask = Op.getOperand(3);
26071
26072 if (isAllOnesConstant(Mask))
26073 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26074
26075 MVT SrcVT = Src.getSimpleValueType();
26076 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26077 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26078 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26079 {Src, PassThru, Mask});
26080 }
26081 case CVTPS2PH_MASK: {
26082 SDValue Src = Op.getOperand(1);
26083 SDValue Rnd = Op.getOperand(2);
26084 SDValue PassThru = Op.getOperand(3);
26085 SDValue Mask = Op.getOperand(4);
26086
26087 unsigned RC = 0;
26088 unsigned Opc = IntrData->Opc0;
26089 bool SAE = Src.getValueType().is512BitVector() &&
26090 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26091 if (SAE) {
26092 Opc = X86ISD::CVTPS2PH_SAE;
26093 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26094 }
26095
26096 if (isAllOnesConstant(Mask))
26097 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26098
26099 if (SAE)
26100 Opc = X86ISD::MCVTPS2PH_SAE;
26101 else
26102 Opc = IntrData->Opc1;
26103 MVT SrcVT = Src.getSimpleValueType();
26104 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26105 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26106 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26107 }
26108 case CVTNEPS2BF16_MASK: {
26109 SDValue Src = Op.getOperand(1);
26110 SDValue PassThru = Op.getOperand(2);
26111 SDValue Mask = Op.getOperand(3);
26112
26113 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26114 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26115
26116 // Break false dependency.
26117 if (PassThru.isUndef())
26118 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26119
26120 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26121 Mask);
26122 }
26123 default:
26124 break;
26125 }
26126 }
26127
26128 switch (IntNo) {
26129 default: return SDValue(); // Don't custom lower most intrinsics.
26130
26131 // ptest and testp intrinsics. The intrinsic these come from are designed to
26132 // return an integer value, not just an instruction so lower it to the ptest
26133 // or testp pattern and a setcc for the result.
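// PTEST sets ZF if (LHS & RHS) is all zeros and CF if (LHS & ~RHS) is all
// zeros; TESTPS/TESTPD do the same using only the sign bits of the FP
// elements, and KTEST does it on mask registers. The *z/*c/*nzc intrinsic
// variants read back ZF, CF, or "neither set" respectively.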
26134 case Intrinsic::x86_avx512_ktestc_b:
26135 case Intrinsic::x86_avx512_ktestc_w:
26136 case Intrinsic::x86_avx512_ktestc_d:
26137 case Intrinsic::x86_avx512_ktestc_q:
26138 case Intrinsic::x86_avx512_ktestz_b:
26139 case Intrinsic::x86_avx512_ktestz_w:
26140 case Intrinsic::x86_avx512_ktestz_d:
26141 case Intrinsic::x86_avx512_ktestz_q:
26142 case Intrinsic::x86_sse41_ptestz:
26143 case Intrinsic::x86_sse41_ptestc:
26144 case Intrinsic::x86_sse41_ptestnzc:
26145 case Intrinsic::x86_avx_ptestz_256:
26146 case Intrinsic::x86_avx_ptestc_256:
26147 case Intrinsic::x86_avx_ptestnzc_256:
26148 case Intrinsic::x86_avx_vtestz_ps:
26149 case Intrinsic::x86_avx_vtestc_ps:
26150 case Intrinsic::x86_avx_vtestnzc_ps:
26151 case Intrinsic::x86_avx_vtestz_pd:
26152 case Intrinsic::x86_avx_vtestc_pd:
26153 case Intrinsic::x86_avx_vtestnzc_pd:
26154 case Intrinsic::x86_avx_vtestz_ps_256:
26155 case Intrinsic::x86_avx_vtestc_ps_256:
26156 case Intrinsic::x86_avx_vtestnzc_ps_256:
26157 case Intrinsic::x86_avx_vtestz_pd_256:
26158 case Intrinsic::x86_avx_vtestc_pd_256:
26159 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26160 unsigned TestOpc = X86ISD::PTEST;
26161 X86::CondCode X86CC;
26162 switch (IntNo) {
26163 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26164 case Intrinsic::x86_avx512_ktestc_b:
26165 case Intrinsic::x86_avx512_ktestc_w:
26166 case Intrinsic::x86_avx512_ktestc_d:
26167 case Intrinsic::x86_avx512_ktestc_q:
26168 // CF = 1
26169 TestOpc = X86ISD::KTEST;
26170 X86CC = X86::COND_B;
26171 break;
26172 case Intrinsic::x86_avx512_ktestz_b:
26173 case Intrinsic::x86_avx512_ktestz_w:
26174 case Intrinsic::x86_avx512_ktestz_d:
26175 case Intrinsic::x86_avx512_ktestz_q:
26176 TestOpc = X86ISD::KTEST;
26177 X86CC = X86::COND_E;
26178 break;
26179 case Intrinsic::x86_avx_vtestz_ps:
26180 case Intrinsic::x86_avx_vtestz_pd:
26181 case Intrinsic::x86_avx_vtestz_ps_256:
26182 case Intrinsic::x86_avx_vtestz_pd_256:
26183 TestOpc = X86ISD::TESTP;
26184 [[fallthrough]];
26185 case Intrinsic::x86_sse41_ptestz:
26186 case Intrinsic::x86_avx_ptestz_256:
26187 // ZF = 1
26188 X86CC = X86::COND_E;
26189 break;
26190 case Intrinsic::x86_avx_vtestc_ps:
26191 case Intrinsic::x86_avx_vtestc_pd:
26192 case Intrinsic::x86_avx_vtestc_ps_256:
26193 case Intrinsic::x86_avx_vtestc_pd_256:
26194 TestOpc = X86ISD::TESTP;
26195 [[fallthrough]];
26196 case Intrinsic::x86_sse41_ptestc:
26197 case Intrinsic::x86_avx_ptestc_256:
26198 // CF = 1
26199 X86CC = X86::COND_B;
26200 break;
26201 case Intrinsic::x86_avx_vtestnzc_ps:
26202 case Intrinsic::x86_avx_vtestnzc_pd:
26203 case Intrinsic::x86_avx_vtestnzc_ps_256:
26204 case Intrinsic::x86_avx_vtestnzc_pd_256:
26205 TestOpc = X86ISD::TESTP;
26206 [[fallthrough]];
26207 case Intrinsic::x86_sse41_ptestnzc:
26208 case Intrinsic::x86_avx_ptestnzc_256:
26209 // ZF and CF = 0
26210 X86CC = X86::COND_A;
26211 break;
26212 }
26213
26214 SDValue LHS = Op.getOperand(1);
26215 SDValue RHS = Op.getOperand(2);
26216 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26217 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26218 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26219 }
26220
26221 case Intrinsic::x86_sse42_pcmpistria128:
26222 case Intrinsic::x86_sse42_pcmpestria128:
26223 case Intrinsic::x86_sse42_pcmpistric128:
26224 case Intrinsic::x86_sse42_pcmpestric128:
26225 case Intrinsic::x86_sse42_pcmpistrio128:
26226 case Intrinsic::x86_sse42_pcmpestrio128:
26227 case Intrinsic::x86_sse42_pcmpistris128:
26228 case Intrinsic::x86_sse42_pcmpestris128:
26229 case Intrinsic::x86_sse42_pcmpistriz128:
26230 case Intrinsic::x86_sse42_pcmpestriz128: {
26231 unsigned Opcode;
26232 X86::CondCode X86CC;
26233 switch (IntNo) {
26234 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26235 case Intrinsic::x86_sse42_pcmpistria128:
26236 Opcode = X86ISD::PCMPISTR;
26237 X86CC = X86::COND_A;
26238 break;
26239 case Intrinsic::x86_sse42_pcmpestria128:
26240 Opcode = X86ISD::PCMPESTR;
26241 X86CC = X86::COND_A;
26242 break;
26243 case Intrinsic::x86_sse42_pcmpistric128:
26244 Opcode = X86ISD::PCMPISTR;
26245 X86CC = X86::COND_B;
26246 break;
26247 case Intrinsic::x86_sse42_pcmpestric128:
26248 Opcode = X86ISD::PCMPESTR;
26249 X86CC = X86::COND_B;
26250 break;
26251 case Intrinsic::x86_sse42_pcmpistrio128:
26252 Opcode = X86ISD::PCMPISTR;
26253 X86CC = X86::COND_O;
26254 break;
26255 case Intrinsic::x86_sse42_pcmpestrio128:
26256 Opcode = X86ISD::PCMPESTR;
26257 X86CC = X86::COND_O;
26258 break;
26259 case Intrinsic::x86_sse42_pcmpistris128:
26260 Opcode = X86ISD::PCMPISTR;
26261 X86CC = X86::COND_S;
26262 break;
26263 case Intrinsic::x86_sse42_pcmpestris128:
26264 Opcode = X86ISD::PCMPESTR;
26265 X86CC = X86::COND_S;
26266 break;
26267 case Intrinsic::x86_sse42_pcmpistriz128:
26268 Opcode = X86ISD::PCMPISTR;
26269 X86CC = X86::COND_E;
26270 break;
26271 case Intrinsic::x86_sse42_pcmpestriz128:
26272 Opcode = X86ISD::PCMPESTR;
26273 X86CC = X86::COND_E;
26274 break;
26275 }
26276 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26277 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26278 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26279 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26280 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26281 }
26282
26283 case Intrinsic::x86_sse42_pcmpistri128:
26284 case Intrinsic::x86_sse42_pcmpestri128: {
26285 unsigned Opcode;
26286 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26287 Opcode = X86ISD::PCMPISTR;
26288 else
26289 Opcode = X86ISD::PCMPESTR;
26290
26291 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26292 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26293 return DAG.getNode(Opcode, dl, VTs, NewOps);
26294 }
26295
26296 case Intrinsic::x86_sse42_pcmpistrm128:
26297 case Intrinsic::x86_sse42_pcmpestrm128: {
26298 unsigned Opcode;
26299 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26300 Opcode = X86ISD::PCMPISTR;
26301 else
26302 Opcode = X86ISD::PCMPESTR;
26303
26304 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26305 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26306 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26307 }
26308
26309 case Intrinsic::eh_sjlj_lsda: {
26310 MachineFunction &MF = DAG.getMachineFunction();
26311 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26312 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26313 auto &Context = MF.getMMI().getContext();
26314 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26315 Twine(MF.getFunctionNumber()));
26316 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26317 DAG.getMCSymbol(S, PtrVT));
26318 }
26319
26320 case Intrinsic::x86_seh_lsda: {
26321 // Compute the symbol for the LSDA. We know it'll get emitted later.
26322 MachineFunction &MF = DAG.getMachineFunction();
26323 SDValue Op1 = Op.getOperand(1);
26324 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26325 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
26326 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26327
26328 // Generate a simple absolute symbol reference. This intrinsic is only
26329 // supported on 32-bit Windows, which isn't PIC.
26330 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26331 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26332 }
26333
26334 case Intrinsic::eh_recoverfp: {
26335 SDValue FnOp = Op.getOperand(1);
26336 SDValue IncomingFPOp = Op.getOperand(2);
26337 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26338 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26339 if (!Fn)
26340 report_fatal_error(
26341 "llvm.eh.recoverfp must take a function as the first argument");
26342 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26343 }
26344
26345 case Intrinsic::localaddress: {
26346 // Returns one of the stack, base, or frame pointer registers, depending on
26347 // which is used to reference local variables.
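// Preference order: the base pointer if one is reserved; otherwise the frame
// pointer, unless stack realignment forces locals to be addressed off the
// stack pointer.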
26348 MachineFunction &MF = DAG.getMachineFunction();
26349 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26350 unsigned Reg;
26351 if (RegInfo->hasBasePointer(MF))
26352 Reg = RegInfo->getBaseRegister();
26353 else { // Handles the SP or FP case.
26354 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26355 if (CantUseFP)
26356 Reg = RegInfo->getPtrSizedStackRegister(MF);
26357 else
26358 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26359 }
26360 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26361 }
26362 case Intrinsic::x86_avx512_vp2intersect_q_512:
26363 case Intrinsic::x86_avx512_vp2intersect_q_256:
26364 case Intrinsic::x86_avx512_vp2intersect_q_128:
26365 case Intrinsic::x86_avx512_vp2intersect_d_512:
26366 case Intrinsic::x86_avx512_vp2intersect_d_256:
26367 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26368 MVT MaskVT = Op.getSimpleValueType();
26369
26370 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26371 SDLoc DL(Op);
26372
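// VP2INTERSECT writes a pair of mask registers; model the pair as an Untyped
// node and extract the two k-registers via the sub_mask_0/sub_mask_1
// subregister indices below.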
26373 SDValue Operation =
26374 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26375 Op->getOperand(1), Op->getOperand(2));
26376
26377 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26378 MaskVT, Operation);
26379 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26380 MaskVT, Operation);
26381 return DAG.getMergeValues({Result0, Result1}, DL);
26382 }
26383 case Intrinsic::x86_mmx_pslli_w:
26384 case Intrinsic::x86_mmx_pslli_d:
26385 case Intrinsic::x86_mmx_pslli_q:
26386 case Intrinsic::x86_mmx_psrli_w:
26387 case Intrinsic::x86_mmx_psrli_d:
26388 case Intrinsic::x86_mmx_psrli_q:
26389 case Intrinsic::x86_mmx_psrai_w:
26390 case Intrinsic::x86_mmx_psrai_d: {
26391 SDLoc DL(Op);
26392 SDValue ShAmt = Op.getOperand(2);
26393 // If the argument is a constant, convert it to a target constant.
26394 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26395 // Clamp out of bounds shift amounts since they will otherwise be masked
26396 // to 8-bits which may make it no longer out of bounds.
26397 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26398 if (ShiftAmount == 0)
26399 return Op.getOperand(1);
26400
26401 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26402 Op.getOperand(0), Op.getOperand(1),
26403 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26404 }
26405
26406 unsigned NewIntrinsic;
26407 switch (IntNo) {
26408 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26409 case Intrinsic::x86_mmx_pslli_w:
26410 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26411 break;
26412 case Intrinsic::x86_mmx_pslli_d:
26413 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26414 break;
26415 case Intrinsic::x86_mmx_pslli_q:
26416 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26417 break;
26418 case Intrinsic::x86_mmx_psrli_w:
26419 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26420 break;
26421 case Intrinsic::x86_mmx_psrli_d:
26422 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26423 break;
26424 case Intrinsic::x86_mmx_psrli_q:
26425 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26426 break;
26427 case Intrinsic::x86_mmx_psrai_w:
26428 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26429 break;
26430 case Intrinsic::x86_mmx_psrai_d:
26431 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26432 break;
26433 }
26434
26435 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
26436 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26437 // MMX register.
26438 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26439 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26440 DAG.getTargetConstant(NewIntrinsic, DL,
26441 getPointerTy(DAG.getDataLayout())),
26442 Op.getOperand(1), ShAmt);
26443 }
26444 case Intrinsic::thread_pointer: {
26445 if (Subtarget.isTargetELF()) {
26446 SDLoc dl(Op);
26447 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26448 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
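// On ELF the first word of the TCB is a self pointer, so a load from address
// 0 in the FS/GS address space yields the thread pointer itself.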
26449 Value *Ptr = Constant::getNullValue(PointerType::get(
26450 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26451 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26452 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
26453 }
26454 report_fatal_error(
26455 "Target OS doesn't support __builtin_thread_pointer() yet.");
26456 }
26457 }
26458}
26459
26460 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26461 SDValue Src, SDValue Mask, SDValue Base,
26462 SDValue Index, SDValue ScaleOp, SDValue Chain,
26463 const X86Subtarget &Subtarget) {
26464 SDLoc dl(Op);
26465 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26466 // Scale must be constant.
26467 if (!C)
26468 return SDValue();
26469 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26470 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26471 TLI.getPointerTy(DAG.getDataLayout()));
26472 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26473 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26474 // If source is undef or we know it won't be used, use a zero vector
26475 // to break register dependency.
26476 // TODO: use undef instead and let BreakFalseDeps deal with it?
26477 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26478 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26479
26480 // Cast mask to an integer type.
26481 Mask = DAG.getBitcast(MaskVT, Mask);
26482
26483 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26484
26485 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26486 SDValue Res =
26487 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26488 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26489 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26490}
26491
26492 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26493 SDValue Src, SDValue Mask, SDValue Base,
26494 SDValue Index, SDValue ScaleOp, SDValue Chain,
26495 const X86Subtarget &Subtarget) {
26496 MVT VT = Op.getSimpleValueType();
26497 SDLoc dl(Op);
26498 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26499 // Scale must be constant.
26500 if (!C)
26501 return SDValue();
26502 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26503 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26504 TLI.getPointerTy(DAG.getDataLayout()));
26505 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26506 VT.getVectorNumElements());
26507 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26508
26509 // We support two versions of the gather intrinsics. One with scalar mask and
26510 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26511 if (Mask.getValueType() != MaskVT)
26512 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26513
26514 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26515 // If source is undef or we know it won't be used, use a zero vector
26516 // to break register dependency.
26517 // TODO: use undef instead and let BreakFalseDeps deal with it?
26518 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26519 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26520
26521 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26522
26523 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26524 SDValue Res =
26525 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26526 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26527 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26528}
26529
26530static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26531 SDValue Src, SDValue Mask, SDValue Base,
26532 SDValue Index, SDValue ScaleOp, SDValue Chain,
26533 const X86Subtarget &Subtarget) {
26534 SDLoc dl(Op);
26535 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26536 // Scale must be constant.
26537 if (!C)
26538 return SDValue();
26539 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26540 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26541 TLI.getPointerTy(DAG.getDataLayout()));
26542 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26543 Src.getSimpleValueType().getVectorNumElements());
26544 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26545
26546 // We support two versions of the scatter intrinsics. One with scalar mask and
26547 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26548 if (Mask.getValueType() != MaskVT)
26549 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26550
26551 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26552
26553 SDVTList VTs = DAG.getVTList(MVT::Other);
26554 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26555 SDValue Res =
26556 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26557 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26558 return Res;
26559}
26560
26561static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26562 SDValue Mask, SDValue Base, SDValue Index,
26563 SDValue ScaleOp, SDValue Chain,
26564 const X86Subtarget &Subtarget) {
26565 SDLoc dl(Op);
26566 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26567 // Scale must be constant.
26568 if (!C)
26569 return SDValue();
26570 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26571 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26572 TLI.getPointerTy(DAG.getDataLayout()));
26573 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26574 SDValue Segment = DAG.getRegister(0, MVT::i32);
26575 MVT MaskVT =
26576 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26577 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26578 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26579 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26580 return SDValue(Res, 0);
26581}
26582
26583/// Handles the lowering of builtin intrinsics with chain that return their
26584/// value into registers EDX:EAX.
26585/// If operand ScrReg is a valid register identifier, then operand 2 of N is
26586/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26587/// TargetOpcode.
26588/// Returns a Glue value which can be used to add extra copy-from-reg if the
26589/// expanded intrinsics implicitly defines extra registers (i.e. not just
26590/// EDX:EAX).
26592 SelectionDAG &DAG,
26593 unsigned TargetOpcode,
26594 unsigned SrcReg,
26595 const X86Subtarget &Subtarget,
26597 SDValue Chain = N->getOperand(0);
26598 SDValue Glue;
26599
26600 if (SrcReg) {
26601 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26602 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26603 Glue = Chain.getValue(1);
26604 }
26605
26606 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26607 SDValue N1Ops[] = {Chain, Glue};
26608 SDNode *N1 = DAG.getMachineNode(
26609 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26610 Chain = SDValue(N1, 0);
26611
26612 // The expanded instruction leaves its result in registers EDX:EAX; read both halves.
26613 SDValue LO, HI;
26614 if (Subtarget.is64Bit()) {
26615 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26616 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26617 LO.getValue(2));
26618 } else {
26619 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26620 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26621 LO.getValue(2));
26622 }
26623 Chain = HI.getValue(1);
26624 Glue = HI.getValue(2);
26625
26626 if (Subtarget.is64Bit()) {
26627 // Merge the two 32-bit values into a 64-bit one.
26628 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26629 DAG.getConstant(32, DL, MVT::i8));
26630 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26631 Results.push_back(Chain);
26632 return Glue;
26633 }
26634
26635 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26636 SDValue Ops[] = { LO, HI };
26637 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26638 Results.push_back(Pair);
26639 Results.push_back(Chain);
26640 return Glue;
26641}
26642
26643/// Handles the lowering of builtin intrinsics that read the time stamp counter
26644/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26645/// READCYCLECOUNTER nodes.
26646static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26647 SelectionDAG &DAG,
26648 const X86Subtarget &Subtarget,
26649 SmallVectorImpl<SDValue> &Results) {
26650 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26651 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26652 // and the EAX register is loaded with the low-order 32 bits.
26653 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26654 /* NoRegister */0, Subtarget,
26655 Results);
26656 if (Opcode != X86::RDTSCP)
26657 return;
26658
26659 SDValue Chain = Results[1];
26660 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
26661 // the ECX register. Add 'ecx' explicitly to the chain.
26662 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26663 Results[1] = ecx;
26664 Results.push_back(ecx.getValue(1));
26665}
26666
26667 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26668 SelectionDAG &DAG) {
26669 SmallVector<SDValue, 3> Results;
26670 SDLoc DL(Op);
26671 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26672 Results);
26673 return DAG.getMergeValues(Results, DL);
26674}
26675
26676 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26677 MachineFunction &MF = DAG.getMachineFunction();
26678 SDValue Chain = Op.getOperand(0);
26679 SDValue RegNode = Op.getOperand(2);
26680 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26681 if (!EHInfo)
26682 report_fatal_error("EH registrations only live in functions using WinEH");
26683
26684 // Cast the operand to an alloca, and remember the frame index.
26685 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26686 if (!FINode)
26687 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26688 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26689
26690 // Return the chain operand without making any DAG nodes.
26691 return Chain;
26692}
26693
26694 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26695 MachineFunction &MF = DAG.getMachineFunction();
26696 SDValue Chain = Op.getOperand(0);
26697 SDValue EHGuard = Op.getOperand(2);
26698 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26699 if (!EHInfo)
26700 report_fatal_error("EHGuard only live in functions using WinEH");
26701
26702 // Cast the operand to an alloca, and remember the frame index.
26703 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26704 if (!FINode)
26705 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26706 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26707
26708 // Return the chain operand without making any DAG nodes.
26709 return Chain;
26710}
26711
26712/// Emit Truncating Store with signed or unsigned saturation.
26713static SDValue
26714EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26715 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26716 SelectionDAG &DAG) {
26717 SDVTList VTs = DAG.getVTList(MVT::Other);
26718 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26719 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26720 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26721 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26722}
26723
26724/// Emit Masked Truncating Store with signed or unsigned saturation.
26725static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26726 const SDLoc &DL,
26727 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26728 MachineMemOperand *MMO, SelectionDAG &DAG) {
26729 SDVTList VTs = DAG.getVTList(MVT::Other);
26730 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26731 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26732 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26733}
26734
26735 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
26736 const MachineFunction &MF) {
26737 if (!Subtarget.is64Bit())
26738 return false;
26739 // 64-bit targets support extended Swift async frame setup,
26740 // except for targets that use the windows 64 prologue.
26741 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26742}
26743
26744 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26745 SelectionDAG &DAG) {
26746 unsigned IntNo = Op.getConstantOperandVal(1);
26747 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26748 if (!IntrData) {
26749 switch (IntNo) {
26750
26751 case Intrinsic::swift_async_context_addr: {
26752 SDLoc dl(Op);
26753 auto &MF = DAG.getMachineFunction();
26754 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26755 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26756 MF.getFrameInfo().setFrameAddressIsTaken(true);
26757 X86FI->setHasSwiftAsyncContext(true);
26758 SDValue Chain = Op->getOperand(0);
26759 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26760 SDValue Result =
26761 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26762 DAG.getTargetConstant(8, dl, MVT::i32)),
26763 0);
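// With the extended Swift async frame, the context slot sits directly below
// the saved frame pointer, so its address is simply RBP - 8.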
26764 // Return { result, chain }.
26765 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26766 CopyRBP.getValue(1));
26767 } else {
26768 // No special extended frame, create or reuse an existing stack slot.
26769 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26770 if (!X86FI->getSwiftAsyncContextFrameIdx())
26771 X86FI->setSwiftAsyncContextFrameIdx(
26772 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26773 false));
26774 SDValue Result =
26775 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26776 PtrSize == 8 ? MVT::i64 : MVT::i32);
26777 // Return { result, chain }.
26778 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26779 Op->getOperand(0));
26780 }
26781 }
26782
26783 case llvm::Intrinsic::x86_seh_ehregnode:
26784 return MarkEHRegistrationNode(Op, DAG);
26785 case llvm::Intrinsic::x86_seh_ehguard:
26786 return MarkEHGuard(Op, DAG);
26787 case llvm::Intrinsic::x86_rdpkru: {
26788 SDLoc dl(Op);
26789 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26790 // Create a RDPKRU node and pass 0 to the ECX parameter.
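// The hardware requires ECX to be zero for RDPKRU; any other value raises #GP.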
26791 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26792 DAG.getConstant(0, dl, MVT::i32));
26793 }
26794 case llvm::Intrinsic::x86_wrpkru: {
26795 SDLoc dl(Op);
26796 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26797 // to the EDX and ECX parameters.
26798 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26799 Op.getOperand(0), Op.getOperand(2),
26800 DAG.getConstant(0, dl, MVT::i32),
26801 DAG.getConstant(0, dl, MVT::i32));
26802 }
26803 case llvm::Intrinsic::asan_check_memaccess: {
26804 // Mark this as adjustsStack because it will be lowered to a call.
26805 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
26806 // Don't do anything here, we will expand these intrinsics out later.
26807 return Op;
26808 }
26809 case llvm::Intrinsic::x86_flags_read_u32:
26810 case llvm::Intrinsic::x86_flags_read_u64:
26811 case llvm::Intrinsic::x86_flags_write_u32:
26812 case llvm::Intrinsic::x86_flags_write_u64: {
26813 // We need a frame pointer because this will get lowered to a PUSH/POP
26814 // sequence.
26815 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26816 MFI.setHasCopyImplyingStackAdjustment(true);
26817 // Don't do anything here, we will expand these intrinsics out later
26818 // during FinalizeISel in EmitInstrWithCustomInserter.
26819 return Op;
26820 }
26821 case Intrinsic::x86_lwpins32:
26822 case Intrinsic::x86_lwpins64:
26823 case Intrinsic::x86_umwait:
26824 case Intrinsic::x86_tpause: {
26825 SDLoc dl(Op);
26826 SDValue Chain = Op->getOperand(0);
26827 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26828 unsigned Opcode;
26829
26830 switch (IntNo) {
26831 default: llvm_unreachable("Impossible intrinsic");
26832 case Intrinsic::x86_umwait:
26833 Opcode = X86ISD::UMWAIT;
26834 break;
26835 case Intrinsic::x86_tpause:
26836 Opcode = X86ISD::TPAUSE;
26837 break;
26838 case Intrinsic::x86_lwpins32:
26839 case Intrinsic::x86_lwpins64:
26840 Opcode = X86ISD::LWPINS;
26841 break;
26842 }
26843
26844 SDValue Operation =
26845 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26846 Op->getOperand(3), Op->getOperand(4));
26847 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26848 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26849 Operation.getValue(1));
26850 }
26851 case Intrinsic::x86_enqcmd:
26852 case Intrinsic::x86_enqcmds: {
26853 SDLoc dl(Op);
26854 SDValue Chain = Op.getOperand(0);
26855 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26856 unsigned Opcode;
26857 switch (IntNo) {
26858 default: llvm_unreachable("Impossible intrinsic!");
26859 case Intrinsic::x86_enqcmd:
26860 Opcode = X86ISD::ENQCMD;
26861 break;
26862 case Intrinsic::x86_enqcmds:
26863 Opcode = X86ISD::ENQCMDS;
26864 break;
26865 }
26866 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26867 Op.getOperand(3));
26868 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26869 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26870 Operation.getValue(1));
26871 }
26872 case Intrinsic::x86_aesenc128kl:
26873 case Intrinsic::x86_aesdec128kl:
26874 case Intrinsic::x86_aesenc256kl:
26875 case Intrinsic::x86_aesdec256kl: {
26876 SDLoc DL(Op);
26877 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26878 SDValue Chain = Op.getOperand(0);
26879 unsigned Opcode;
26880
26881 switch (IntNo) {
26882 default: llvm_unreachable("Impossible intrinsic");
26883 case Intrinsic::x86_aesenc128kl:
26884 Opcode = X86ISD::AESENC128KL;
26885 break;
26886 case Intrinsic::x86_aesdec128kl:
26887 Opcode = X86ISD::AESDEC128KL;
26888 break;
26889 case Intrinsic::x86_aesenc256kl:
26890 Opcode = X86ISD::AESENC256KL;
26891 break;
26892 case Intrinsic::x86_aesdec256kl:
26893 Opcode = X86ISD::AESDEC256KL;
26894 break;
26895 }
26896
26897 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26898 MachineMemOperand *MMO = MemIntr->getMemOperand();
26899 EVT MemVT = MemIntr->getMemoryVT();
26900 SDValue Operation = DAG.getMemIntrinsicNode(
26901 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26902 MMO);
26903 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26904
26905 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26906 {ZF, Operation.getValue(0), Operation.getValue(2)});
26907 }
26908 case Intrinsic::x86_aesencwide128kl:
26909 case Intrinsic::x86_aesdecwide128kl:
26910 case Intrinsic::x86_aesencwide256kl:
26911 case Intrinsic::x86_aesdecwide256kl: {
26912 SDLoc DL(Op);
26913 SDVTList VTs = DAG.getVTList(
26914 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26915 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26916 SDValue Chain = Op.getOperand(0);
26917 unsigned Opcode;
26918
26919 switch (IntNo) {
26920 default: llvm_unreachable("Impossible intrinsic");
26921 case Intrinsic::x86_aesencwide128kl:
26922 Opcode = X86ISD::AESENCWIDE128KL;
26923 break;
26924 case Intrinsic::x86_aesdecwide128kl:
26925 Opcode = X86ISD::AESDECWIDE128KL;
26926 break;
26927 case Intrinsic::x86_aesencwide256kl:
26928 Opcode = X86ISD::AESENCWIDE256KL;
26929 break;
26930 case Intrinsic::x86_aesdecwide256kl:
26931 Opcode = X86ISD::AESDECWIDE256KL;
26932 break;
26933 }
26934
26935 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26936 MachineMemOperand *MMO = MemIntr->getMemOperand();
26937 EVT MemVT = MemIntr->getMemoryVT();
26938 SDValue Operation = DAG.getMemIntrinsicNode(
26939 Opcode, DL, VTs,
26940 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26941 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26942 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26943 MemVT, MMO);
26944 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26945
26946 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26947 {ZF, Operation.getValue(1), Operation.getValue(2),
26948 Operation.getValue(3), Operation.getValue(4),
26949 Operation.getValue(5), Operation.getValue(6),
26950 Operation.getValue(7), Operation.getValue(8),
26951 Operation.getValue(9)});
26952 }
26953 case Intrinsic::x86_testui: {
26954 SDLoc dl(Op);
26955 SDValue Chain = Op.getOperand(0);
26956 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26957 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26958 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26959 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26960 Operation.getValue(1));
26961 }
26962 case Intrinsic::x86_atomic_bts_rm:
26963 case Intrinsic::x86_atomic_btc_rm:
26964 case Intrinsic::x86_atomic_btr_rm: {
26965 SDLoc DL(Op);
26966 MVT VT = Op.getSimpleValueType();
26967 SDValue Chain = Op.getOperand(0);
26968 SDValue Op1 = Op.getOperand(2);
26969 SDValue Op2 = Op.getOperand(3);
26970 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26971 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26972 : X86ISD::LBTR_RM;
26973 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26974 SDValue Res =
26975 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26976 {Chain, Op1, Op2}, VT, MMO);
26977 Chain = Res.getValue(1);
26978 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26979 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26980 }
26981 case Intrinsic::x86_atomic_bts:
26982 case Intrinsic::x86_atomic_btc:
26983 case Intrinsic::x86_atomic_btr: {
26984 SDLoc DL(Op);
26985 MVT VT = Op.getSimpleValueType();
26986 SDValue Chain = Op.getOperand(0);
26987 SDValue Op1 = Op.getOperand(2);
26988 SDValue Op2 = Op.getOperand(3);
26989 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26990 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26991 : X86ISD::LBTR;
26992 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26993 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26994 SDValue Res =
26995 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26996 {Chain, Op1, Op2, Size}, VT, MMO);
26997 Chain = Res.getValue(1);
26998 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
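// The tested bit comes back in CF; shift it back to its original bit
// position within the result.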
26999 unsigned Imm = Op2->getAsZExtVal();
27000 if (Imm)
27001 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
27002 DAG.getShiftAmountConstant(Imm, VT, DL));
27003 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
27004 }
27005 case Intrinsic::x86_cmpccxadd32:
27006 case Intrinsic::x86_cmpccxadd64: {
27007 SDLoc DL(Op);
27008 SDValue Chain = Op.getOperand(0);
27009 SDValue Addr = Op.getOperand(2);
27010 SDValue Src1 = Op.getOperand(3);
27011 SDValue Src2 = Op.getOperand(4);
27012 SDValue CC = Op.getOperand(5);
27013 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27014 SDValue Operation = DAG.getMemIntrinsicNode(
27015 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27016 MVT::i32, MMO);
27017 return Operation;
27018 }
27019 case Intrinsic::x86_aadd32:
27020 case Intrinsic::x86_aadd64:
27021 case Intrinsic::x86_aand32:
27022 case Intrinsic::x86_aand64:
27023 case Intrinsic::x86_aor32:
27024 case Intrinsic::x86_aor64:
27025 case Intrinsic::x86_axor32:
27026 case Intrinsic::x86_axor64: {
27027 SDLoc DL(Op);
27028 SDValue Chain = Op.getOperand(0);
27029 SDValue Op1 = Op.getOperand(2);
27030 SDValue Op2 = Op.getOperand(3);
27031 MVT VT = Op2.getSimpleValueType();
27032 unsigned Opc = 0;
27033 switch (IntNo) {
27034 default:
27035 llvm_unreachable("Unknown Intrinsic");
27036 case Intrinsic::x86_aadd32:
27037 case Intrinsic::x86_aadd64:
27038 Opc = X86ISD::AADD;
27039 break;
27040 case Intrinsic::x86_aand32:
27041 case Intrinsic::x86_aand64:
27042 Opc = X86ISD::AAND;
27043 break;
27044 case Intrinsic::x86_aor32:
27045 case Intrinsic::x86_aor64:
27046 Opc = X86ISD::AOR;
27047 break;
27048 case Intrinsic::x86_axor32:
27049 case Intrinsic::x86_axor64:
27050 Opc = X86ISD::AXOR;
27051 break;
27052 }
27053 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27054 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27055 {Chain, Op1, Op2}, VT, MMO);
27056 }
27057 case Intrinsic::x86_atomic_add_cc:
27058 case Intrinsic::x86_atomic_sub_cc:
27059 case Intrinsic::x86_atomic_or_cc:
27060 case Intrinsic::x86_atomic_and_cc:
27061 case Intrinsic::x86_atomic_xor_cc: {
27062 SDLoc DL(Op);
27063 SDValue Chain = Op.getOperand(0);
27064 SDValue Op1 = Op.getOperand(2);
27065 SDValue Op2 = Op.getOperand(3);
27066 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27067 MVT VT = Op2.getSimpleValueType();
27068 unsigned Opc = 0;
27069 switch (IntNo) {
27070 default:
27071 llvm_unreachable("Unknown Intrinsic");
27072 case Intrinsic::x86_atomic_add_cc:
27073 Opc = X86ISD::LADD;
27074 break;
27075 case Intrinsic::x86_atomic_sub_cc:
27076 Opc = X86ISD::LSUB;
27077 break;
27078 case Intrinsic::x86_atomic_or_cc:
27079 Opc = X86ISD::LOR;
27080 break;
27081 case Intrinsic::x86_atomic_and_cc:
27082 Opc = X86ISD::LAND;
27083 break;
27084 case Intrinsic::x86_atomic_xor_cc:
27085 Opc = X86ISD::LXOR;
27086 break;
27087 }
27088 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27089 SDValue LockArith =
27090 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27091 {Chain, Op1, Op2}, VT, MMO);
27092 Chain = LockArith.getValue(1);
27093 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27094 }
27095 }
27096 return SDValue();
27097 }
27098
27099 SDLoc dl(Op);
27100 switch(IntrData->Type) {
27101 default: llvm_unreachable("Unknown Intrinsic Type");
27102 case RDSEED:
27103 case RDRAND: {
27104 // Emit the node with the right value type.
27105 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27106 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27107
27108 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27109 // Otherwise return the value from Rand, which is always 0, casted to i32.
27110 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27111 DAG.getConstant(1, dl, Op->getValueType(1)),
27112 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27113 SDValue(Result.getNode(), 1)};
27114 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27115
27116 // Return { result, isValid, chain }.
27117 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27118 SDValue(Result.getNode(), 2));
27119 }
27120 case GATHER_AVX2: {
27121 SDValue Chain = Op.getOperand(0);
27122 SDValue Src = Op.getOperand(2);
27123 SDValue Base = Op.getOperand(3);
27124 SDValue Index = Op.getOperand(4);
27125 SDValue Mask = Op.getOperand(5);
27126 SDValue Scale = Op.getOperand(6);
27127 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27128 Scale, Chain, Subtarget);
27129 }
27130 case GATHER: {
27131 //gather(v1, mask, index, base, scale);
27132 SDValue Chain = Op.getOperand(0);
27133 SDValue Src = Op.getOperand(2);
27134 SDValue Base = Op.getOperand(3);
27135 SDValue Index = Op.getOperand(4);
27136 SDValue Mask = Op.getOperand(5);
27137 SDValue Scale = Op.getOperand(6);
27138 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27139 Chain, Subtarget);
27140 }
27141 case SCATTER: {
27142 //scatter(base, mask, index, v1, scale);
27143 SDValue Chain = Op.getOperand(0);
27144 SDValue Base = Op.getOperand(2);
27145 SDValue Mask = Op.getOperand(3);
27146 SDValue Index = Op.getOperand(4);
27147 SDValue Src = Op.getOperand(5);
27148 SDValue Scale = Op.getOperand(6);
27149 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27150 Scale, Chain, Subtarget);
27151 }
27152 case PREFETCH: {
27153 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27154 assert((HintVal == 2 || HintVal == 3) &&
27155 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27156 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27157 SDValue Chain = Op.getOperand(0);
27158 SDValue Mask = Op.getOperand(2);
27159 SDValue Index = Op.getOperand(3);
27160 SDValue Base = Op.getOperand(4);
27161 SDValue Scale = Op.getOperand(5);
27162 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27163 Subtarget);
27164 }
27165 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27166 case RDTSC: {
27167 SmallVector<SDValue, 2> Results;
27168 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27169 Results);
27170 return DAG.getMergeValues(Results, dl);
27171 }
27172 // Read Performance Monitoring Counters.
27173 case RDPMC:
27174 // Read Processor Register.
27175 case RDPRU:
27176 // GetExtended Control Register.
27177 case XGETBV: {
27178 SmallVector<SDValue, 2> Results;
27179
27180 // RDPMC uses ECX to select the index of the performance counter to read.
27181 // RDPRU uses ECX to select the processor register to read.
27182 // XGETBV uses ECX to select the index of the XCR register to return.
27183 // The result is stored into registers EDX:EAX.
27184 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27185 Subtarget, Results);
27186 return DAG.getMergeValues(Results, dl);
27187 }
27188 // XTEST intrinsics.
27189 case XTEST: {
27190 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27191 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27192
27193 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27194 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27195 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27196 Ret, SDValue(InTrans.getNode(), 1));
27197 }
27198 case TRUNCATE_TO_MEM_VI8:
27199 case TRUNCATE_TO_MEM_VI16:
27200 case TRUNCATE_TO_MEM_VI32: {
27201 SDValue Mask = Op.getOperand(4);
27202 SDValue DataToTruncate = Op.getOperand(3);
27203 SDValue Addr = Op.getOperand(2);
27204 SDValue Chain = Op.getOperand(0);
27205
27206 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27207 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27208
27209 EVT MemVT = MemIntr->getMemoryVT();
27210
27211 uint16_t TruncationOp = IntrData->Opc0;
27212 switch (TruncationOp) {
27213 case X86ISD::VTRUNC: {
27214 if (isAllOnesConstant(Mask)) // return just a truncate store
27215 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27216 MemIntr->getMemOperand());
27217
27218 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27219 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27220 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27221
27222 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27223 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27224 true /* truncating */);
27225 }
27226 case X86ISD::VTRUNCUS:
27227 case X86ISD::VTRUNCS: {
27228 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27229 if (isAllOnesConstant(Mask))
27230 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27231 MemIntr->getMemOperand(), DAG);
27232
27233 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27234 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27235
27236 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27237 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27238 }
27239 default:
27240 llvm_unreachable("Unsupported truncstore intrinsic");
27241 }
27242 }
27243 }
27244}
27245
27246SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27247 SelectionDAG &DAG) const {
27248 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27249 MFI.setReturnAddressIsTaken(true);
27250
27251 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27252 return SDValue();
27253
27254 unsigned Depth = Op.getConstantOperandVal(0);
27255 SDLoc dl(Op);
27256 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27257
27258 if (Depth > 0) {
27259 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27260 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27261 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27262 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27263 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27264 MachinePointerInfo());
27265 }
27266
27267 // Just load the return address.
27268 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27269 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27270 MachinePointerInfo());
27271}
27272
27273SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27274 SelectionDAG &DAG) const {
27275 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27276 return getReturnAddressFrameIndex(DAG);
27277}
27278
27279SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27280 MachineFunction &MF = DAG.getMachineFunction();
27281 MachineFrameInfo &MFI = MF.getFrameInfo();
27282 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27283 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27284 EVT VT = Op.getValueType();
27285
27286 MFI.setFrameAddressIsTaken(true);
27287
27288 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27289 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27290 // is not possible to crawl up the stack without looking at the unwind codes
27291 // simultaneously.
27292 int FrameAddrIndex = FuncInfo->getFAIndex();
27293 if (!FrameAddrIndex) {
27294 // Set up a frame object for the return address.
27295 unsigned SlotSize = RegInfo->getSlotSize();
27296 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27297 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27298 FuncInfo->setFAIndex(FrameAddrIndex);
27299 }
27300 return DAG.getFrameIndex(FrameAddrIndex, VT);
27301 }
27302
27303 unsigned FrameReg =
27304 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27305 SDLoc dl(Op); // FIXME probably not meaningful
27306 unsigned Depth = Op.getConstantOperandVal(0);
27307 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27308 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27309 "Invalid Frame Register!");
27310 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27311 while (Depth--)
27312 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27313 MachinePointerInfo());
27314 return FrameAddr;
27315}
27316
27317// FIXME? Maybe this could be a TableGen attribute on some registers and
27318// this table could be generated automatically from RegInfo.
27319 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27320 const MachineFunction &MF) const {
27321 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27322
27323 Register Reg = StringSwitch<unsigned>(RegName)
27324 .Case("esp", X86::ESP)
27325 .Case("rsp", X86::RSP)
27326 .Case("ebp", X86::EBP)
27327 .Case("rbp", X86::RBP)
27328 .Case("r14", X86::R14)
27329 .Case("r15", X86::R15)
27330 .Default(0);
27331
27332 if (Reg == X86::EBP || Reg == X86::RBP) {
27333 if (!TFI.hasFP(MF))
27334 report_fatal_error("register " + StringRef(RegName) +
27335 " is allocatable: function has no frame pointer");
27336#ifndef NDEBUG
27337 else {
27338 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27339 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27340 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27341 "Invalid Frame Register!");
27342 }
27343#endif
27344 }
27345
27346 if (Reg)
27347 return Reg;
27348
27349 report_fatal_error("Invalid register name global variable");
27350}
27351
27352SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27353 SelectionDAG &DAG) const {
27354 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27355 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27356}
27357
27358 Register X86TargetLowering::getExceptionPointerRegister(
27359 const Constant *PersonalityFn) const {
27360 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27361 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27362
27363 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27364}
27365
27366 Register X86TargetLowering::getExceptionSelectorRegister(
27367 const Constant *PersonalityFn) const {
27368 // Funclet personalities don't use selectors (the runtime does the selection).
27369 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27370 return X86::NoRegister;
27371 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27372}
27373
27374 bool X86TargetLowering::needsFixedCatchObjects() const {
27375 return Subtarget.isTargetWin64();
27376}
27377
27378SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27379 SDValue Chain = Op.getOperand(0);
27380 SDValue Offset = Op.getOperand(1);
27381 SDValue Handler = Op.getOperand(2);
27382 SDLoc dl (Op);
27383
27384 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27385 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27386 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27387 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27388 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27389 "Invalid Frame Register!");
27390 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27391 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27392
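// Store the handler address at Frame + SlotSize + Offset and hand that
// address to EH_RETURN in ECX/RCX.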
27393 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27394 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27395 dl));
27396 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27397 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27398 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27399
27400 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27401 DAG.getRegister(StoreAddrReg, PtrVT));
27402}
27403
27404SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27405 SelectionDAG &DAG) const {
27406 SDLoc DL(Op);
27407 // If the subtarget is not 64bit, we may need the global base reg
27408 // after isel expand pseudo, i.e., after CGBR pass ran.
27409 // Therefore, ask for the GlobalBaseReg now, so that the pass
27410 // inserts the code for us in case we need it.
27411 // Otherwise, we will end up in a situation where we will
27412 // reference a virtual register that is not defined!
27413 if (!Subtarget.is64Bit()) {
27414 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27415 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27416 }
27417 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27418 DAG.getVTList(MVT::i32, MVT::Other),
27419 Op.getOperand(0), Op.getOperand(1));
27420}
27421
27422SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27423 SelectionDAG &DAG) const {
27424 SDLoc DL(Op);
27425 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27426 Op.getOperand(0), Op.getOperand(1));
27427}
27428
27429SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27430 SelectionDAG &DAG) const {
27431 SDLoc DL(Op);
27432 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27433 Op.getOperand(0));
27434}
27435
27436 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27437 return Op.getOperand(0);
27438}
27439
27440SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27441 SelectionDAG &DAG) const {
27442 SDValue Root = Op.getOperand(0);
27443 SDValue Trmp = Op.getOperand(1); // trampoline
27444 SDValue FPtr = Op.getOperand(2); // nested function
27445 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27446 SDLoc dl (Op);
27447
27448 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27449 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27450
27451 if (Subtarget.is64Bit()) {
27452 SDValue OutChains[6];
27453
27454 // Large code-model.
27455 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27456 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27457
27458 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27459 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27460
27461 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27462
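// Illustrative sketch (not from the original source) of what the stores below
// emit, offsets in bytes:
//   0..1   49 BB    movabsq imm64, %r11  (REX.WB + MOV64ri|r11)
//   2..9   <FPtr>   address of the nested function
//   10..11 49 BA    movabsq imm64, %r10  (REX.WB + MOV64ri|r10)
//   12..19 <Nest>   the 'nest' parameter value
//   20..21 49 FF    jmpq prefix/opcode   (REX.WB + JMP64r)
//   22     E3       ModRM selecting *%r11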
27463 // Load the pointer to the nested function into R11.
27464 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27465 SDValue Addr = Trmp;
27466 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27467 Addr, MachinePointerInfo(TrmpAddr));
27468
27469 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27470 DAG.getConstant(2, dl, MVT::i64));
27471 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27472 MachinePointerInfo(TrmpAddr, 2), Align(2));
27473
27474 // Load the 'nest' parameter value into R10.
27475 // R10 is specified in X86CallingConv.td
27476 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27477 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27478 DAG.getConstant(10, dl, MVT::i64));
27479 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27480 Addr, MachinePointerInfo(TrmpAddr, 10));
27481
27482 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27483 DAG.getConstant(12, dl, MVT::i64));
27484 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27485 MachinePointerInfo(TrmpAddr, 12), Align(2));
27486
27487 // Jump to the nested function.
27488 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27489 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27490 DAG.getConstant(20, dl, MVT::i64));
27491 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27492 Addr, MachinePointerInfo(TrmpAddr, 20));
27493
27494 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27495 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27496 DAG.getConstant(22, dl, MVT::i64));
27497 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27498 Addr, MachinePointerInfo(TrmpAddr, 22));
27499
27500 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27501 } else {
27502 const Function *Func =
27503 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27504 CallingConv::ID CC = Func->getCallingConv();
27505 unsigned NestReg;
27506
27507 switch (CC) {
27508 default:
27509 llvm_unreachable("Unsupported calling convention");
27510 case CallingConv::C:
27511 case CallingConv::X86_StdCall: {
27512 // Pass 'nest' parameter in ECX.
27513 // Must be kept in sync with X86CallingConv.td
27514 NestReg = X86::ECX;
27515
27516 // Check that ECX wasn't needed by an 'inreg' parameter.
27517 FunctionType *FTy = Func->getFunctionType();
27518 const AttributeList &Attrs = Func->getAttributes();
27519
27520 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27521 unsigned InRegCount = 0;
27522 unsigned Idx = 0;
27523
27524 for (FunctionType::param_iterator I = FTy->param_begin(),
27525 E = FTy->param_end(); I != E; ++I, ++Idx)
27526 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27527 const DataLayout &DL = DAG.getDataLayout();
27528 // FIXME: should only count parameters that are lowered to integers.
27529 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27530 }
27531
27532 if (InRegCount > 2) {
27533 report_fatal_error("Nest register in use - reduce number of inreg"
27534 " parameters!");
27535 }
27536 }
27537 break;
27538 }
27539 case CallingConv::X86_FastCall:
27540 case CallingConv::X86_ThisCall:
27541 case CallingConv::Fast:
27542 case CallingConv::Tail:
27543 case CallingConv::SwiftTail:
27544 // Pass 'nest' parameter in EAX.
27545 // Must be kept in sync with X86CallingConv.td
27546 NestReg = X86::EAX;
27547 break;
27548 }
27549
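// Illustrative sketch (not from the original source): on 32-bit targets the
// stores below emit a 10-byte trampoline:
//   0      B8+reg   movl $Nest, %NestReg (register encoded in the opcode byte)
//   1..4   <Nest>   the 'nest' parameter value
//   5      E9       jmp rel32
//   6..9   <Disp>   FPtr - (Trmp + 10), the displacement relative to the end
//                   of the trampoline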
27550 SDValue OutChains[4];
27551 SDValue Addr, Disp;
27552
27553 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27554 DAG.getConstant(10, dl, MVT::i32));
27555 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27556
27557 // This is storing the opcode for MOV32ri.
27558 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27559 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27560 OutChains[0] =
27561 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27562 Trmp, MachinePointerInfo(TrmpAddr));
27563
27564 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27565 DAG.getConstant(1, dl, MVT::i32));
27566 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27567 MachinePointerInfo(TrmpAddr, 1), Align(1));
27568
27569 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27570 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27571 DAG.getConstant(5, dl, MVT::i32));
27572 OutChains[2] =
27573 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27574 MachinePointerInfo(TrmpAddr, 5), Align(1));
27575
27576 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27577 DAG.getConstant(6, dl, MVT::i32));
27578 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27579 MachinePointerInfo(TrmpAddr, 6), Align(1));
27580
27581 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27582 }
27583}
27584
27585SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27586 SelectionDAG &DAG) const {
27587 /*
27588 The rounding mode is in bits 11:10 of FPSR, and has the following
27589 settings:
27590 00 Round to nearest
27591 01 Round to -inf
27592 10 Round to +inf
27593 11 Round to 0
27594
27595 GET_ROUNDING, on the other hand, expects the following:
27596 -1 Undefined
27597 0 Round to 0
27598 1 Round to nearest
27599 2 Round to +inf
27600 3 Round to -inf
27601
27602 To perform the conversion, we use a packed lookup table of the four 2-bit
27603 values that we can index by FPSR[11:10]
27604 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27605
27606 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27607 */
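// Worked check of the lookup (illustrative, not from the original source):
//   RC=00 (nearest): (0x2d >> ((0x000 & 0xc00) >> 9)) & 3 = 0x2d & 3        = 1
//   RC=01 (-inf):    (0x2d >> ((0x400 & 0xc00) >> 9)) & 3 = (0x2d >> 2) & 3 = 3
//   RC=10 (+inf):    (0x2d >> ((0x800 & 0xc00) >> 9)) & 3 = (0x2d >> 4) & 3 = 2
//   RC=11 (zero):    (0x2d >> ((0xc00 & 0xc00) >> 9)) & 3 = (0x2d >> 6) & 3 = 0
// which matches the GET_ROUNDING encoding listed above.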
27608
27609 MachineFunction &MF = DAG.getMachineFunction();
27610 MVT VT = Op.getSimpleValueType();
27611 SDLoc DL(Op);
27612
27613 // Save FP Control Word to stack slot
27614 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27615 SDValue StackSlot =
27616 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27617
27618 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27619
27620 SDValue Chain = Op.getOperand(0);
27621 SDValue Ops[] = {Chain, StackSlot};
27622 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27623 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27624 MachineMemOperand::MOStore);
27625
27626 // Load FP Control Word from stack slot
27627 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27628 Chain = CWD.getValue(1);
27629
27630 // Mask and turn the control bits into a shift for the lookup table.
27631 SDValue Shift =
27632 DAG.getNode(ISD::SRL, DL, MVT::i16,
27633 DAG.getNode(ISD::AND, DL, MVT::i16,
27634 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27635 DAG.getConstant(9, DL, MVT::i8));
27636 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27637
27638 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27639 SDValue RetVal =
27640 DAG.getNode(ISD::AND, DL, MVT::i32,
27641 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27642 DAG.getConstant(3, DL, MVT::i32));
27643
27644 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27645
27646 return DAG.getMergeValues({RetVal, Chain}, DL);
27647}
27648
27649SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27650 SelectionDAG &DAG) const {
27651 MachineFunction &MF = DAG.getMachineFunction();
27652 SDLoc DL(Op);
27653 SDValue Chain = Op.getNode()->getOperand(0);
27654
27655 // The FP control word may be set only from data in memory, so we need to
27656 // allocate stack space to save/load the FP control word.
27657 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27658 SDValue StackSlot =
27659 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27660 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27661 MachineMemOperand *MMO =
27662 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27663
27664 // Store FP control word into memory.
27665 SDValue Ops[] = {Chain, StackSlot};
27666 Chain = DAG.getMemIntrinsicNode(
27667 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27668
27669 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27670 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27671 Chain = CWD.getValue(1);
27672 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27673 DAG.getConstant(0xf3ff, DL, MVT::i16));
27674
27675 // Calculate new rounding mode.
27676 SDValue NewRM = Op.getNode()->getOperand(1);
27677 SDValue RMBits;
27678 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27679 uint64_t RM = CVal->getZExtValue();
27680 int FieldVal;
27681 switch (static_cast<RoundingMode>(RM)) {
27682 // clang-format off
27683 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27684 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27685 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27686 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27687 default:
27688 llvm_unreachable("rounding mode is not supported by X86 hardware");
27689 // clang-format on
27690 }
27691 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27692 } else {
27693 // Need to convert argument into bits of control word:
27694 // 0 Round to 0 -> 11
27695 // 1 Round to nearest -> 00
27696 // 2 Round to +inf -> 10
27697 // 3 Round to -inf -> 01
27698 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27699 // To make the conversion, put all these values into a value 0xc9 and shift
27700 // it left depending on the rounding mode:
27701 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27702 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27703 // ...
27704 // (0xc9 << (2 * NewRM + 4)) & 0xc00
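// For example (illustrative, not from the original source):
//   NewRM=0 (to zero):  (0xc9 << 4)  & 0xc00 = 0xc00  -> RC=11
//   NewRM=1 (nearest):  (0xc9 << 6)  & 0xc00 = 0x000  -> RC=00
//   NewRM=2 (+inf):     (0xc9 << 8)  & 0xc00 = 0x800  -> RC=10
//   NewRM=3 (-inf):     (0xc9 << 10) & 0xc00 = 0x400  -> RC=01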
27705 SDValue ShiftValue =
27706 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27707 DAG.getNode(ISD::ADD, DL, MVT::i32,
27708 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27709 DAG.getConstant(1, DL, MVT::i8)),
27710 DAG.getConstant(4, DL, MVT::i32)));
27711 SDValue Shifted =
27712 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27713 ShiftValue);
27714 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27715 DAG.getConstant(0xc00, DL, MVT::i16));
27716 }
27717
27718 // Update rounding mode bits and store the new FP Control Word into stack.
27719 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27720 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27721
27722 // Load FP control word from the slot.
27723 SDValue OpsLD[] = {Chain, StackSlot};
27724 MachineMemOperand *MMOL =
27725 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27726 Chain = DAG.getMemIntrinsicNode(
27727 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27728
27729 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27730 // same way but in bits 14:13.
27731 if (Subtarget.hasSSE1()) {
27732 // Store MXCSR into memory.
27733 Chain = DAG.getNode(
27734 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27735 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27736 StackSlot);
27737
27738 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27739 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27740 Chain = CWD.getValue(1);
27741 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27742 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27743
27744 // Shift X87 RM bits from 11:10 to 14:13.
27745 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27746 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27747 DAG.getConstant(3, DL, MVT::i8));
27748
27749 // Update rounding mode bits and store the new FP Control Word into stack.
27750 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27751 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27752
27753 // Load MXCSR from the slot.
27754 Chain = DAG.getNode(
27755 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27756 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27757 StackSlot);
27758 }
27759
27760 return Chain;
27761}
27762
27763const unsigned X87StateSize = 28;
27764const unsigned FPStateSize = 32;
27765[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
27766
27767SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27768 SelectionDAG &DAG) const {
27769 MachineFunction &MF = DAG.getMachineFunction();
27770 SDLoc DL(Op);
27771 SDValue Chain = Op->getOperand(0);
27772 SDValue Ptr = Op->getOperand(1);
27773 auto *Node = cast<FPStateAccessSDNode>(Op);
27774 EVT MemVT = Node->getMemoryVT();
27775 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected MemVT!");
27776 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27777
27778 // Get the x87 state, if it is present.
27779 if (Subtarget.hasX87()) {
27780 Chain =
27781 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27782 {Chain, Ptr}, MemVT, MMO);
27783
27784 // FNSTENV changes the exception mask, so load back the stored environment.
27785 MachineMemOperand::Flags NewFlags =
27786 MachineMemOperand::MOLoad |
27787 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27788 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27789 Chain =
27790 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27791 {Chain, Ptr}, MemVT, MMO);
27792 }
27793
27794 // If target supports SSE, get MXCSR as well.
27795 if (Subtarget.hasSSE1()) {
27796 // Get pointer to the MXCSR location in memory.
27797 MVT PtrVT = getPointerTy(DAG.getDataLayout());
27798 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27799 DAG.getConstant(X87StateSize, DL, PtrVT));
27800 // Store MXCSR into memory.
27801 Chain = DAG.getNode(
27802 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27803 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27804 MXCSRAddr);
27805 }
27806
27807 return Chain;
27808}
27809
27811 EVT MemVT, MachineMemOperand *MMO,
27812 SelectionDAG &DAG,
27813 const X86Subtarget &Subtarget) {
27814 // Set the x87 state, if it is present.
27815 if (Subtarget.hasX87())
27816 Chain =
27817 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27818 {Chain, Ptr}, MemVT, MMO);
27819 // If target supports SSE, set MXCSR as well.
27820 if (Subtarget.hasSSE1()) {
27821 // Get pointer to the MXCSR location in memory.
27822 MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27823 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27824 DAG.getConstant(X87StateSize, DL, PtrVT));
27825 // Load MXCSR from memory.
27826 Chain = DAG.getNode(
27827 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27828 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27829 MXCSRAddr);
27830 }
27831 return Chain;
27832}
27833
27834SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27835 SelectionDAG &DAG) const {
27836 SDLoc DL(Op);
27837 SDValue Chain = Op->getOperand(0);
27838 SDValue Ptr = Op->getOperand(1);
27839 auto *Node = cast<FPStateAccessSDNode>(Op);
27840 EVT MemVT = Node->getMemoryVT();
27841 assert(MemVT.getSizeInBits() == FPStateSizeInBits && "Unexpected MemVT!");
27842 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27843 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27844}
27845
27846SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27847 SelectionDAG &DAG) const {
27848 MachineFunction &MF = DAG.getMachineFunction();
27849 SDLoc DL(Op);
27850 SDValue Chain = Op.getNode()->getOperand(0);
27851
27852 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27853 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27854 SmallVector<Constant *, 8> FPEnvVals;
27855
27856 // x87 FPU Control Word: masks all floating-point exceptions, sets rounding to
27857 // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27858 // for compatibility with glibc.
27859 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27860 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27861 Constant *Zero = ConstantInt::get(ItemTy, 0);
27862 for (unsigned I = 0; I < 6; ++I)
27863 FPEnvVals.push_back(Zero);
27864
27865 // MXCSR: masks all floating-point exceptions, sets rounding to nearest, clears
27866 // all exception flags, and sets DAZ and FTZ to 0.
27867 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27868 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27869 auto PtrVT = getPointerTy(DAG.getDataLayout());
27870 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27871 MachinePointerInfo MPI =
27872 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27873 MachineMemOperand *MMO = MF.getMachineMemOperand(
27874 MPI, MachineMemOperand::MOStore, X87StateSize, Align(4));
27875
27876 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27877}
27878
27879/// Lower a vector CTLZ using native supported vector CTLZ instruction.
27880//
27881// i8/i16 vector implemented using dword LZCNT vector instruction
27882// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
27883 // split the vector, perform the operation on its Lo and Hi parts and
27884// concatenate the results.
27885static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27886 const X86Subtarget &Subtarget) {
27887 assert(Op.getOpcode() == ISD::CTLZ);
27888 SDLoc dl(Op);
27889 MVT VT = Op.getSimpleValueType();
27890 MVT EltVT = VT.getVectorElementType();
27891 unsigned NumElems = VT.getVectorNumElements();
27892
27893 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27894 "Unsupported element type");
27895
27896 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27897 if (NumElems > 16 ||
27898 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27899 return splitVectorIntUnary(Op, DAG, dl);
27900
27901 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27902 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27903 "Unsupported value type for operation");
27904
27905 // Use native supported vector instruction vplzcntd.
27906 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27907 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27908 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27909 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27910
27911 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27912}
27913
27914// Lower CTLZ using a PSHUFB lookup table implementation.
27915static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27916 const X86Subtarget &Subtarget,
27917 SelectionDAG &DAG) {
27918 MVT VT = Op.getSimpleValueType();
27919 int NumElts = VT.getVectorNumElements();
27920 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27921 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27922
27923 // Per-nibble leading zero PSHUFB lookup table.
27924 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27925 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27926 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27927 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27928
27929 SmallVector<SDValue, 64> LUTVec;
27930 for (int i = 0; i < NumBytes; ++i)
27931 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27932 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27933
27934 // Begin by bitcasting the input to a byte vector, then split those bytes
27935 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27936 // If the hi input nibble is zero then we add both results together, otherwise
27937 // we just take the hi result (by masking the lo result to zero before the
27938 // add).
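// Worked example for one byte (illustrative, not from the original source):
//   x = 0x07: hi nibble = 0 -> HiZ is all-ones, so result = LUT[0] + LUT[7]
//             = 4 + 1 = 5 = ctlz(0x07).
//   x = 0x1c: hi nibble = 1 -> HiZ is zero, the lo result is masked away and
//             result = LUT[1] = 3 = ctlz(0x1c).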
27939 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27940 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27941
27942 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27943 SDValue Lo = Op0;
27944 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27945 SDValue HiZ;
27946 if (CurrVT.is512BitVector()) {
27947 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27948 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27949 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27950 } else {
27951 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27952 }
27953
27954 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27955 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27956 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27957 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27958
27959 // Merge the result from vXi8 back to VT, working on the lo/hi halves
27960 // of the current vector width in the same way we did for the nibbles.
27961 // If the upper half of the input element is zero then add the halves'
27962 // leading zero counts together, otherwise just use the upper half's.
27963 // Double the width of the result until we are at target width.
27964 while (CurrVT != VT) {
27965 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27966 int CurrNumElts = CurrVT.getVectorNumElements();
27967 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27968 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27969 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27970
27971 // Check if the upper half of the input element is zero.
27972 if (CurrVT.is512BitVector()) {
27973 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27974 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27975 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27976 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27977 } else {
27978 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27979 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27980 }
27981 HiZ = DAG.getBitcast(NextVT, HiZ);
27982
27983 // Move the upper/lower halves to the lower bits as we'll be extending to
27984 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27985 // together.
27986 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27987 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27988 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27989 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27990 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27991 CurrVT = NextVT;
27992 }
27993
27994 return Res;
27995}
27996
27997static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27998 const X86Subtarget &Subtarget,
27999 SelectionDAG &DAG) {
28000 MVT VT = Op.getSimpleValueType();
28001
28002 if (Subtarget.hasCDI() &&
28003 // vXi8 vectors need to be promoted to 512-bits for vXi32.
28004 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
28005 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
28006
28007 // Decompose 256-bit ops into smaller 128-bit ops.
28008 if (VT.is256BitVector() && !Subtarget.hasInt256())
28009 return splitVectorIntUnary(Op, DAG, DL);
28010
28011 // Decompose 512-bit ops into smaller 256-bit ops.
28012 if (VT.is512BitVector() && !Subtarget.hasBWI())
28013 return splitVectorIntUnary(Op, DAG, DL);
28014
28015 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28016 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28017}
28018
28019static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28020 SelectionDAG &DAG) {
28021 MVT VT = Op.getSimpleValueType();
28022 MVT OpVT = VT;
28023 unsigned NumBits = VT.getSizeInBits();
28024 SDLoc dl(Op);
28025 unsigned Opc = Op.getOpcode();
28026
28027 if (VT.isVector())
28028 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28029
28030 Op = Op.getOperand(0);
28031 if (VT == MVT::i8) {
28032 // Zero extend to i32 since there is no i8 bsr.
28033 OpVT = MVT::i32;
28034 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28035 }
28036
28037 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28038 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28039 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28040
28041 if (Opc == ISD::CTLZ) {
28042 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28043 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28044 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28045 Op.getValue(1)};
28046 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28047 }
28048
28049 // Finally xor with NumBits-1.
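// Illustrative note (not from the original source): for the power-of-two
// widths handled here, XOR with NumBits-1 equals (NumBits - 1) - bsr(x), i.e.
// the leading-zero count, and when the CMOV above selected 2*NumBits-1 for a
// zero input the XOR yields exactly NumBits.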
28050 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28051 DAG.getConstant(NumBits - 1, dl, OpVT));
28052
28053 if (VT == MVT::i8)
28054 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28055 return Op;
28056}
28057
28058static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28059 SelectionDAG &DAG) {
28060 MVT VT = Op.getSimpleValueType();
28061 unsigned NumBits = VT.getScalarSizeInBits();
28062 SDValue N0 = Op.getOperand(0);
28063 SDLoc dl(Op);
28064
28065 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28066 "Only scalar CTTZ requires custom lowering");
28067
28068 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28069 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28070 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28071
28072 // If src is known never zero we can skip the CMOV.
28073 if (DAG.isKnownNeverZero(N0))
28074 return Op;
28075
28076 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28077 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28078 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28079 Op.getValue(1)};
28080 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28081}
28082
28083static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28084 const X86Subtarget &Subtarget) {
28085 MVT VT = Op.getSimpleValueType();
28086 SDLoc DL(Op);
28087
28088 if (VT == MVT::i16 || VT == MVT::i32)
28089 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28090
28091 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28092 return splitVectorIntBinary(Op, DAG, DL);
28093
28094 assert(Op.getSimpleValueType().is256BitVector() &&
28095 Op.getSimpleValueType().isInteger() &&
28096 "Only handle AVX 256-bit vector integer operation");
28097 return splitVectorIntBinary(Op, DAG, DL);
28098}
28099
28100static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28101 const X86Subtarget &Subtarget) {
28102 MVT VT = Op.getSimpleValueType();
28103 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28104 unsigned Opcode = Op.getOpcode();
28105 SDLoc DL(Op);
28106
28107 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28108 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28109 assert(Op.getSimpleValueType().isInteger() &&
28110 "Only handle AVX vector integer operation");
28111 return splitVectorIntBinary(Op, DAG, DL);
28112 }
28113
28114 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28115 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28116 EVT SetCCResultType =
28117 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28118
28119 unsigned BitWidth = VT.getScalarSizeInBits();
28120 if (Opcode == ISD::USUBSAT) {
28121 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28122 // Handle a special-case with a bit-hack instead of cmp+select:
28123 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28124 // If the target can use VPTERNLOG, DAGToDAG will match this as
28125 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28126 // "broadcast" constant load.
28127 ConstantSDNode *C = isConstOrConstSplat(Y, true);
28128 if (C && C->getAPIntValue().isSignMask()) {
28129 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28130 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28131 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28132 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28133 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28134 }
28135 }
28136 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28137 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28138 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28139 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28140 // TODO: Move this to DAGCombiner?
28141 if (SetCCResultType == VT &&
28142 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28143 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28144 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28145 }
28146 }
28147
28148 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28149 (!VT.isVector() || VT == MVT::v2i64)) {
28150 APInt MinVal = APInt::getSignedMinValue(BitWidth);
28151 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28152 SDValue Zero = DAG.getConstant(0, DL, VT);
28153 SDValue Result =
28154 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28155 DAG.getVTList(VT, SetCCResultType), X, Y);
28156 SDValue SumDiff = Result.getValue(0);
28157 SDValue Overflow = Result.getValue(1);
28158 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28159 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28160 SDValue SumNeg =
28161 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28162 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28163 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28164 }
28165
28166 // Use default expansion.
28167 return SDValue();
28168}
28169
28170static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28171 SelectionDAG &DAG) {
28172 MVT VT = Op.getSimpleValueType();
28173 SDLoc DL(Op);
28174
28175 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28176 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28177 // 8-bit integer abs to NEG and CMOV.
28178 SDValue N0 = Op.getOperand(0);
28179 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28180 DAG.getConstant(0, DL, VT), N0);
28181 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28182 SDValue(Neg.getNode(), 1)};
28183 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28184 }
28185
28186 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28187 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28188 SDValue Src = Op.getOperand(0);
28189 SDValue Neg = DAG.getNegative(Src, DL, VT);
28190 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28191 }
28192
28193 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28194 assert(VT.isInteger() &&
28195 "Only handle AVX 256-bit vector integer operation");
28196 return splitVectorIntUnary(Op, DAG, DL);
28197 }
28198
28199 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28200 return splitVectorIntUnary(Op, DAG, DL);
28201
28202 // Default to expand.
28203 return SDValue();
28204}
28205
28206static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28207 SelectionDAG &DAG) {
28208 MVT VT = Op.getSimpleValueType();
28209 SDLoc DL(Op);
28210
28211 // For AVX1 cases, split to use legal ops.
28212 if (VT.is256BitVector() && !Subtarget.hasInt256())
28213 return splitVectorIntBinary(Op, DAG, DL);
28214
28215 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28216 return splitVectorIntBinary(Op, DAG, DL);
28217
28218 // Default to expand.
28219 return SDValue();
28220}
28221
28222static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28223 SelectionDAG &DAG) {
28224 MVT VT = Op.getSimpleValueType();
28225 SDLoc DL(Op);
28226
28227 // For AVX1 cases, split to use legal ops.
28228 if (VT.is256BitVector() && !Subtarget.hasInt256())
28229 return splitVectorIntBinary(Op, DAG, DL);
28230
28231 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28232 return splitVectorIntBinary(Op, DAG, DL);
28233
28234 // Default to expand.
28235 return SDValue();
28236}
28237
28238static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28239 SelectionDAG &DAG) {
28240 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28241 "Expected FMAXIMUM or FMINIMUM opcode");
28242 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28243 EVT VT = Op.getValueType();
28244 SDValue X = Op.getOperand(0);
28245 SDValue Y = Op.getOperand(1);
28246 SDLoc DL(Op);
28247 uint64_t SizeInBits = VT.getScalarSizeInBits();
28248 APInt PreferredZero = APInt::getZero(SizeInBits);
28249 APInt OppositeZero = PreferredZero;
28250 EVT IVT = VT.changeTypeToInteger();
28251 X86ISD::NodeType MinMaxOp;
28252 if (Op.getOpcode() == ISD::FMAXIMUM) {
28253 MinMaxOp = X86ISD::FMAX;
28254 OppositeZero.setSignBit();
28255 } else {
28256 PreferredZero.setSignBit();
28257 MinMaxOp = X86ISD::FMIN;
28258 }
28259 EVT SetCCType =
28260 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28261
28262 // The tables below show the expected result of Max in cases of NaN and
28263 // signed zeros.
28264 //
28265 // Y Y
28266 // Num xNaN +0 -0
28267 // --------------- ---------------
28268 // Num | Max | Y | +0 | +0 | +0 |
28269 // X --------------- X ---------------
28270 // xNaN | X | X/Y | -0 | +0 | -0 |
28271 // --------------- ---------------
28272 //
28273 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28274 // reordering.
28275 //
28276 // We check if any of operands is NaN and return NaN. Then we check if any of
28277 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28278 // to ensure the correct zero is returned.
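// Illustrative note (not from the original source): x86 MAX*/MIN* return
// their second source operand when the operands compare unordered (a NaN is
// present) or equal (e.g. +0 vs -0), which is why the operand order is chosen
// carefully before emitting FMAX/FMIN below.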
28279 auto MatchesZero = [](SDValue Op, APInt Zero) {
28280 Op = peekThroughBitcasts(Op);
28281 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28282 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28283 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28284 return CstOp->getAPIntValue() == Zero;
28285 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28286 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28287 for (const SDValue &OpVal : Op->op_values()) {
28288 if (OpVal.isUndef())
28289 continue;
28290 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28291 if (!CstOp)
28292 return false;
28293 if (!CstOp->getValueAPF().isZero())
28294 continue;
28295 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28296 return false;
28297 }
28298 return true;
28299 }
28300 return false;
28301 };
28302
28303 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28304 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28305 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28306 Op->getFlags().hasNoSignedZeros() ||
28307 DAG.isKnownNeverZeroFloat(X) ||
28308 DAG.isKnownNeverZeroFloat(Y);
28309 SDValue NewX, NewY;
28310 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28311 MatchesZero(X, OppositeZero)) {
28312 // Operands are already in right order or order does not matter.
28313 NewX = X;
28314 NewY = Y;
28315 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28316 NewX = Y;
28317 NewY = X;
28318 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28319 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28320 if (IsXNeverNaN)
28321 std::swap(X, Y);
28322 // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28323 // to an xmm register.
28324 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28325 SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28326 // Bits of classes:
28327 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28328 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28329 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28330 DL, MVT::i32);
28331 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28332 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28333 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28334 DAG.getIntPtrConstant(0, DL));
28335 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28336 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28337 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28338 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28339 } else {
28340 SDValue IsXSigned;
28341 if (Subtarget.is64Bit() || VT != MVT::f64) {
28342 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28343 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28344 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28345 } else {
28346 assert(VT == MVT::f64);
28347 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28348 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28349 DAG.getIntPtrConstant(0, DL));
28350 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28351 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28352 DAG.getIntPtrConstant(1, DL));
28353 Hi = DAG.getBitcast(MVT::i32, Hi);
28354 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28355 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28356 *DAG.getContext(), MVT::i32);
28357 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28358 }
28359 if (MinMaxOp == X86ISD::FMAX) {
28360 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28361 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28362 } else {
28363 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28364 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28365 }
28366 }
28367
28368 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28369 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28370
28371 // If we did not need to order the operands for signed-zero handling, we
28372 // still need to handle NaN, and the second operand is known never to be NaN,
28373 // then move it into the first operand so we do not need to post-process NaN
28374 // after the max/min.
28374 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28375 std::swap(NewX, NewY);
28376
28377 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28378
28379 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28380 return MinMax;
28381
28382 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28383 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28384}
28385
28386static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28387 SelectionDAG &DAG) {
28388 MVT VT = Op.getSimpleValueType();
28389 SDLoc dl(Op);
28390
28391 // For AVX1 cases, split to use legal ops.
28392 if (VT.is256BitVector() && !Subtarget.hasInt256())
28393 return splitVectorIntBinary(Op, DAG, dl);
28394
28395 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28396 return splitVectorIntBinary(Op, DAG, dl);
28397
28398 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28399 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28400
28401 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28402 if (VT.isScalarInteger()) {
28403 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28404 MVT WideVT = MVT::getIntegerVT(WideBits);
28405 if (TLI.isTypeLegal(WideVT)) {
28406 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28407 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28408 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28409 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28410 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28411 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28412 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28413 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28414 }
28415 }
28416
28417 // TODO: Move to TargetLowering expandABD().
28418 if (!Subtarget.hasSSE41() &&
28419 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28420 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28421 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28422 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
28423 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28424 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28425 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28426 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28427 }
28428
28429 // Default to expand.
28430 return SDValue();
28431}
28432
28433static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28434 SelectionDAG &DAG) {
28435 SDLoc dl(Op);
28436 MVT VT = Op.getSimpleValueType();
28437
28438 // Decompose 256-bit ops into 128-bit ops.
28439 if (VT.is256BitVector() && !Subtarget.hasInt256())
28440 return splitVectorIntBinary(Op, DAG, dl);
28441
28442 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28443 return splitVectorIntBinary(Op, DAG, dl);
28444
28445 SDValue A = Op.getOperand(0);
28446 SDValue B = Op.getOperand(1);
28447
28448 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28449 // vector pairs, multiply and truncate.
28450 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28451 unsigned NumElts = VT.getVectorNumElements();
28452
28453 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28454 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28455 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28456 return DAG.getNode(
28457 ISD::TRUNCATE, dl, VT,
28458 DAG.getNode(ISD::MUL, dl, ExVT,
28459 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28460 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28461 }
28462
28463 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28464
28465 // Extract the lo/hi parts to any extend to i16.
28466 // We're going to mask off the low byte of each result element of the
28467 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28468 // element.
28469 SDValue Undef = DAG.getUNDEF(VT);
28470 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28471 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28472
28473 SDValue BLo, BHi;
28474 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28475 // If the RHS is a constant, manually unpackl/unpackh.
28476 SmallVector<SDValue, 16> LoOps, HiOps;
28477 for (unsigned i = 0; i != NumElts; i += 16) {
28478 for (unsigned j = 0; j != 8; ++j) {
28479 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28480 MVT::i16));
28481 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28482 MVT::i16));
28483 }
28484 }
28485
28486 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28487 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28488 } else {
28489 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28490 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28491 }
28492
28493 // Multiply, mask the lower 8bits of the lo/hi results and pack.
28494 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28495 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28496 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28497 }
28498
28499 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28500 if (VT == MVT::v4i32) {
28501 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28502 "Should not custom lower when pmulld is available!");
28503
28504 // Extract the odd parts.
28505 static const int UnpackMask[] = { 1, -1, 3, -1 };
28506 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28507 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28508
28509 // Multiply the even parts.
28510 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28511 DAG.getBitcast(MVT::v2i64, A),
28512 DAG.getBitcast(MVT::v2i64, B));
28513 // Now multiply odd parts.
28514 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28515 DAG.getBitcast(MVT::v2i64, Aodds),
28516 DAG.getBitcast(MVT::v2i64, Bodds));
28517
28518 Evens = DAG.getBitcast(VT, Evens);
28519 Odds = DAG.getBitcast(VT, Odds);
28520
28521 // Merge the two vectors back together with a shuffle. This expands into 2
28522 // shuffles.
28523 static const int ShufMask[] = { 0, 4, 2, 6 };
28524 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28525 }
28526
28527 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28528 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28529 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28530
28531 // Ahi = psrlqi(a, 32);
28532 // Bhi = psrlqi(b, 32);
28533 //
28534 // AloBlo = pmuludq(a, b);
28535 // AloBhi = pmuludq(a, Bhi);
28536 // AhiBlo = pmuludq(Ahi, b);
28537 //
28538 // Hi = psllqi(AloBhi + AhiBlo, 32);
28539 // return AloBlo + Hi;
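// This follows from writing a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo, so
// a*b mod 2^64 = AloBlo + ((AloBhi + AhiBlo) << 32); the Ahi*Bhi term is
// shifted out entirely (illustrative restatement, not from the original
// source).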
28540 KnownBits AKnown = DAG.computeKnownBits(A);
28541 KnownBits BKnown = DAG.computeKnownBits(B);
28542
28543 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28544 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28545 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28546
28547 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28548 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28549 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28550
28551 SDValue Zero = DAG.getConstant(0, dl, VT);
28552
28553 // Only multiply lo/hi halves that aren't known to be zero.
28554 SDValue AloBlo = Zero;
28555 if (!ALoIsZero && !BLoIsZero)
28556 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28557
28558 SDValue AloBhi = Zero;
28559 if (!ALoIsZero && !BHiIsZero) {
28560 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28561 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28562 }
28563
28564 SDValue AhiBlo = Zero;
28565 if (!AHiIsZero && !BLoIsZero) {
28566 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28567 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28568 }
28569
28570 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28571 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28572
28573 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28574}
28575
28576static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28577 MVT VT, bool IsSigned,
28578 const X86Subtarget &Subtarget,
28579 SelectionDAG &DAG,
28580 SDValue *Low = nullptr) {
28581 unsigned NumElts = VT.getVectorNumElements();
28582
28583 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
28584 // to a vXi16 type. Do the multiplies, shift the results and pack the half
28585 // lane results back together.
28586
28587 // We'll take different approaches for signed and unsigned.
28588 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
28589 // and use pmullw to calculate the full 16-bit product.
28590 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28591 // shift them left into the upper byte of each word. This allows us to use
28592 // pmulhw to calculate the full 16-bit product. This trick means we don't
28593 // need to sign extend the bytes to use pmullw.
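// Sketch of why the signed trick works (illustrative, not from the original
// source): with inputs (a << 8) and (b << 8) the full 32-bit signed product is
// (a * b) << 16, so pmulhw, which returns the high 16 bits of that product,
// yields exactly the 16-bit value a * b.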
28594
28595 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28596 SDValue Zero = DAG.getConstant(0, dl, VT);
28597
28598 SDValue ALo, AHi;
28599 if (IsSigned) {
28600 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28601 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28602 } else {
28603 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28604 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28605 }
28606
28607 SDValue BLo, BHi;
28608 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28609 // If the RHS is a constant, manually unpackl/unpackh and extend.
28610 SmallVector<SDValue, 16> LoOps, HiOps;
28611 for (unsigned i = 0; i != NumElts; i += 16) {
28612 for (unsigned j = 0; j != 8; ++j) {
28613 SDValue LoOp = B.getOperand(i + j);
28614 SDValue HiOp = B.getOperand(i + j + 8);
28615
28616 if (IsSigned) {
28617 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28618 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28619 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28620 DAG.getConstant(8, dl, MVT::i16));
28621 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28622 DAG.getConstant(8, dl, MVT::i16));
28623 } else {
28624 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28625 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28626 }
28627
28628 LoOps.push_back(LoOp);
28629 HiOps.push_back(HiOp);
28630 }
28631 }
28632
28633 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28634 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28635 } else if (IsSigned) {
28636 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28637 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28638 } else {
28639 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28640 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28641 }
28642
28643 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
28644 // pack back to vXi8.
28645 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28646 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28647 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28648
28649 if (Low)
28650 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28651
28652 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28653}
28654
28655static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28656 SelectionDAG &DAG) {
28657 SDLoc dl(Op);
28658 MVT VT = Op.getSimpleValueType();
28659 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28660 unsigned NumElts = VT.getVectorNumElements();
28661 SDValue A = Op.getOperand(0);
28662 SDValue B = Op.getOperand(1);
28663
28664 // Decompose 256-bit ops into 128-bit ops.
28665 if (VT.is256BitVector() && !Subtarget.hasInt256())
28666 return splitVectorIntBinary(Op, DAG, dl);
28667
28668 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28669 return splitVectorIntBinary(Op, DAG, dl);
28670
28671 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28672 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28673 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28674 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28675
28676 // PMULxD operations multiply each even value (starting at 0) of LHS with
28677 // the related value of RHS and produce a widened result.
28678 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28679 // => <2 x i64> <ae|cg>
28680 //
28681 // In other words, to have all the results, we need to perform two PMULxD:
28682 // 1. one with the even values.
28683 // 2. one with the odd values.
28684 // To achieve #2, we need to place the odd values at an even position.
28685 //
28686 // Place the odd value at an even position (basically, shift all values 1
28687 // step to the left):
28688 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28689 9, -1, 11, -1, 13, -1, 15, -1};
28690 // <a|b|c|d> => <b|undef|d|undef>
28691 SDValue Odd0 =
28692 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28693 // <e|f|g|h> => <f|undef|h|undef>
28694 SDValue Odd1 =
28695 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28696
28697 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28698 // ints.
28699 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28700 unsigned Opcode =
28701 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28702 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28703 // => <2 x i64> <ae|cg>
28704 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28705 DAG.getBitcast(MulVT, A),
28706 DAG.getBitcast(MulVT, B)));
28707 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28708 // => <2 x i64> <bf|dh>
28709 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28710 DAG.getBitcast(MulVT, Odd0),
28711 DAG.getBitcast(MulVT, Odd1)));
28712
28713 // Shuffle it back into the right order.
28714 SmallVector<int, 16> ShufMask(NumElts);
28715 for (int i = 0; i != (int)NumElts; ++i)
28716 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28717
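// For v4i32 (NumElts = 4) the mask above is {1, 5, 3, 7}: elements 1 and 3 of
// Mul1 are the high halves of a*e and c*g, and elements 5 and 7 (1 and 3 of
// Mul2) are the high halves of b*f and d*h (illustrative, not from the
// original source).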
28718 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28719
28720 // If we have a signed multiply but no PMULDQ fix up the result of an
28721 // unsigned multiply.
28722 if (IsSigned && !Subtarget.hasSSE41()) {
28723 SDValue Zero = DAG.getConstant(0, dl, VT);
28724 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28725 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28726 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28727 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28728
28729 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28730 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28731 }
28732
28733 return Res;
28734 }
28735
28736 // Only i8 vectors should need custom lowering after this.
28737 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28738 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28739 "Unsupported vector type");
28740
28741 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28742 // logical shift down the upper half and pack back to i8.
28743
28744 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28745 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28746
28747 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28748 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28749 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28750 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28751 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28752 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28753 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28754 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28755 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28756 }
28757
28758 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28759}
28760
28761// Custom lowering for SMULO/UMULO.
28762static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28763 SelectionDAG &DAG) {
28764 MVT VT = Op.getSimpleValueType();
28765
28766 // Scalars defer to LowerXALUO.
28767 if (!VT.isVector())
28768 return LowerXALUO(Op, DAG);
28769
28770 SDLoc dl(Op);
28771 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28772 SDValue A = Op.getOperand(0);
28773 SDValue B = Op.getOperand(1);
28774 EVT OvfVT = Op->getValueType(1);
28775
28776 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28777 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28778 // Extract the LHS Lo/Hi vectors
28779 SDValue LHSLo, LHSHi;
28780 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28781
28782 // Extract the RHS Lo/Hi vectors
28783 SDValue RHSLo, RHSHi;
28784 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28785
28786 EVT LoOvfVT, HiOvfVT;
28787 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28788 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28789 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28790
28791 // Issue the split operations.
28792 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28793 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28794
28795 // Join the separate data results and the overflow results.
28796 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28797 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28798 Hi.getValue(1));
28799
28800 return DAG.getMergeValues({Res, Ovf}, dl);
28801 }
28802
28803 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28804 EVT SetccVT =
28805 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28806
28807 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28808 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28809 unsigned NumElts = VT.getVectorNumElements();
28810 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28811 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28812 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28813 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28814 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28815
28816 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28817
28818 SDValue Ovf;
28819 if (IsSigned) {
28820 SDValue High, LowSign;
28821 if (OvfVT.getVectorElementType() == MVT::i1 &&
28822 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28823 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28824 // Shift the high down filling with sign bits.
28825 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28826 // Fill all 16 bits with the sign bit from the low.
28827 LowSign =
28828 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28829 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28830 15, DAG);
28831 SetccVT = OvfVT;
28832 if (!Subtarget.hasBWI()) {
28833 // We can't do a vXi16 compare so sign extend to v16i32.
28834 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28835 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28836 }
28837 } else {
28838 // Otherwise do the compare at vXi8.
28839 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28840 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28841 LowSign =
28842 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28843 }
28844
28845 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28846 } else {
28847 SDValue High =
28848 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28849 if (OvfVT.getVectorElementType() == MVT::i1 &&
28850 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28851 // Rather than truncating, try to do the compare on vXi16 or vXi32.
28852 SetccVT = OvfVT;
28853 if (!Subtarget.hasBWI()) {
28854 // We can't do a vXi16 compare so sign extend to v16i32.
28855 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28856 }
28857 } else {
28858 // Otherwise do the compare at vXi8.
28859 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28860 }
28861
28862 Ovf =
28863 DAG.getSetCC(dl, SetccVT, High,
28864 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28865 }
28866
28867 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28868
28869 return DAG.getMergeValues({Low, Ovf}, dl);
28870 }
28871
28872 SDValue Low;
28873 SDValue High =
28874 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28875
28876 SDValue Ovf;
28877 if (IsSigned) {
28878 // SMULO overflows if the high bits don't match the sign of the low.
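// For example (illustrative, not from the original source), for an i8 lane
// 100 * 2 = 200 = 0x00c8 as i16: High = 0x00 but Low s>> 7 = 0xff, so the two
// differ and overflow is reported; 10 * 5 = 0x0032 gives High = 0x00 and
// Low s>> 7 = 0x00, so no overflow.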
28879 SDValue LowSign =
28880 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28881 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28882 } else {
28883 // UMULO overflows if the high bits are non-zero.
28884 Ovf =
28885 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28886 }
28887
28888 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28889
28890 return DAG.getMergeValues({Low, Ovf}, dl);
28891}
28892
28893SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28894 assert(Subtarget.isTargetWin64() && "Unexpected target");
28895 EVT VT = Op.getValueType();
28896 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28897 "Unexpected return type for lowering");
28898
28899 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28900     SmallVector<SDValue> Result;
28901     if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28902 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28903 }
28904
28905 RTLIB::Libcall LC;
28906 bool isSigned;
28907 switch (Op->getOpcode()) {
28908 // clang-format off
28909 default: llvm_unreachable("Unexpected request for libcall!");
28910 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28911 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28912 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28913 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28914 // clang-format on
28915 }
28916
28917 SDLoc dl(Op);
28918 SDValue InChain = DAG.getEntryNode();
28919
28920   TargetLowering::ArgListTy Args;
28921   TargetLowering::ArgListEntry Entry;
28922   for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28923 EVT ArgVT = Op->getOperand(i).getValueType();
28924 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28925 "Unexpected argument type for lowering");
28926 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28927 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28928 MachinePointerInfo MPI =
28929         MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28930     Entry.Node = StackPtr;
28931 InChain =
28932 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28933 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28934     Entry.Ty = PointerType::get(ArgTy, 0);
28935 Entry.IsSExt = false;
28936 Entry.IsZExt = false;
28937 Args.push_back(Entry);
28938 }
28939
28940   SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28941                                          getPointerTy(DAG.getDataLayout()));
28942
28943   TargetLowering::CallLoweringInfo CLI(DAG);
28944 CLI.setDebugLoc(dl)
28945 .setChain(InChain)
28946 .setLibCallee(
28947           getLibcallCallingConv(LC),
28948           static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28949 std::move(Args))
28950 .setInRegister()
28951 .setSExtResult(isSigned)
28952 .setZExtResult(!isSigned);
28953
28954 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28955 return DAG.getBitcast(VT, CallInfo.first);
28956}
28957
28958SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28959 SelectionDAG &DAG,
28960 SDValue &Chain) const {
28961 assert(Subtarget.isTargetWin64() && "Unexpected target");
28962 EVT VT = Op.getValueType();
28963 bool IsStrict = Op->isStrictFPOpcode();
28964
28965 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28966 EVT ArgVT = Arg.getValueType();
28967
28968 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28969 "Unexpected return type for lowering");
28970
28971 RTLIB::Libcall LC;
28972 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28973 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28974 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28975 else
28976 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28977 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28978
28979 SDLoc dl(Op);
28980 MakeLibCallOptions CallOptions;
28981 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28982
28983   SDValue Result;
28984   // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
28985 // expected VT (i128).
28986 std::tie(Result, Chain) =
28987 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28988 Result = DAG.getBitcast(VT, Result);
28989 return Result;
28990}
28991
28992SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28993 SelectionDAG &DAG) const {
28994 assert(Subtarget.isTargetWin64() && "Unexpected target");
28995 EVT VT = Op.getValueType();
28996 bool IsStrict = Op->isStrictFPOpcode();
28997
28998 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28999 EVT ArgVT = Arg.getValueType();
29000
29001 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
29002 "Unexpected argument type for lowering");
29003
29004 RTLIB::Libcall LC;
29005 if (Op->getOpcode() == ISD::SINT_TO_FP ||
29006 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
29007 LC = RTLIB::getSINTTOFP(ArgVT, VT);
29008 else
29009 LC = RTLIB::getUINTTOFP(ArgVT, VT);
29010 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
29011
29012 SDLoc dl(Op);
29013 MakeLibCallOptions CallOptions;
29014 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29015
29016 // Pass the i128 argument as an indirect argument on the stack.
29017 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29018 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29019 MachinePointerInfo MPI =
29020       MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29021   Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29022
29023   SDValue Result;
29024   std::tie(Result, Chain) =
29025 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29026 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29027}
29028
29029// Generate a GFNI gf2p8affine bitmask for vXi8 bitreverse/shift/rotate.
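// A minimal sketch of the encoding (as implied by the formulas below): each
// returned value is the 64-bit 8x8 bit-matrix operand for GF2P8AFFINEQB. A
// shift amount of 0 makes the SHL/SRL cases degenerate to 0x0102040810204080,
// i.e. the identity transform, while ISD::BITREVERSE uses the mirrored
// constant 0x8040201008040201. For example, getGFNICtrlImm(ISD::SHL, 1)
// evaluates to (0x0102040810204080 >> 1) & 0x7F7F7F7F7F7F7F7F =
// 0x0001020408102040.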
29030uint64_t getGFNICtrlImm(unsigned Opcode, unsigned Amt = 0) {
29031 assert((Amt < 8) && "Shift/Rotation amount out of range");
29032 switch (Opcode) {
29033 case ISD::BITREVERSE:
29034 return 0x8040201008040201ULL;
29035 case ISD::SHL:
29036 return ((0x0102040810204080ULL >> (Amt)) &
29037 (0x0101010101010101ULL * (0xFF >> (Amt))));
29038 case ISD::SRL:
29039 return ((0x0102040810204080ULL << (Amt)) &
29040 (0x0101010101010101ULL * ((0xFF << (Amt)) & 0xFF)));
29041 case ISD::SRA:
29042 return (getGFNICtrlImm(ISD::SRL, Amt) |
29043 (0x8080808080808080ULL >> (64 - (8 * Amt))));
29044 case ISD::ROTL:
29045 return getGFNICtrlImm(ISD::SRL, 8 - Amt) | getGFNICtrlImm(ISD::SHL, Amt);
29046 case ISD::ROTR:
29047 return getGFNICtrlImm(ISD::SHL, 8 - Amt) | getGFNICtrlImm(ISD::SRL, Amt);
29048 }
29049 llvm_unreachable("Unsupported GFNI opcode");
29050}
29051
29052// Return true if the required (according to Opcode) shift-imm form is natively
29053// supported by the Subtarget
29054static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29055 unsigned Opcode) {
29056 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29057 "Unexpected shift opcode");
29058
29059 if (!VT.isSimple())
29060 return false;
29061
29062 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29063 return false;
29064
29065 if (VT.getScalarSizeInBits() < 16)
29066 return false;
29067
29068 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29069 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29070 return true;
29071
29072 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29073 (VT.is256BitVector() && Subtarget.hasInt256());
29074
29075 bool AShift = LShift && (Subtarget.hasAVX512() ||
29076 (VT != MVT::v2i64 && VT != MVT::v4i64));
29077 return (Opcode == ISD::SRA) ? AShift : LShift;
29078}
29079
29080// The shift amount is a variable, but it is the same for all vector lanes.
29081// These instructions are defined together with shift-immediate.
29082static
29083 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29084                                       unsigned Opcode) {
29085 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29086}
29087
29088// Return true if the required (according to Opcode) variable-shift form is
29089// natively supported by the Subtarget
29090static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29091 unsigned Opcode) {
29092 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29093 "Unexpected shift opcode");
29094
29095 if (!VT.isSimple())
29096 return false;
29097
29098 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29099 return false;
29100
29101 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29102 return false;
29103
29104 // vXi16 supported only on AVX-512, BWI
29105 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29106 return false;
29107
29108 if (Subtarget.hasAVX512() &&
29109 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29110 return true;
29111
29112 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29113 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29114 return (Opcode == ISD::SRA) ? AShift : LShift;
29115}
29116
29117 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29118                                            const X86Subtarget &Subtarget) {
29119 MVT VT = Op.getSimpleValueType();
29120 SDLoc dl(Op);
29121 SDValue R = Op.getOperand(0);
29122 SDValue Amt = Op.getOperand(1);
29123 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29124 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29125
29126 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29127 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29128 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29129 SDValue Ex = DAG.getBitcast(ExVT, R);
29130
29131 // ashr(R, 63) === cmp_slt(R, 0)
29132 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29133 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29134 "Unsupported PCMPGT op");
29135 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29136 }
29137
29138 if (ShiftAmt >= 32) {
29139 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
29140 SDValue Upper =
29141 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29142       SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29143                                                  ShiftAmt - 32, DAG);
29144 if (VT == MVT::v2i64)
29145 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29146 if (VT == MVT::v4i64)
29147 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29148 {9, 1, 11, 3, 13, 5, 15, 7});
29149 } else {
29150 // SRA upper i32, SRL whole i64 and select lower i32.
29151       SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29152                                                  ShiftAmt, DAG);
29153 SDValue Lower =
29154 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29155 Lower = DAG.getBitcast(ExVT, Lower);
29156 if (VT == MVT::v2i64)
29157 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29158 if (VT == MVT::v4i64)
29159 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29160 {8, 1, 10, 3, 12, 5, 14, 7});
29161 }
29162 return DAG.getBitcast(VT, Ex);
29163 };
29164
29165 // Optimize shl/srl/sra with constant shift amount.
29166 APInt APIntShiftAmt;
29167 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29168 return SDValue();
29169
29170 // If the shift amount is out of range, return undef.
29171 if (APIntShiftAmt.uge(EltSizeInBits))
29172 return DAG.getUNDEF(VT);
29173
29174 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29175
29176 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29177     // Hardware support for vector shifts is sparse, which makes us scalarize the
29178 // vector operations in many cases. Also, on sandybridge ADD is faster than
29179 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29180 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29181 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29182 // must be 0). (add undef, undef) however can be any value. To make this
29183 // safe, we must freeze R to ensure that register allocation uses the same
29184 // register for an undefined value. This ensures that the result will
29185 // still be even and preserves the original semantics.
29186 R = DAG.getFreeze(R);
29187 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29188 }
29189
29190 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29191 }
29192
29193 // i64 SRA needs to be performed as partial shifts.
29194 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29195 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29196 Op.getOpcode() == ISD::SRA)
29197 return ArithmeticShiftRight64(ShiftAmt);
29198
29199 // If we're logical shifting an all-signbits value then we can just perform as
29200 // a mask.
29201 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29202 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29203 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29204 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29205 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29206 }
29207
29208 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29209 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29210 unsigned NumElts = VT.getVectorNumElements();
29211 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29212
29213 // Simple i8 add case
29214 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29215 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29216 // must be 0). (add undef, undef) however can be any value. To make this
29217 // safe, we must freeze R to ensure that register allocation uses the same
29218 // register for an undefined value. This ensures that the result will
29219 // still be even and preserves the original semantics.
29220 R = DAG.getFreeze(R);
29221 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29222 }
29223
29224 // ashr(R, 7) === cmp_slt(R, 0)
29225 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29226 SDValue Zeros = DAG.getConstant(0, dl, VT);
29227 if (VT.is512BitVector()) {
29228 assert(VT == MVT::v64i8 && "Unexpected element type!");
29229 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29230 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29231 }
29232 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29233 }
29234
29235 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29236 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29237 return SDValue();
29238
29239 if (Subtarget.hasGFNI()) {
29240 uint64_t ShiftMask = getGFNICtrlImm(Op.getOpcode(), ShiftAmt);
29241 MVT MaskVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
29242 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(ShiftMask, dl, MaskVT));
29243 return DAG.getNode(X86ISD::GF2P8AFFINEQB, dl, VT, R, Mask,
29244 DAG.getTargetConstant(0, dl, MVT::i8));
29245 }
29246
29247 if (Op.getOpcode() == ISD::SHL) {
29248 // Make a large shift.
29249 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29250 ShiftAmt, DAG);
29251 SHL = DAG.getBitcast(VT, SHL);
29252 // Zero out the rightmost bits.
29253 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29254 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29255 }
29256 if (Op.getOpcode() == ISD::SRL) {
29257 // Make a large shift.
29258 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29259 ShiftAmt, DAG);
29260 SRL = DAG.getBitcast(VT, SRL);
29261 // Zero out the leftmost bits.
29262 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29263 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29264 }
29265 if (Op.getOpcode() == ISD::SRA) {
29266 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
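      // For example, with ShiftAmt = 4 and a lane value R = 0xF0 (-16):
      // lshr gives 0x0F, Mask = 128 >> 4 = 0x08, the xor yields 0x07 and the
      // subtract produces 0xFF = -1, which matches ashr(-16, 4).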
29267 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29268
29269 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29270 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29271 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29272 return Res;
29273 }
29274 llvm_unreachable("Unknown shift opcode.");
29275 }
29276
29277 return SDValue();
29278}
29279
29280 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29281                                           const X86Subtarget &Subtarget) {
29282 MVT VT = Op.getSimpleValueType();
29283 SDLoc dl(Op);
29284 SDValue R = Op.getOperand(0);
29285 SDValue Amt = Op.getOperand(1);
29286 unsigned Opcode = Op.getOpcode();
29287 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29288
29289 int BaseShAmtIdx = -1;
29290 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29291 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29292 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29293 Subtarget, DAG);
29294
29295 // vXi8 shifts - shift as v8i16 + mask result.
29296 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29297 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29298 VT == MVT::v64i8) &&
29299 !Subtarget.hasXOP()) {
29300 unsigned NumElts = VT.getVectorNumElements();
29301 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29302 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29303 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29304 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29305
29306 // Create the mask using vXi16 shifts. For shift-rights we need to move
29307 // the upper byte down before splatting the vXi8 mask.
29308 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29309 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29310 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29311 if (Opcode != ISD::SHL)
29312 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29313 8, DAG);
29314 BitMask = DAG.getBitcast(VT, BitMask);
29315 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29316 SmallVector<int, 64>(NumElts, 0));
29317
29318 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29319 DAG.getBitcast(ExtVT, R), BaseShAmt,
29320 BaseShAmtIdx, Subtarget, DAG);
29321 Res = DAG.getBitcast(VT, Res);
29322 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29323
29324 if (Opcode == ISD::SRA) {
29325 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29326 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29327 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29328 SignMask =
29329 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29330 BaseShAmtIdx, Subtarget, DAG);
29331 SignMask = DAG.getBitcast(VT, SignMask);
29332 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29333 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29334 }
29335 return Res;
29336 }
29337 }
29338 }
29339
29340 return SDValue();
29341}
29342
29343// Convert a shift/rotate left amount to a multiplication scale factor.
29344 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29345                                        const X86Subtarget &Subtarget,
29346 SelectionDAG &DAG) {
29347 MVT VT = Amt.getSimpleValueType();
29348 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29349 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29350 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29351 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29352 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29353 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29354 return SDValue();
29355
29356 MVT SVT = VT.getVectorElementType();
29357 unsigned SVTBits = SVT.getSizeInBits();
29358 unsigned NumElems = VT.getVectorNumElements();
29359
29360 APInt UndefElts;
29361 SmallVector<APInt> EltBits;
29362 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29363 APInt One(SVTBits, 1);
29364 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29365 for (unsigned I = 0; I != NumElems; ++I) {
29366 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29367 continue;
29368 uint64_t ShAmt = EltBits[I].getZExtValue();
29369 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29370 }
29371 return DAG.getBuildVector(VT, dl, Elts);
29372 }
29373
29374 // If the target doesn't support variable shifts, use either FP conversion
29375 // or integer multiplication to avoid shifting each element individually.
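  // For example, a v4i32 shift amount of 5 becomes (5 << 23) + 0x3f800000 =
  // 0x42000000, which is the IEEE-754 encoding of 32.0f, so FP_TO_SINT
  // recovers the scale factor 32 == 1 << 5 for the caller's multiply.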
29376 if (VT == MVT::v4i32) {
29377 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29378 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29379 DAG.getConstant(0x3f800000U, dl, VT));
29380 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29381 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29382 }
29383
29384 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29385 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29386 SDValue Z = DAG.getConstant(0, dl, VT);
29387 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29388 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29389 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29390 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29391 if (Subtarget.hasSSE41())
29392 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29393 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29394 }
29395
29396 return SDValue();
29397}
29398
29399static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29400 SelectionDAG &DAG) {
29401 MVT VT = Op.getSimpleValueType();
29402 SDLoc dl(Op);
29403 SDValue R = Op.getOperand(0);
29404 SDValue Amt = Op.getOperand(1);
29405 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29406 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29407
29408 unsigned Opc = Op.getOpcode();
29409 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29410 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29411
29412 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29413 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29414
29415 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29416 return V;
29417
29418 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29419 return V;
29420
29421 if (supportedVectorVarShift(VT, Subtarget, Opc))
29422 return Op;
29423
29424 // i64 vector arithmetic shift can be emulated with the transform:
29425 // M = lshr(SIGN_MASK, Amt)
29426 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
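  // This is the usual (x ^ m) - m sign-extension idiom: M has a single set
  // bit at the position where lshr left the original sign bit, so the
  // xor/sub pair fills the vacated upper bits with copies of that sign bit.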
29427 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29428 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29429 Opc == ISD::SRA) {
29430 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29431 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29432 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29433 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29434 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29435 return R;
29436 }
29437
29438 // XOP has 128-bit variable logical/arithmetic shifts.
29439 // +ve/-ve Amt = shift left/right.
29440 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29441 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29442 if (Opc == ISD::SRL || Opc == ISD::SRA)
29443 Amt = DAG.getNegative(Amt, dl, VT);
29444 if (Opc == ISD::SHL || Opc == ISD::SRL)
29445 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29446 if (Opc == ISD::SRA)
29447 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29448 }
29449
29450   // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29451 // shifts per-lane and then shuffle the partial results back together.
29452 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29453 // Splat the shift amounts so the scalar shifts above will catch it.
29454 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29455 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29456 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29457 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29458 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29459 }
29460
29461 // If possible, lower this shift as a sequence of two shifts by
29462 // constant plus a BLENDing shuffle instead of scalarizing it.
29463 // Example:
29464 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29465 //
29466 // Could be rewritten as:
29467 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29468 //
29469 // The advantage is that the two shifts from the example would be
29470 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29471 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29472 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29473 SDValue Amt1, Amt2;
29474 unsigned NumElts = VT.getVectorNumElements();
29475 SmallVector<int, 8> ShuffleMask;
29476 for (unsigned i = 0; i != NumElts; ++i) {
29477 SDValue A = Amt->getOperand(i);
29478 if (A.isUndef()) {
29479 ShuffleMask.push_back(SM_SentinelUndef);
29480 continue;
29481 }
29482 if (!Amt1 || Amt1 == A) {
29483 ShuffleMask.push_back(i);
29484 Amt1 = A;
29485 continue;
29486 }
29487 if (!Amt2 || Amt2 == A) {
29488 ShuffleMask.push_back(i + NumElts);
29489 Amt2 = A;
29490 continue;
29491 }
29492 break;
29493 }
29494
29495 // Only perform this blend if we can perform it without loading a mask.
29496 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29497 (VT != MVT::v16i16 ||
29498 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29499 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29500 canWidenShuffleElements(ShuffleMask))) {
29501 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29502 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29503 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29504 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29505 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29506 Cst1->getZExtValue(), DAG);
29507 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29508 Cst2->getZExtValue(), DAG);
29509 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29510 }
29511 }
29512 }
29513
29514 // If possible, lower this packed shift into a vector multiply instead of
29515 // expanding it into a sequence of scalar shifts.
29516 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29517 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29518 Subtarget.canExtendTo512BW())))
29519 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29520 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29521
29522 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29523 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
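  // For example, srl(x, 3) on vXi16 becomes mulhu(x, 1 << 13): with
  // x = 0x8000 the full product is 0x8000 * 0x2000 = 0x10000000, whose high
  // 16 bits are 0x1000 == 0x8000 >> 3. Lanes with a zero shift amount (where
  // the scale 1 << 16 is unrepresentable) are fixed up by the ZAmt select.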
29524 if (Opc == ISD::SRL && ConstantAmt &&
29525 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29526 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29527 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29528 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29529 SDValue Zero = DAG.getConstant(0, dl, VT);
29530 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29531 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29532 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29533 }
29534 }
29535
29536 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29537 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29538 // TODO: Special case handling for shift by 0/1, really we can afford either
29539 // of these cases in pre-SSE41/XOP/AVX512 but not both.
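  // For example, sra(x, 2) on vXi16 becomes mulhs(x, 1 << 14): with
  // x = 0x8000 (-32768) the product is -32768 * 16384 = 0xE0000000, whose
  // high 16 bits are 0xE000 = -8192 == -32768 >> 2. Amounts of 0 and 1 are
  // handled by the selects below because the scales 1 << 16 and 1 << 15 are
  // not representable as positive i16 values.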
29540 if (Opc == ISD::SRA && ConstantAmt &&
29541 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29542 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29543 !Subtarget.hasAVX512()) ||
29544 DAG.isKnownNeverZero(Amt))) {
29545 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29546 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29547 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29548 SDValue Amt0 =
29549 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29550 SDValue Amt1 =
29551 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29552 SDValue Sra1 =
29553 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29554 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29555 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29556 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29557 }
29558 }
29559
29560 // v4i32 Non Uniform Shifts.
29561 // If the shift amount is constant we can shift each lane using the SSE2
29562 // immediate shifts, else we need to zero-extend each lane to the lower i64
29563 // and shift using the SSE2 variable shifts.
29564 // The separate results can then be blended together.
29565 if (VT == MVT::v4i32) {
29566 SDValue Amt0, Amt1, Amt2, Amt3;
29567 if (ConstantAmt) {
29568 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29569 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29570 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29571 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29572 } else {
29573 // The SSE2 shifts use the lower i64 as the same shift amount for
29574 // all lanes and the upper i64 is ignored. On AVX we're better off
29575 // just zero-extending, but for SSE just duplicating the top 16-bits is
29576 // cheaper and has the same effect for out of range values.
29577 if (Subtarget.hasAVX()) {
29578 SDValue Z = DAG.getConstant(0, dl, VT);
29579 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29580 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29581 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29582 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29583 } else {
29584 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29585 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29586 {4, 5, 6, 7, -1, -1, -1, -1});
29587 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29588 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29589 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29590 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29591 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29592 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29593 }
29594 }
29595
29596 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29597 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29598 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29599 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29600 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29601
29602 // Merge the shifted lane results optimally with/without PBLENDW.
29603 // TODO - ideally shuffle combining would handle this.
29604 if (Subtarget.hasSSE41()) {
29605 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29606 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29607 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29608 }
29609 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29610 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29611 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29612 }
29613
29614 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29615 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29616 // make the existing SSE solution better.
29617   // NOTE: We honor the preferred vector width before promoting to 512-bits.
29618 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29619 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29620 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29621 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29622 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29623 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29624 "Unexpected vector type");
29625 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29626 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29627 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29628 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29629 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29630 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29631 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29632 }
29633
29634 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29635 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
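  // For example, srl(x, 3) on a byte x = 200 uses the scale 1 << (8 - 3) =
  // 32: the widened product 200 * 32 = 0x1900 is shifted right by 8 to give
  // 0x19 = 25 == 200 >> 3, and the result is then truncated/packed back to
  // vXi8.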
29636 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29637 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29638 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29639 !Subtarget.hasXOP()) {
29640 int NumElts = VT.getVectorNumElements();
29641 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29642
29643 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29644 // isn't legal).
29645 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29646 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29647 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29648 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29649     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29650            "Constant build vector expected");
29651
29652 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29653 bool IsSigned = Opc == ISD::SRA;
29654 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29655 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29656 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29657 return DAG.getZExtOrTrunc(R, dl, VT);
29658 }
29659
29660 SmallVector<SDValue, 16> LoAmt, HiAmt;
29661 for (int i = 0; i != NumElts; i += 16) {
29662 for (int j = 0; j != 8; ++j) {
29663 LoAmt.push_back(Amt.getOperand(i + j));
29664 HiAmt.push_back(Amt.getOperand(i + j + 8));
29665 }
29666 }
29667
29668 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29669 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29670 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29671
29672 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29673 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29674 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29675 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29676 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29677 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29678 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29679 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29680 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29681 }
29682
29683 if (VT == MVT::v16i8 ||
29684 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29685 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29686 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29687
29688 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29689 if (VT.is512BitVector()) {
29690 // On AVX512BW targets we make use of the fact that VSELECT lowers
29691 // to a masked blend which selects bytes based just on the sign bit
29692 // extracted to a mask.
29693 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29694 V0 = DAG.getBitcast(VT, V0);
29695 V1 = DAG.getBitcast(VT, V1);
29696 Sel = DAG.getBitcast(VT, Sel);
29697 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29698 ISD::SETGT);
29699 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29700 } else if (Subtarget.hasSSE41()) {
29701 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29702 // on the sign bit.
29703 V0 = DAG.getBitcast(VT, V0);
29704 V1 = DAG.getBitcast(VT, V1);
29705 Sel = DAG.getBitcast(VT, Sel);
29706 return DAG.getBitcast(SelVT,
29707 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29708 }
29709 // On pre-SSE41 targets we test for the sign bit by comparing to
29710 // zero - a negative value will set all bits of the lanes to true
29711 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29712 SDValue Z = DAG.getConstant(0, dl, SelVT);
29713 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29714 return DAG.getSelect(dl, SelVT, C, V0, V1);
29715 };
29716
29717 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29718 // We can safely do this using i16 shifts as we're only interested in
29719 // the 3 lower bits of each byte.
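    // Each PBLENDVB/VSELECT stage below consumes one amount bit from the
    // current sign-bit position: shift-by-4 first, then shift-by-2, then
    // shift-by-1, with 'a += a' exposing the next bit. For example, an
    // amount of 5 (0b101) applies the 4-bit and 1-bit stages but skips the
    // 2-bit stage, for a total shift of 5.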
29720 Amt = DAG.getBitcast(ExtVT, Amt);
29721 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29722 Amt = DAG.getBitcast(VT, Amt);
29723
29724 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29725 // r = VSELECT(r, shift(r, 4), a);
29726 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29727 R = SignBitSelect(VT, Amt, M, R);
29728
29729 // a += a
29730 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29731
29732 // r = VSELECT(r, shift(r, 2), a);
29733 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29734 R = SignBitSelect(VT, Amt, M, R);
29735
29736 // a += a
29737 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29738
29739 // return VSELECT(r, shift(r, 1), a);
29740 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29741 R = SignBitSelect(VT, Amt, M, R);
29742 return R;
29743 }
29744
29745 if (Opc == ISD::SRA) {
29746 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29747 // so we can correctly sign extend. We don't care what happens to the
29748 // lower byte.
29749 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29750 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29751 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29752 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29753 ALo = DAG.getBitcast(ExtVT, ALo);
29754 AHi = DAG.getBitcast(ExtVT, AHi);
29755 RLo = DAG.getBitcast(ExtVT, RLo);
29756 RHi = DAG.getBitcast(ExtVT, RHi);
29757
29758 // r = VSELECT(r, shift(r, 4), a);
29759 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29760 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29761 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29762 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29763
29764 // a += a
29765 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29766 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29767
29768 // r = VSELECT(r, shift(r, 2), a);
29769 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29770 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29771 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29772 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29773
29774 // a += a
29775 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29776 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29777
29778 // r = VSELECT(r, shift(r, 1), a);
29779 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29780 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29781 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29782 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29783
29784 // Logical shift the result back to the lower byte, leaving a zero upper
29785 // byte meaning that we can safely pack with PACKUSWB.
29786 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29787 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29788 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29789 }
29790 }
29791
29792 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29793 MVT ExtVT = MVT::v8i32;
29794 SDValue Z = DAG.getConstant(0, dl, VT);
29795 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29796 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29797 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29798 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29799 ALo = DAG.getBitcast(ExtVT, ALo);
29800 AHi = DAG.getBitcast(ExtVT, AHi);
29801 RLo = DAG.getBitcast(ExtVT, RLo);
29802 RHi = DAG.getBitcast(ExtVT, RHi);
29803 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29804 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29805 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29806 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29807 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29808 }
29809
29810 if (VT == MVT::v8i16) {
29811 // If we have a constant shift amount, the non-SSE41 path is best as
29812 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
29813 bool UseSSE41 = Subtarget.hasSSE41() &&
29814                     !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29815
29816 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29817 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29818 // the sign bit.
29819 if (UseSSE41) {
29820 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29821 V0 = DAG.getBitcast(ExtVT, V0);
29822 V1 = DAG.getBitcast(ExtVT, V1);
29823 Sel = DAG.getBitcast(ExtVT, Sel);
29824 return DAG.getBitcast(
29825 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29826 }
29827 // On pre-SSE41 targets we splat the sign bit - a negative value will
29828 // set all bits of the lanes to true and VSELECT uses that in
29829 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29830 SDValue C =
29831 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29832 return DAG.getSelect(dl, VT, C, V0, V1);
29833 };
29834
29835 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29836 if (UseSSE41) {
29837 // On SSE41 targets we need to replicate the shift mask in both
29838 // bytes for PBLENDVB.
29839 Amt = DAG.getNode(
29840 ISD::OR, dl, VT,
29841 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29842 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29843 } else {
29844 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29845 }
29846
29847 // r = VSELECT(r, shift(r, 8), a);
29848 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29849 R = SignBitSelect(Amt, M, R);
29850
29851 // a += a
29852 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29853
29854 // r = VSELECT(r, shift(r, 4), a);
29855 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29856 R = SignBitSelect(Amt, M, R);
29857
29858 // a += a
29859 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29860
29861 // r = VSELECT(r, shift(r, 2), a);
29862 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29863 R = SignBitSelect(Amt, M, R);
29864
29865 // a += a
29866 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29867
29868 // return VSELECT(r, shift(r, 1), a);
29869 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29870 R = SignBitSelect(Amt, M, R);
29871 return R;
29872 }
29873
29874 // Decompose 256-bit shifts into 128-bit shifts.
29875 if (VT.is256BitVector())
29876 return splitVectorIntBinary(Op, DAG, dl);
29877
29878 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29879 return splitVectorIntBinary(Op, DAG, dl);
29880
29881 return SDValue();
29882}
29883
29884 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29885                                 SelectionDAG &DAG) {
29886 MVT VT = Op.getSimpleValueType();
29887 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29888 "Unexpected funnel shift opcode!");
29889
29890 SDLoc DL(Op);
29891 SDValue Op0 = Op.getOperand(0);
29892 SDValue Op1 = Op.getOperand(1);
29893 SDValue Amt = Op.getOperand(2);
29894 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29895 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29896
29897 if (VT.isVector()) {
29898 APInt APIntShiftAmt;
29899 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29900 unsigned NumElts = VT.getVectorNumElements();
29901
29902 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29903 if (IsFSHR)
29904 std::swap(Op0, Op1);
29905
29906 if (IsCstSplat) {
29907 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29908 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29909 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29910 {Op0, Op1, Imm}, DAG, Subtarget);
29911 }
29912 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29913 {Op0, Op1, Amt}, DAG, Subtarget);
29914 }
29915 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29916 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29917 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29918 "Unexpected funnel shift type!");
29919
29920 // fshl(x,y,z) -> unpack(y,x) << (z & (bw-1))) >> bw.
29921 // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1))).
29922 if (IsCstSplat) {
29923 // TODO: Can't use generic expansion as UNDEF amt elements can be
29924 // converted to other values when folded to shift amounts, losing the
29925 // splat.
29926 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29927 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29928 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29929 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29930 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29931
29932 if (EltSizeInBits == 8 &&
29933 (Subtarget.hasXOP() ||
29934 (useVPTERNLOG(Subtarget, VT) &&
29935 supportedVectorShiftWithImm(WideVT, Subtarget, ISD::SHL)))) {
29936 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
29937 // bit-select - lower using vXi16 shifts and then perform the bitmask at
29938 // the original vector width to handle cases where we split.
29939 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
29940 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
29941 SDValue ShX =
29942 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
29943 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
29944 SDValue ShY =
29945 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
29946 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
29947 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
29948 DAG.getConstant(MaskX, DL, VT));
29949 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
29950 DAG.getConstant(MaskY, DL, VT));
29951 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29952 }
29953
29954 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29955 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29956 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29957 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29958 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29959 }
29960
29961 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29962 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29963 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29964
29965 // Constant vXi16 funnel shifts can be efficiently handled by default.
29966 if (IsCst && EltSizeInBits == 16)
29967 return SDValue();
29968
29969 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29970 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29971 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29972
29973 // Split 256-bit integers on XOP/pre-AVX2 targets.
29974 // Split 512-bit integers on non 512-bit BWI targets.
29975 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29976 !Subtarget.hasAVX2())) ||
29977 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29978 EltSizeInBits < 32)) {
29979 // Pre-mask the amount modulo using the wider vector.
29980 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29981 return splitVectorOp(Op, DAG, DL);
29982 }
29983
29984 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29985 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29986 int ScalarAmtIdx = -1;
29987 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29988 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29989 if (EltSizeInBits == 16)
29990 return SDValue();
29991
29992 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29993 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29994 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29995 ScalarAmtIdx, Subtarget, DAG);
29996 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29997 ScalarAmtIdx, Subtarget, DAG);
29998 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29999 }
30000 }
30001
30002 MVT WideSVT = MVT::getIntegerVT(
30003 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
30004 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
30005
30006 // If per-element shifts are legal, fallback to generic expansion.
30007 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
30008 return SDValue();
30009
30010 // Attempt to fold as:
30011 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30012 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
30013 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30014 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30015 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
30016 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
30017 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30018 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
30019 EltSizeInBits, DAG);
30020 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
30021 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
30022 if (!IsFSHR)
30023 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
30024 EltSizeInBits, DAG);
30025 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
30026 }
30027
30028 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
30029 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
30030 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
30031 SDValue Z = DAG.getConstant(0, DL, VT);
30032 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
30033 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
30034 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30035 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30036 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30037 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30038 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
30039 }
30040
30041 // Fallback to generic expansion.
30042 return SDValue();
30043 }
30044 assert(
30045 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
30046 "Unexpected funnel shift type!");
30047
30048 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30049 bool OptForSize = DAG.shouldOptForSize();
30050 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30051
30052 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30053 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
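  // For example, an i8 fshl(0x12, 0x34, 4) builds 0x1234 in i32, shifts it
  // left by (4 & 7) to 0x12340, shifts right by 8 to 0x123 and truncates to
  // 0x23, which is bits [11:4] of the concatenation 0x1234, as expected for
  // FSHL.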
30054 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30055 !isa<ConstantSDNode>(Amt)) {
30056 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30057 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30058 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30059 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30060 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30061 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30062 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30063 if (IsFSHR) {
30064 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30065 } else {
30066 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30067 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30068 }
30069 return DAG.getZExtOrTrunc(Res, DL, VT);
30070 }
30071
30072 if (VT == MVT::i8 || ExpandFunnel)
30073 return SDValue();
30074
30075 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30076 if (VT == MVT::i16) {
30077 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30078 DAG.getConstant(15, DL, Amt.getValueType()));
30079 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30080 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30081 }
30082
30083 return Op;
30084}
30085
30086static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30087 SelectionDAG &DAG) {
30088 MVT VT = Op.getSimpleValueType();
30089 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30090
30091 SDLoc DL(Op);
30092 SDValue R = Op.getOperand(0);
30093 SDValue Amt = Op.getOperand(1);
30094 unsigned Opcode = Op.getOpcode();
30095 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30096 int NumElts = VT.getVectorNumElements();
30097 bool IsROTL = Opcode == ISD::ROTL;
30098
30099 // Check for constant splat rotation amount.
30100 APInt CstSplatValue;
30101 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30102
30103 // Check for splat rotate by zero.
30104 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30105 return R;
30106
30107 // AVX512 implicitly uses modulo rotation amounts.
30108 if ((Subtarget.hasVLX() ||
30109 (Subtarget.hasAVX512() && Subtarget.hasEVEX512())) &&
30110 32 <= EltSizeInBits) {
30111 // Attempt to rotate by immediate.
30112 if (IsCstSplat) {
30113 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30114 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30115 return DAG.getNode(RotOpc, DL, VT, R,
30116 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30117 }
30118
30119 // Else, fall-back on VPROLV/VPRORV.
30120 return Op;
30121 }
30122
30123 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30124 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30125 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30126 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30127 }
30128
30129 SDValue Z = DAG.getConstant(0, DL, VT);
30130
30131 if (!IsROTL) {
30132 // If the ISD::ROTR amount is constant, we're always better converting to
30133 // ISD::ROTL.
30134 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30135 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30136
30137     // XOP targets always prefer ISD::ROTL.
30138 if (Subtarget.hasXOP())
30139 return DAG.getNode(ISD::ROTL, DL, VT, R,
30140 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30141 }
30142
30143   // Attempt to use GFNI gf2p8affine to rotate vXi8 by a uniform constant.
30144 if (IsCstSplat && Subtarget.hasGFNI() && VT.getScalarType() == MVT::i8 &&
30145       DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
30146     uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30147 uint64_t RotMask = getGFNICtrlImm(Opcode, RotAmt);
30148 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
30149 SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(RotMask, DL, MaskVT));
30150 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, R, Mask,
30151 DAG.getTargetConstant(0, DL, MVT::i8));
30152 }
30153
30154 // Split 256-bit integers on XOP/pre-AVX2 targets.
30155 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30156 return splitVectorIntBinary(Op, DAG, DL);
30157
30158 // XOP has 128-bit vector variable + immediate rotates.
30159 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30160 // XOP implicitly uses modulo rotation amounts.
30161 if (Subtarget.hasXOP()) {
30162 assert(IsROTL && "Only ROTL expected");
30163 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30164
30165 // Attempt to rotate by immediate.
30166 if (IsCstSplat) {
30167 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30168 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30169 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30170 }
30171
30172 // Use general rotate by variable (per-element).
30173 return Op;
30174 }
30175
30176   // Rotate by a uniform constant - expand back to shifts.
30177 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30178 // to other values when folded to shift amounts, losing the splat.
30179 if (IsCstSplat) {
30180 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30181 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30182 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30183 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30184 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30185 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30186 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30187 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30188 }
30189
30190 // Split 512-bit integers on non 512-bit BWI targets.
30191 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30192 return splitVectorIntBinary(Op, DAG, DL);
30193
30194 assert(
30195 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30196 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30197 Subtarget.hasAVX2()) ||
30198 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30199 "Only vXi32/vXi16/vXi8 vector rotates supported");
30200
30201 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30202 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30203
30204 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30205 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30206
30207 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30208 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30209 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
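  // For example, an i8 lane x = 0x81 rotated left by 1: unpack(x,x) forms the
  // i16 value 0x8181, the shift gives 0x0302, and keeping the high byte
  // yields 0x03 == rotl(0x81, 1).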
30210 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30211 int BaseRotAmtIdx = -1;
30212 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30213 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30214 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30215 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30216 }
30217 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30218 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30219 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30220 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30221 BaseRotAmtIdx, Subtarget, DAG);
30222 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30223 BaseRotAmtIdx, Subtarget, DAG);
30224 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30225 }
30226 }
30227
30228 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30229 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30230
30231 // Attempt to fold as unpack(x,x) << zext(y):
30232 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30233 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30234 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30235 if (!(ConstantAmt && EltSizeInBits != 8) &&
30236 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30237 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30238 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30239 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30240 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30241 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30242 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30243 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30244 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30245 }
30246
30247 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30248 // the amount bit.
30249 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30250 if (EltSizeInBits == 8) {
30251 MVT WideVT =
30252 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30253
30254 // Attempt to fold as:
30255 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30256 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
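 // Worked example for the widening fold: x = 0x96, y = 3 gives the i16 value
 // 0x9696; (0x9696 << 3) = 0xB4B0, >> 8 = 0xB4 = rotl(0x96, 3).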
30257 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30258 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30259 // If we're rotating by constant, just use default promotion.
30260 if (ConstantAmt)
30261 return SDValue();
30262 // See if we can perform this by widening to vXi16 or vXi32.
30263 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30264 R = DAG.getNode(
30265 ISD::OR, DL, WideVT, R,
30266 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30267 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30268 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30269 if (IsROTL)
30270 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30271 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30272 }
30273
30274 // We don't need ModuloAmt here as we just peek at individual bits.
30275 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30276 if (Subtarget.hasSSE41()) {
30277 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30278 // on the sign bit.
30279 V0 = DAG.getBitcast(VT, V0);
30280 V1 = DAG.getBitcast(VT, V1);
30281 Sel = DAG.getBitcast(VT, Sel);
30282 return DAG.getBitcast(SelVT,
30283 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30284 }
30285 // On pre-SSE41 targets we test for the sign bit by comparing to
30286 // zero - a negative value will set all bits of the lanes to true
30287 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30288 SDValue Z = DAG.getConstant(0, DL, SelVT);
30289 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30290 return DAG.getSelect(DL, SelVT, C, V0, V1);
30291 };
30292
30293 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30294 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30295 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30296 IsROTL = true;
30297 }
30298
30299 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30300 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30301
30302 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30303 // We can safely do this using i16 shifts as we're only interested in
30304 // the 3 lower bits of each byte.
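 // After the shift by 5, bit 2 of each (modulo-8) rotate amount sits in the
 // byte's sign bit, so the first blend picks between r and rot(r, 4); each
 // 'a += a' then moves the next lower amount bit into the sign bit for the
 // rot2 and rot1 stages.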
30305 Amt = DAG.getBitcast(ExtVT, Amt);
30306 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30307 Amt = DAG.getBitcast(VT, Amt);
30308
30309 // r = VSELECT(r, rot(r, 4), a);
30310 SDValue M;
30311 M = DAG.getNode(
30312 ISD::OR, DL, VT,
30313 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30314 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30315 R = SignBitSelect(VT, Amt, M, R);
30316
30317 // a += a
30318 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30319
30320 // r = VSELECT(r, rot(r, 2), a);
30321 M = DAG.getNode(
30322 ISD::OR, DL, VT,
30323 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30324 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30325 R = SignBitSelect(VT, Amt, M, R);
30326
30327 // a += a
30328 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30329
30330 // return VSELECT(r, rot(r, 1), a);
30331 M = DAG.getNode(
30332 ISD::OR, DL, VT,
30333 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30334 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30335 return SignBitSelect(VT, Amt, M, R);
30336 }
30337
30338 bool IsSplatAmt = DAG.isSplatValue(Amt);
30339 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30340 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30341
30342 // Fallback for splats + all supported variable shifts.
30343 // Fallback for non-constant AVX2 vXi16 as well.
30344 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30345 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30346 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30347 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30348 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30349 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30350 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30351 }
30352
30353 // Everything below assumes ISD::ROTL.
30354 if (!IsROTL) {
30355 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30356 IsROTL = true;
30357 }
30358
30359 // ISD::ROT* uses modulo rotate amounts.
30360 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30361
30362 assert(IsROTL && "Only ROTL supported");
30363
30364 // As with shifts, attempt to convert the rotation amount to a multiplication
30365 // factor, fallback to general expansion.
30366 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30367 if (!Scale)
30368 return SDValue();
30369
30370 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30371 if (EltSizeInBits == 16) {
30372 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30373 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30374 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30375 }
30376
30377 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30378 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30379 // that can then be OR'd with the lower 32-bits.
30380 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30381 static const int OddMask[] = {1, -1, 3, -1};
30382 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30383 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30384
30385 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30386 DAG.getBitcast(MVT::v2i64, R),
30387 DAG.getBitcast(MVT::v2i64, Scale));
30388 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30389 DAG.getBitcast(MVT::v2i64, R13),
30390 DAG.getBitcast(MVT::v2i64, Scale13));
30391 Res02 = DAG.getBitcast(VT, Res02);
30392 Res13 = DAG.getBitcast(VT, Res13);
30393
30394 return DAG.getNode(ISD::OR, DL, VT,
30395 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30396 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30397}
30398
30399/// Returns true if the operand type is exactly twice the native width, and
30400/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30401/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30402/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
30403bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30404 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30405
30406 if (OpWidth == 64)
30407 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30408 if (OpWidth == 128)
30409 return Subtarget.canUseCMPXCHG16B();
30410
30411 return false;
30412}
30413
30414TargetLowering::AtomicExpansionKind
30415X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30416 Type *MemType = SI->getValueOperand()->getType();
30417
30418 bool NoImplicitFloatOps =
30419 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30420 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30421 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30422 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30423 return AtomicExpansionKind::None;
30424
30425 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30426 : AtomicExpansionKind::None;
30427}
30428
30429// Note: this turns large loads into lock cmpxchg8b/16b.
30430// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30431TargetLowering::AtomicExpansionKind
30432X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30433 Type *MemType = LI->getType();
30434
30435 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
30436 // can use movq to do the load. If we have X87 we can load into an 80-bit
30437 // X87 register and store it to a stack temporary.
30438 bool NoImplicitFloatOps =
30439 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30440 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30441 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30442 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30443 return AtomicExpansionKind::None;
30444
30445 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30446 : AtomicExpansionKind::None;
30447}
30448
30449enum BitTestKind : unsigned {
30450 UndefBit,
30451 ConstantBit,
30452 NotConstantBit,
30453 ShiftBit,
30454 NotShiftBit
30455};
30456
30457static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30458 using namespace llvm::PatternMatch;
30459 BitTestKind BTK = UndefBit;
30460 auto *C = dyn_cast<ConstantInt>(V);
30461 if (C) {
30462 // Check if V is a power of 2 or NOT power of 2.
30463 if (isPowerOf2_64(C->getZExtValue()))
30464 BTK = ConstantBit;
30465 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30466 BTK = NotConstantBit;
30467 return {V, BTK};
30468 }
30469
30470 // Check if V is some power of 2 pattern known to be non-zero
30471 auto *I = dyn_cast<Instruction>(V);
30472 if (I) {
30473 bool Not = false;
30474 // Check if we have a NOT
30475 Value *PeekI;
30476 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30477 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30478 Not = true;
30479 I = dyn_cast<Instruction>(PeekI);
30480
30481 // If I is constant, it will fold and we can evaluate later. If it's an
30482 // argument or something of that nature, we can't analyze.
30483 if (I == nullptr)
30484 return {nullptr, UndefBit};
30485 }
30486 // We can only use 1 << X without more sophisticated analysis. C << X where
30487 // C is a power of 2 but not 1 can result in zero which cannot be translated
30488 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
30489 if (I->getOpcode() == Instruction::Shl) {
30490 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30491 // -X` and some other provable power of 2 patterns that we can use CTZ on
30492 // may be profitable.
30493 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30494 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30495 // be provably a non-zero power of 2.
30496 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30497 // transformable to bittest.
30498 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30499 if (!ShiftVal)
30500 return {nullptr, UndefBit};
30501 if (ShiftVal->equalsInt(1))
30502 BTK = Not ? NotShiftBit : ShiftBit;
30503
30504 if (BTK == UndefBit)
30505 return {nullptr, UndefBit};
30506
30507 Value *BitV = I->getOperand(1);
30508
30509 Value *AndOp;
30510 const APInt *AndC;
30511 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30512 // Read past a shiftmask instruction to find count
30513 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30514 BitV = AndOp;
30515 }
30516 return {BitV, BTK};
30517 }
30518 }
30519 return {nullptr, UndefBit};
30520}
30521
30522TargetLowering::AtomicExpansionKind
30523X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30524 using namespace llvm::PatternMatch;
30525 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30526 // prefix to a normal instruction for these operations.
30527 if (AI->use_empty())
30528 return AtomicExpansionKind::None;
30529
30530 if (AI->getOperation() == AtomicRMWInst::Xor) {
30531 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30532 // preferable to both `cmpxchg` and `btc`.
30533 if (match(AI->getOperand(1), m_SignMask()))
30534 return AtomicExpansionKind::None;
30535 }
30536
30537 // If the atomicrmw's result is used by a single bit AND, we may use
30538 // bts/btr/btc instructions for these operations.
30539 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30540 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30541 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30542 // detect it.
30543 Instruction *I = AI->user_back();
30544 auto BitChange = FindSingleBitChange(AI->getValOperand());
30545 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30546 I->getOpcode() != Instruction::And ||
30547 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30548 AI->getParent() != I->getParent())
30549 return AtomicExpansionKind::CmpXChg;
30550
30551 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30552
30553 // This is a redundant AND, it should get cleaned up elsewhere.
30554 if (AI == I->getOperand(OtherIdx))
30555 return AtomicExpansionKind::CmpXChg;
30556
30557 // The following instruction must be an AND with a single bit.
30558 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30559 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30560 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30561 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30562 return AtomicExpansionKind::CmpXChg;
30563 }
30564 if (AI->getOperation() == AtomicRMWInst::And) {
30565 return ~C1->getValue() == C2->getValue()
30566 ? AtomicExpansionKind::BitTestIntrinsic
30567 : AtomicExpansionKind::CmpXChg;
30568 }
30569 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30570 : AtomicExpansionKind::CmpXChg;
30571 }
30572
30573 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30574
30575 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30576 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30577 return AtomicExpansionKind::CmpXChg;
30578
30579 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30580
30581 // If shift amounts are not the same we can't use BitTestIntrinsic.
30582 if (BitChange.first != BitTested.first)
30583 return AtomicExpansionKind::CmpXChg;
30584
30585 // For an atomic AND, the mask must clear exactly one bit and the test must
30586 // check that same (cleared) bit.
30587 if (AI->getOperation() == AtomicRMWInst::And)
30588 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30589 ? AtomicExpansionKind::BitTestIntrinsic
30590 : AtomicExpansionKind::CmpXChg;
30591
30592 // For an atomic XOR/OR, we must be setting and testing the same bit.
30593 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30594 ? AtomicExpansionKind::BitTestIntrinsic
30595 : AtomicExpansionKind::CmpXChg;
30596}
30597
30598void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30599 IRBuilder<> Builder(AI);
30600 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30601 Intrinsic::ID IID_C;
30602 Intrinsic::ID IID_I;
30603 switch (AI->getOperation()) {
30604 default:
30605 llvm_unreachable("Unknown atomic operation");
30606 case AtomicRMWInst::Or:
30607 IID_C = Intrinsic::x86_atomic_bts;
30608 IID_I = Intrinsic::x86_atomic_bts_rm;
30609 break;
30610 case AtomicRMWInst::Xor:
30611 IID_C = Intrinsic::x86_atomic_btc;
30612 IID_I = Intrinsic::x86_atomic_btc_rm;
30613 break;
30614 case AtomicRMWInst::And:
30615 IID_C = Intrinsic::x86_atomic_btr;
30616 IID_I = Intrinsic::x86_atomic_btr_rm;
30617 break;
30618 }
30619 Instruction *I = AI->user_back();
30620 LLVMContext &Ctx = AI->getContext();
30621 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30622 PointerType::getUnqual(Ctx));
30623 Function *BitTest = nullptr;
30624 Value *Result = nullptr;
30625 auto BitTested = FindSingleBitChange(AI->getValOperand());
30626 assert(BitTested.first != nullptr);
30627
30628 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30629 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30630
30631 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30632
30633 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30634 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30635 } else {
30636 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30637
30638 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30639
30640 Value *SI = BitTested.first;
30641 assert(SI != nullptr);
30642
30643 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need
30644 // to mask it.
30645 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30646 Value *BitPos =
30647 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30648 // Todo(1): In many cases it may be provable that SI is less than
30649 // ShiftBits in which case this mask is unnecessary
30650 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30651 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30652 // favor of just a raw BT{S|R|C}.
30653
30654 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30655 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30656
30657 // If the result is only used for zero/non-zero status then we don't need to
30658 // shift the value back. Otherwise do so.
30659 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30660 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30661 if (ICmp->isEquality()) {
30662 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30663 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30664 if (C0 || C1) {
30665 assert(C0 == nullptr || C1 == nullptr);
30666 if ((C0 ? C0 : C1)->isZero())
30667 continue;
30668 }
30669 }
30670 }
30671 Result = Builder.CreateShl(Result, BitPos);
30672 break;
30673 }
30674 }
30675
30676 I->replaceAllUsesWith(Result);
30677 I->eraseFromParent();
30678 AI->eraseFromParent();
30679}
30680
30681static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30682 using namespace llvm::PatternMatch;
30683 if (!AI->hasOneUse())
30684 return false;
30685
30686 Value *Op = AI->getOperand(1);
30687 ICmpInst::Predicate Pred;
30688 Instruction *I = AI->user_back();
30689 AtomicRMWInst::BinOp Opc = AI->getOperation();
30690 if (Opc == AtomicRMWInst::Add) {
30691 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30692 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30693 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30694 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30695 return Pred == CmpInst::ICMP_SLT;
30696 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30697 return Pred == CmpInst::ICMP_SGT;
30698 }
30699 return false;
30700 }
30701 if (Opc == AtomicRMWInst::Sub) {
30702 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30703 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30704 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30705 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30706 return Pred == CmpInst::ICMP_SLT;
30707 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30708 return Pred == CmpInst::ICMP_SGT;
30709 }
30710 return false;
30711 }
30712 if ((Opc == AtomicRMWInst::Or &&
30713 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30714 (Opc == AtomicRMWInst::And &&
30715 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30716 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30717 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30718 Pred == CmpInst::ICMP_SLT;
30719 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30720 return Pred == CmpInst::ICMP_SGT;
30721 return false;
30722 }
30723 if (Opc == AtomicRMWInst::Xor) {
30724 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30725 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30726 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30727 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30728 return Pred == CmpInst::ICMP_SLT;
30729 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30730 return Pred == CmpInst::ICMP_SGT;
30731 }
30732 return false;
30733 }
30734
30735 return false;
30736}
30737
30738void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30739 AtomicRMWInst *AI) const {
30740 IRBuilder<> Builder(AI);
30741 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30742 Instruction *TempI = nullptr;
30743 LLVMContext &Ctx = AI->getContext();
30744 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30745 if (!ICI) {
30746 TempI = AI->user_back();
30747 assert(TempI->hasOneUse() && "Must have one use");
30748 ICI = cast<ICmpInst>(TempI->user_back());
30749 }
30750 X86::CondCode CC;
30751 ICmpInst::Predicate Pred = ICI->getPredicate();
30752 switch (Pred) {
30753 default:
30754 llvm_unreachable("Not supported Pred");
30755 case CmpInst::ICMP_EQ:
30756 CC = X86::COND_E;
30757 break;
30758 case CmpInst::ICMP_NE:
30759 CC = X86::COND_NE;
30760 break;
30761 case CmpInst::ICMP_SLT:
30762 CC = X86::COND_S;
30763 break;
30764 case CmpInst::ICMP_SGT:
30765 CC = X86::COND_NS;
30766 break;
30767 }
30768 Intrinsic::ID IID;
30769 switch (AI->getOperation()) {
30770 default:
30771 llvm_unreachable("Unknown atomic operation");
30772 case AtomicRMWInst::Add:
30773 IID = Intrinsic::x86_atomic_add_cc;
30774 break;
30775 case AtomicRMWInst::Sub:
30776 IID = Intrinsic::x86_atomic_sub_cc;
30777 break;
30778 case AtomicRMWInst::Or:
30779 IID = Intrinsic::x86_atomic_or_cc;
30780 break;
30781 case AtomicRMWInst::And:
30782 IID = Intrinsic::x86_atomic_and_cc;
30783 break;
30784 case AtomicRMWInst::Xor:
30785 IID = Intrinsic::x86_atomic_xor_cc;
30786 break;
30787 }
30788 Function *CmpArith =
30789 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30790 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30791 PointerType::getUnqual(Ctx));
30792 Value *Call = Builder.CreateCall(
30793 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30794 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30795 ICI->replaceAllUsesWith(Result);
30796 ICI->eraseFromParent();
30797 if (TempI)
30798 TempI->eraseFromParent();
30799 AI->eraseFromParent();
30800}
30801
30802TargetLowering::AtomicExpansionKind
30803X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30804 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30805 Type *MemType = AI->getType();
30806
30807 // If the operand is too big, we must see if cmpxchg8/16b is available
30808 // and default to library calls otherwise.
30809 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30810 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30811 : AtomicExpansionKind::None;
30812 }
30813
30814 AtomicRMWInst::BinOp Op = AI->getOperation();
30815 switch (Op) {
30816 case AtomicRMWInst::Xchg:
30817 return AtomicExpansionKind::None;
30818 case AtomicRMWInst::Add:
30819 case AtomicRMWInst::Sub:
30820 if (shouldExpandCmpArithRMWInIR(AI))
30821 return AtomicExpansionKind::CmpArithIntrinsic;
30822 // It's better to use xadd, xsub or xchg for these in other cases.
30823 return AtomicExpansionKind::None;
30824 case AtomicRMWInst::Or:
30825 case AtomicRMWInst::And:
30826 case AtomicRMWInst::Xor:
30827 if (shouldExpandCmpArithRMWInIR(AI))
30828 return AtomicExpansionKind::CmpArithIntrinsic;
30829 return shouldExpandLogicAtomicRMWInIR(AI);
30830 case AtomicRMWInst::Nand:
30831 case AtomicRMWInst::Max:
30832 case AtomicRMWInst::Min:
30833 case AtomicRMWInst::UMax:
30834 case AtomicRMWInst::UMin:
30835 case AtomicRMWInst::FAdd:
30836 case AtomicRMWInst::FSub:
30837 case AtomicRMWInst::FMax:
30838 case AtomicRMWInst::FMin:
30839 case AtomicRMWInst::UIncWrap:
30840 case AtomicRMWInst::UDecWrap:
30841 default:
30842 // These always require a non-trivial set of data operations on x86. We must
30843 // use a cmpxchg loop.
30844 return AtomicExpansionKind::CmpXChg;
30845 }
30846}
30847
30848LoadInst *
30849X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30850 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30851 Type *MemType = AI->getType();
30852 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30853 // there is no benefit in turning such RMWs into loads, and it is actually
30854 // harmful as it introduces an mfence.
30855 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30856 return nullptr;
30857
30858 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30859 // lowering available in lowerAtomicArith.
30860 // TODO: push more cases through this path.
30861 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30862 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30863 AI->use_empty())
30864 return nullptr;
30865
30866 IRBuilder<> Builder(AI);
30867 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30868 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30869 auto SSID = AI->getSyncScopeID();
30870 // We must restrict the ordering to avoid generating loads with Release or
30871 // ReleaseAcquire orderings.
30872 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30873
30874 // Before the load we need a fence. Here is an example lifted from
30875 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30876 // is required:
30877 // Thread 0:
30878 // x.store(1, relaxed);
30879 // r1 = y.fetch_add(0, release);
30880 // Thread 1:
30881 // y.fetch_add(42, acquire);
30882 // r2 = x.load(relaxed);
30883 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30884 // lowered to just a load without a fence. A mfence flushes the store buffer,
30885 // making the optimization clearly correct.
30886 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30887 // otherwise, we might be able to be more aggressive on relaxed idempotent
30888 // rmw. In practice, they do not look useful, so we don't try to be
30889 // especially clever.
30890 if (SSID == SyncScope::SingleThread)
30891 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30892 // the IR level, so we must wrap it in an intrinsic.
30893 return nullptr;
30894
30895 if (!Subtarget.hasMFence())
30896 // FIXME: it might make sense to use a locked operation here but on a
30897 // different cache-line to prevent cache-line bouncing. In practice it
30898 // is probably a small win, and x86 processors without mfence are rare
30899 // enough that we do not bother.
30900 return nullptr;
30901
30902 Function *MFence =
30903 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30904 Builder.CreateCall(MFence, {});
30905
30906 // Finally we can emit the atomic load.
30907 LoadInst *Loaded = Builder.CreateAlignedLoad(
30908 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30909 Loaded->setAtomic(Order, SSID);
30910 AI->replaceAllUsesWith(Loaded);
30911 AI->eraseFromParent();
30912 return Loaded;
30913}
30914
30915/// Emit a locked operation on a stack location which does not change any
30916/// memory location, but does involve a lock prefix. Location is chosen to be
30917/// a) very likely accessed only by a single thread to minimize cache traffic,
30918/// and b) definitely dereferenceable. Returns the new Chain result.
30919static SDValue emitLockedStackOp(SelectionDAG &DAG,
30920 const X86Subtarget &Subtarget, SDValue Chain,
30921 const SDLoc &DL) {
30922 // Implementation notes:
30923 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30924 // operations issued by the current processor. As such, the location
30925 // referenced is not relevant for the ordering properties of the instruction.
30926 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30927 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30928 // 2) Using an immediate operand appears to be the best encoding choice
30929 // here since it doesn't require an extra register.
30930 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30931 // is small enough it might just be measurement noise.)
30932 // 4) When choosing offsets, there are several contributing factors:
30933 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30934 // line aligned stack object to improve this case.)
30935 // b) To minimize our chances of introducing a false dependence, we prefer
30936 // to offset the stack usage from TOS slightly.
30937 // c) To minimize concerns about cross thread stack usage - in particular,
30938 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30939 // captures state in the TOS frame and accesses it from many threads -
30940 // we want to use an offset such that the offset is in a distinct cache
30941 // line from the TOS frame.
30942 //
30943 // For a general discussion of the tradeoffs and benchmark results, see:
30944 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
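 // E.g. with a 128-byte red zone this emits roughly "lock orl $0x0, -64(%rsp)";
 // without one it targets (%rsp) / (%esp) directly.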
30945
30946 auto &MF = DAG.getMachineFunction();
30947 auto &TFL = *Subtarget.getFrameLowering();
30948 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30949
30950 if (Subtarget.is64Bit()) {
30951 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30952 SDValue Ops[] = {
30953 DAG.getRegister(X86::RSP, MVT::i64), // Base
30954 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30955 DAG.getRegister(0, MVT::i64), // Index
30956 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30957 DAG.getRegister(0, MVT::i16), // Segment.
30958 Zero,
30959 Chain};
30960 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30961 MVT::Other, Ops);
30962 return SDValue(Res, 1);
30963 }
30964
30965 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30966 SDValue Ops[] = {
30967 DAG.getRegister(X86::ESP, MVT::i32), // Base
30968 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30969 DAG.getRegister(0, MVT::i32), // Index
30970 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30971 DAG.getRegister(0, MVT::i16), // Segment.
30972 Zero,
30973 Chain
30974 };
30975 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30976 MVT::Other, Ops);
30977 return SDValue(Res, 1);
30978}
30979
30980static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30981 SelectionDAG &DAG) {
30982 SDLoc dl(Op);
30983 AtomicOrdering FenceOrdering =
30984 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30985 SyncScope::ID FenceSSID =
30986 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30987
30988 // The only fence that needs an instruction is a sequentially-consistent
30989 // cross-thread fence.
30990 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30991 FenceSSID == SyncScope::System) {
30992 if (Subtarget.hasMFence())
30993 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30994
30995 SDValue Chain = Op.getOperand(0);
30996 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30997 }
30998
30999 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31000 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
31001}
31002
31003static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
31004 SelectionDAG &DAG) {
31005 MVT T = Op.getSimpleValueType();
31006 SDLoc DL(Op);
31007 unsigned Reg = 0;
31008 unsigned size = 0;
31009 switch(T.SimpleTy) {
31010 default: llvm_unreachable("Invalid value type!");
31011 case MVT::i8: Reg = X86::AL; size = 1; break;
31012 case MVT::i16: Reg = X86::AX; size = 2; break;
31013 case MVT::i32: Reg = X86::EAX; size = 4; break;
31014 case MVT::i64:
31015 assert(Subtarget.is64Bit() && "Node not type legal!");
31016 Reg = X86::RAX; size = 8;
31017 break;
31018 }
31019 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
31020 Op.getOperand(2), SDValue());
31021 SDValue Ops[] = { cpIn.getValue(0),
31022 Op.getOperand(1),
31023 Op.getOperand(3),
31024 DAG.getTargetConstant(size, DL, MVT::i8),
31025 cpIn.getValue(1) };
31026 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
31027 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
31028 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
31029 Ops, T, MMO);
31030
31031 SDValue cpOut =
31032 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
31033 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
31034 MVT::i32, cpOut.getValue(2));
31035 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
31036
31037 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
31038 cpOut, Success, EFLAGS.getValue(1));
31039}
31040
31041// Create MOVMSKB, taking into account whether we need to split for AVX1.
31042static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
31043 const X86Subtarget &Subtarget) {
31044 MVT InVT = V.getSimpleValueType();
31045
31046 if (InVT == MVT::v64i8) {
31047 SDValue Lo, Hi;
31048 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31049 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
31050 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
31051 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
31052 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
31053 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
31054 DAG.getConstant(32, DL, MVT::i8));
31055 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
31056 }
31057 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
31058 SDValue Lo, Hi;
31059 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31060 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31061 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31062 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31063 DAG.getConstant(16, DL, MVT::i8));
31064 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31065 }
31066
31067 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31068}
31069
31070static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31071 SelectionDAG &DAG) {
31072 SDValue Src = Op.getOperand(0);
31073 MVT SrcVT = Src.getSimpleValueType();
31074 MVT DstVT = Op.getSimpleValueType();
31075
31076 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31077 // half to v32i1 and concatenating the result.
31078 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31079 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31080 assert(Subtarget.hasBWI() && "Expected BWI target");
31081 SDLoc dl(Op);
31082 SDValue Lo, Hi;
31083 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31084 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31085 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31086 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31087 }
31088
31089 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31090 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31091 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31092 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31093 SDLoc DL(Op);
31094 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31095 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31096 return DAG.getZExtOrTrunc(V, DL, DstVT);
31097 }
31098
31099 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31100 SrcVT == MVT::i64) && "Unexpected VT!");
31101
31102 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31103 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31104 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31105 // This conversion needs to be expanded.
31106 return SDValue();
31107
31108 SDLoc dl(Op);
31109 if (SrcVT.isVector()) {
31110 // Widen the input vector in the case of MVT::v2i32.
31111 // Example: from MVT::v2i32 to MVT::v4i32.
31112 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31113 SrcVT.getVectorNumElements() * 2);
31114 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31115 DAG.getUNDEF(SrcVT));
31116 } else {
31117 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31118 "Unexpected source type in LowerBITCAST");
31119 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31120 }
31121
31122 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31123 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31124
31125 if (DstVT == MVT::x86mmx)
31126 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31127
31128 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31129 DAG.getIntPtrConstant(0, dl));
31130}
31131
31132/// Compute the horizontal sum of bytes in V for the elements of VT.
31133///
31134/// Requires V to be a byte vector and VT to be an integer vector type with
31135/// wider elements than V's type. The width of the elements of VT determines
31136/// how many bytes of V are summed horizontally to produce each element of the
31137/// result.
31138static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31139 const X86Subtarget &Subtarget,
31140 SelectionDAG &DAG) {
31141 SDLoc DL(V);
31142 MVT ByteVecVT = V.getSimpleValueType();
31143 MVT EltVT = VT.getVectorElementType();
31144 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31145 "Expected value to have byte element type.");
31146 assert(EltVT != MVT::i8 &&
31147 "Horizontal byte sum only makes sense for wider elements!");
31148 unsigned VecSize = VT.getSizeInBits();
31149 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31150
31151 // The PSADBW instruction horizontally adds all bytes and leaves the result in
31152 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
31153 if (EltVT == MVT::i64) {
31154 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31155 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31156 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31157 return DAG.getBitcast(VT, V);
31158 }
31159
31160 if (EltVT == MVT::i32) {
31161 // We unpack the low half and high half into i32s interleaved with zeros so
31162 // that we can use PSADBW to horizontally sum them. The most useful part of
31163 // this is that it lines up the results of two PSADBW instructions to be
31164 // two v2i64 vectors which concatenated are the 4 population counts. We can
31165 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
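 // E.g. for a v4i32 result the 16 byte-counts are interleaved with zeros,
 // PSADBW reduces each group of 4 counts (plus 4 zero bytes) into an i64 lane,
 // and PACKUS narrows the four i64 sums back into the v4i32 result.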
31166 SDValue Zeros = DAG.getConstant(0, DL, VT);
31167 SDValue V32 = DAG.getBitcast(VT, V);
31168 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31169 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31170
31171 // Do the horizontal sums into two v2i64s.
31172 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31173 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31174 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31175 DAG.getBitcast(ByteVecVT, Low), Zeros);
31176 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31177 DAG.getBitcast(ByteVecVT, High), Zeros);
31178
31179 // Merge them together.
31180 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31181 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31182 DAG.getBitcast(ShortVecVT, Low),
31183 DAG.getBitcast(ShortVecVT, High));
31184
31185 return DAG.getBitcast(VT, V);
31186 }
31187
31188 // The only element type left is i16.
31189 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31190
31191 // To obtain pop count for each i16 element starting from the pop count for
31192 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31193 // right by 8. It is important to shift as i16s since an i8 vector shift isn't
31194 // directly supported.
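 // E.g. if an i16 lane holds byte counts [hi, lo], the i16 shift left by 8
 // gives [lo, 0], the i8 add gives [hi + lo, lo] (no carry, as each count is
 // at most 8), and the i16 shift right by 8 leaves hi + lo.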
31195 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31196 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31197 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31198 DAG.getBitcast(ByteVecVT, V));
31199 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31200}
31201
31202static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31203 const X86Subtarget &Subtarget,
31204 SelectionDAG &DAG) {
31205 MVT VT = Op.getSimpleValueType();
31206 MVT EltVT = VT.getVectorElementType();
31207 int NumElts = VT.getVectorNumElements();
31208 (void)EltVT;
31209 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31210
31211 // Implement a lookup table in register by using an algorithm based on:
31212 // http://wm.ite.pl/articles/sse-popcount.html
31213 //
31214 // The general idea is that every lower byte nibble in the input vector is an
31215 // index into an in-register pre-computed pop count table. We then split up
31216 // the input vector into two new ones: (1) a vector with only the shifted-right
31217 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31218 // masked out higher ones) for each byte. PSHUFB is used separately with both
31219 // to index the in-register table. Next, both are added and the result is an
31220 // i8 vector where each element contains the pop count for its input byte.
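 // E.g. input byte 0xB7: low nibble 0x7 -> LUT[7] = 3, high nibble 0xB ->
 // LUT[0xB] = 3, and 3 + 3 = 6 = popcount(0xB7).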
31221 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31222 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31223 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31224 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
31225
31226 SmallVector<SDValue, 64> LUTVec;
31227 for (int i = 0; i < NumElts; ++i)
31228 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31229 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31230 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31231
31232 // High nibbles
31233 SDValue FourV = DAG.getConstant(4, DL, VT);
31234 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31235
31236 // Low nibbles
31237 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31238
31239 // The input vector is used as the shuffle mask that indexes elements into the
31240 // LUT. After counting low and high nibbles, add the vector to obtain the
31241 // final pop count per i8 element.
31242 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31243 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31244 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31245}
31246
31247// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31248// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31249static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31250 const X86Subtarget &Subtarget,
31251 SelectionDAG &DAG) {
31252 MVT VT = Op.getSimpleValueType();
31253 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31254 "Unknown CTPOP type to handle");
31255 SDValue Op0 = Op.getOperand(0);
31256
31257 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31258 if (Subtarget.hasVPOPCNTDQ()) {
31259 unsigned NumElems = VT.getVectorNumElements();
31260 assert((VT.getVectorElementType() == MVT::i8 ||
31261 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31262 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31263 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31264 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31265 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31266 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31267 }
31268 }
31269
31270 // Decompose 256-bit ops into smaller 128-bit ops.
31271 if (VT.is256BitVector() && !Subtarget.hasInt256())
31272 return splitVectorIntUnary(Op, DAG, DL);
31273
31274 // Decompose 512-bit ops into smaller 256-bit ops.
31275 if (VT.is512BitVector() && !Subtarget.hasBWI())
31276 return splitVectorIntUnary(Op, DAG, DL);
31277
31278 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31279 if (VT.getScalarType() != MVT::i8) {
31280 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31281 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31282 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31283 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31284 }
31285
31286 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31287 if (!Subtarget.hasSSSE3())
31288 return SDValue();
31289
31290 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31291}
31292
31293static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31294 SelectionDAG &DAG) {
31295 MVT VT = N.getSimpleValueType();
31296 SDValue Op = N.getOperand(0);
31297 SDLoc DL(N);
31298
31299 if (VT.isScalarInteger()) {
31300 // Compute the lower/upper bounds of the active bits of the value,
31301 // allowing us to shift the active bits down if necessary to fit into the
31302 // special cases below.
31303 KnownBits Known = DAG.computeKnownBits(Op);
31304 unsigned LZ = Known.countMinLeadingZeros();
31305 unsigned TZ = Known.countMinTrailingZeros();
31306 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31307 unsigned ActiveBits = Known.getBitWidth() - LZ;
31308 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31309
31310 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
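 // E.g. 0b11 -> 3 - 1 = 2, 0b10 -> 2 - 1 = 1, 0b01 -> 1 - 0 = 1.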
31311 if (ShiftedActiveBits <= 2) {
31312 if (ActiveBits > 2)
31313 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31314 DAG.getShiftAmountConstant(TZ, VT, DL));
31315 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31316 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31317 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31318 DAG.getShiftAmountConstant(1, VT, DL)));
31319 return DAG.getZExtOrTrunc(Op, DL, VT);
31320 }
31321
31322 // i3 CTPOP - perform LUT into i32 integer.
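 // The constant below packs the eight 2-bit popcounts of 0..7, so
 // ctpop(x) == (0b1110100110010100 >> (2 * x)) & 0x3, e.g. x = 5 -> 2.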
31323 if (ShiftedActiveBits <= 3) {
31324 if (ActiveBits > 3)
31325 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31326 DAG.getShiftAmountConstant(TZ, VT, DL));
31327 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31328 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31329 DAG.getShiftAmountConstant(1, VT, DL));
31330 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31331 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31332 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31333 DAG.getConstant(0x3, DL, MVT::i32));
31334 return DAG.getZExtOrTrunc(Op, DL, VT);
31335 }
31336
31337 // i4 CTPOP - perform LUT into i64 integer.
31338 if (ShiftedActiveBits <= 4 &&
31339 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31340 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31341 if (ActiveBits > 4)
31342 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31343 DAG.getShiftAmountConstant(TZ, VT, DL));
31344 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31345 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31346 DAG.getConstant(4, DL, MVT::i32));
31347 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31348 DAG.getShiftAmountOperand(MVT::i64, Op));
31349 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31350 DAG.getConstant(0x7, DL, MVT::i64));
31351 return DAG.getZExtOrTrunc(Op, DL, VT);
31352 }
31353
31354 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
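 // The multiply by 0x08040201 plus the shift/mask spreads one bit of the byte
 // into each nibble, and the multiply by 0x11111111 sums those eight nibbles
 // into the top nibble. E.g. x = 0x96 gives the masked value 0x10010110,
 // whose nibble sum 4 = popcount(0x96) ends up in bits 31:28.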
31355 if (ShiftedActiveBits <= 8) {
31356 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31357 if (ActiveBits > 8)
31358 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31359 DAG.getShiftAmountConstant(TZ, VT, DL));
31360 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31361 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31362 DAG.getConstant(0x08040201U, DL, MVT::i32));
31363 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31364 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31365 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31366 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31367 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31368 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31369 return DAG.getZExtOrTrunc(Op, DL, VT);
31370 }
31371
31372 return SDValue(); // fallback to generic expansion.
31373 }
31374
31375 assert(VT.isVector() &&
31376 "We only do custom lowering for vector population count.");
31377 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31378}
31379
31380static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31381 MVT VT = Op.getSimpleValueType();
31382 SDValue In = Op.getOperand(0);
31383 SDLoc DL(Op);
31384
31385 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31386 // perform the BITREVERSE.
31387 if (!VT.isVector()) {
31388 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31389 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31390 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31391 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31392 DAG.getIntPtrConstant(0, DL));
31393 }
31394
31395 int NumElts = VT.getVectorNumElements();
31396 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31397
31398 // Decompose 256-bit ops into smaller 128-bit ops.
31399 if (VT.is256BitVector())
31400 return splitVectorIntUnary(Op, DAG, DL);
31401
31402 assert(VT.is128BitVector() &&
31403 "Only 128-bit vector bitreverse lowering supported.");
31404
31405 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31406 // perform the BSWAP in the shuffle.
31407 // It's best to shuffle using the second operand as this will implicitly allow
31408 // memory folding for multiple vectors.
31409 SmallVector<SDValue, 16> MaskElts;
31410 for (int i = 0; i != NumElts; ++i) {
31411 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31412 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31413 int PermuteByte = SourceByte | (2 << 5);
31414 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31415 }
31416 }
31417
31418 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31419 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31420 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31421 Res, Mask);
31422 return DAG.getBitcast(VT, Res);
31423}
31424
31425static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31426 SelectionDAG &DAG) {
31427 MVT VT = Op.getSimpleValueType();
31428
31429 if (Subtarget.hasXOP() && !VT.is512BitVector())
31430 return LowerBITREVERSE_XOP(Op, DAG);
31431
31432 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31433
31434 SDValue In = Op.getOperand(0);
31435 SDLoc DL(Op);
31436
31437 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31438 if (VT.is512BitVector() && !Subtarget.hasBWI())
31439 return splitVectorIntUnary(Op, DAG, DL);
31440
31441 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31442 if (VT.is256BitVector() && !Subtarget.hasInt256())
31443 return splitVectorIntUnary(Op, DAG, DL);
31444
31445 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31446 if (!VT.isVector()) {
31447 assert(
31448 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31449 "Only tested for i8/i16/i32/i64");
31450 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31451 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31452 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31453 DAG.getBitcast(MVT::v16i8, Res));
31454 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31455 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31456 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31457 }
31458
31459 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31460
31461 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31462 if (VT.getScalarType() != MVT::i8) {
31463 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31464 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31465 Res = DAG.getBitcast(ByteVT, Res);
31466 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31467 return DAG.getBitcast(VT, Res);
31468 }
31469 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31470 "Only byte vector BITREVERSE supported");
31471
31472 unsigned NumElts = VT.getVectorNumElements();
31473
31474 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31475 if (Subtarget.hasGFNI()) {
31476 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31477 SDValue Matrix =
31478 DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31479 Matrix = DAG.getBitcast(VT, Matrix);
31480 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31481 DAG.getTargetConstant(0, DL, MVT::i8));
31482 }
31483
31484 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31485 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31486 // 0-15 value (moved to the other nibble).
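 // E.g. input byte 0x1E: LoLUT[0xE] = 0x70, HiLUT[0x1] = 0x08, and
 // 0x70 | 0x08 = 0x78 = bitreverse(0x1E).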
31487 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31488 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31489 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31490
31491 const int LoLUT[16] = {
31492 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31493 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31494 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31495 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31496 const int HiLUT[16] = {
31497 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31498 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31499 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31500 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
31501
31502 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31503 for (unsigned i = 0; i < NumElts; ++i) {
31504 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31505 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31506 }
31507
31508 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31509 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31510 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31511 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31512 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31513}
31514
31515static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31516 SelectionDAG &DAG) {
31517 SDLoc DL(Op);
31518 SDValue X = Op.getOperand(0);
31519 MVT VT = Op.getSimpleValueType();
31520
31521 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
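 // The parity flag (PF) only reflects the low 8 bits of a result and is set
 // when that byte has an even number of 1 bits, so SETNP yields the parity.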
31522 if (VT == MVT::i8 ||
31523 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31524 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31525 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31526 DAG.getConstant(0, DL, MVT::i8));
31527 // Copy the inverse of the parity flag into a register with setcc.
31528 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31529 // Extend to the original type.
31530 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31531 }
31532
31533 // If we have POPCNT, use the default expansion.
31534 if (Subtarget.hasPOPCNT())
31535 return SDValue();
31536
31537 if (VT == MVT::i64) {
31538 // Xor the high and low 32-bit halves together using a 32-bit operation.
31539 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31540 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31541 DAG.getConstant(32, DL, MVT::i8)));
31542 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31543 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31544 }
31545
31546 if (VT != MVT::i16) {
31547 // Xor the high and low 16-bits together using a 32-bit operation.
31548 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31549 DAG.getConstant(16, DL, MVT::i8));
31550 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31551 } else {
31552 // If the input is 16-bits, we need to extend to use an i32 shift below.
31553 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31554 }
31555
31556 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31557 // This should allow an h-reg to be used to save a shift.
31558 SDValue Hi = DAG.getNode(
31559 ISD::TRUNCATE, DL, MVT::i8,
31560 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31561 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31562 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31563 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31564
31565 // Copy the inverse of the parity flag into a register with setcc.
31566 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31567 // Extend to the original type.
31568 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31569}
31570
31572 const X86Subtarget &Subtarget) {
31573 unsigned NewOpc = 0;
31574 switch (N->getOpcode()) {
31575 case ISD::ATOMIC_LOAD_ADD:
31576 NewOpc = X86ISD::LADD;
31577 break;
31578 case ISD::ATOMIC_LOAD_SUB:
31579 NewOpc = X86ISD::LSUB;
31580 break;
31581 case ISD::ATOMIC_LOAD_OR:
31582 NewOpc = X86ISD::LOR;
31583 break;
31584 case ISD::ATOMIC_LOAD_XOR:
31585 NewOpc = X86ISD::LXOR;
31586 break;
31587 case ISD::ATOMIC_LOAD_AND:
31588 NewOpc = X86ISD::LAND;
31589 break;
31590 default:
31591 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31592 }
31593
31594 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31595
31596 return DAG.getMemIntrinsicNode(
31597 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31598 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31599 /*MemVT=*/N->getSimpleValueType(0), MMO);
31600}
31601
31602/// Lower atomic_load_ops into LOCK-prefixed operations.
31603static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31604 const X86Subtarget &Subtarget) {
31605 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31606 SDValue Chain = N->getOperand(0);
31607 SDValue LHS = N->getOperand(1);
31608 SDValue RHS = N->getOperand(2);
31609 unsigned Opc = N->getOpcode();
31610 MVT VT = N->getSimpleValueType(0);
31611 SDLoc DL(N);
31612
31613 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31614 // can only be lowered when the result is unused. They should have already
31615 // been transformed into a cmpxchg loop in AtomicExpand.
31616 if (N->hasAnyUseOfValue(0)) {
31617 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31618 // select LXADD if LOCK_SUB can't be selected.
31619 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31620 // can use LXADD as opposed to cmpxchg.
31621 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31622 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
31623 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31624 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31625
31626 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31627 "Used AtomicRMW ops other than Add should have been expanded!");
31628 return N;
31629 }
31630
31631 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31632 // The core idea here is that since the memory location isn't actually
31633 // changing, all we need is a lowering for the *ordering* impacts of the
31634 // atomicrmw. As such, we can choose a different operation and memory
31635 // location to minimize impact on other code.
31636 // The above holds unless the node is marked volatile in which
31637 // case it needs to be preserved according to the langref.
31638 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31639 // On X86, the only ordering which actually requires an instruction is
31640 // seq_cst that isn't SingleThread; everything else just needs to be preserved
31641 // during codegen and then dropped. Note that we expect (but don't assume)
31642 // that orderings other than seq_cst and acq_rel have been canonicalized to
31643 // a store or load.
31644 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31645 AN->getSyncScopeID() == SyncScope::System) {
31646 // Prefer a locked operation against a stack location to minimize cache
31647 // traffic. This assumes that stack locations are very likely to be
31648 // accessed only by the owning thread.
31649 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31650 assert(!N->hasAnyUseOfValue(0));
31651 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31652 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31653 DAG.getUNDEF(VT), NewChain);
31654 }
31655 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31656 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31657 assert(!N->hasAnyUseOfValue(0));
31658 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31659 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31660 DAG.getUNDEF(VT), NewChain);
31661 }
31662
31663 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31664 // RAUW the chain, but don't worry about the result, as it's unused.
31665 assert(!N->hasAnyUseOfValue(0));
31666 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31667 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31668 DAG.getUNDEF(VT), LockOp.getValue(1));
31669}
31670
31671 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31672 const X86Subtarget &Subtarget) {
31673 auto *Node = cast<AtomicSDNode>(Op.getNode());
31674 SDLoc dl(Node);
31675 EVT VT = Node->getMemoryVT();
31676
31677 bool IsSeqCst =
31678 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31679 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31680
31681 // If this store is not sequentially consistent and the type is legal
31682 // we can just keep it.
31683 if (!IsSeqCst && IsTypeLegal)
31684 return Op;
31685
31686 if (VT == MVT::i64 && !IsTypeLegal) {
31687 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31688 // is enabled.
31689 bool NoImplicitFloatOps =
31690 DAG.getMachineFunction().getFunction().hasFnAttribute(
31691 Attribute::NoImplicitFloat);
31692 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31693 SDValue Chain;
31694 if (Subtarget.hasSSE1()) {
31695 SDValue SclToVec =
31696 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31697 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31698 SclToVec = DAG.getBitcast(StVT, SclToVec);
31699 SDVTList Tys = DAG.getVTList(MVT::Other);
31700 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31701 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31702 MVT::i64, Node->getMemOperand());
31703 } else if (Subtarget.hasX87()) {
31704 // First load this into an 80-bit X87 register using a stack temporary.
31705 // This will put the whole integer into the significand.
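// (An FILD/FIST pair moves all 64 bits in one memory access each, which x86
// guarantees to be atomic for naturally aligned 64-bit operands; this is the
// classic pre-SSE way to get an atomic i64 store on i386.)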
31706 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31707 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31708 MachinePointerInfo MPI =
31709 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31710 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31711 MPI, MaybeAlign(), MachineMemOperand::MOStore);
31712 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31713 SDValue LdOps[] = {Chain, StackPtr};
31714 SDValue Value = DAG.getMemIntrinsicNode(
31715 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31716 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31717 Chain = Value.getValue(1);
31718
31719 // Now use an FIST to do the atomic store.
31720 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31721 Chain =
31722 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31723 StoreOps, MVT::i64, Node->getMemOperand());
31724 }
31725
31726 if (Chain) {
31727 // If this is a sequentially consistent store, also emit an appropriate
31728 // barrier.
31729 if (IsSeqCst)
31730 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31731
31732 return Chain;
31733 }
31734 }
31735 }
31736
31737 // Convert seq_cst store -> xchg
31738 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31739 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31740 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31741 Node->getOperand(0), Node->getOperand(2),
31742 Node->getOperand(1), Node->getMemOperand());
31743 return Swap.getValue(1);
31744}
31745
31746 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31747 SDNode *N = Op.getNode();
31748 MVT VT = N->getSimpleValueType(0);
31749 unsigned Opc = Op.getOpcode();
31750
31751 // Let legalize expand this if it isn't a legal type yet.
31752 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31753 return SDValue();
31754
31755 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31756 SDLoc DL(N);
31757
31758 // Set the carry flag.
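// Adding all-ones (-1) to the incoming carry value materializes it in EFLAGS:
// any nonzero carry wraps around and sets CF, a zero carry leaves CF clear,
// so the ADC/SBB below can consume it directly.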
31759 SDValue Carry = Op.getOperand(2);
31760 EVT CarryVT = Carry.getValueType();
31761 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31762 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31763
31764 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31765 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31766 Op.getOperand(0), Op.getOperand(1),
31767 Carry.getValue(1));
31768
31769 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31770 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31771 Sum.getValue(1), DL, DAG);
31772 if (N->getValueType(1) == MVT::i1)
31773 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31774
31775 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31776}
31777
31778static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31779 SelectionDAG &DAG) {
31780 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31781
31782 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31783 // which returns the values as { float, float } (in XMM0) or
31784 // { double, double } (which is returned in XMM0, XMM1).
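// (Roughly, the callee behaves like
//   struct { double sin, cos; } __sincos_stret(double);
// with the f32 flavour packing both results into xmm0.)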
31785 SDLoc dl(Op);
31786 SDValue Arg = Op.getOperand(0);
31787 EVT ArgVT = Arg.getValueType();
31788 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31789
31790 TargetLowering::ArgListTy Args;
31791 TargetLowering::ArgListEntry Entry;
31792
31793 Entry.Node = Arg;
31794 Entry.Ty = ArgTy;
31795 Entry.IsSExt = false;
31796 Entry.IsZExt = false;
31797 Args.push_back(Entry);
31798
31799 bool isF64 = ArgVT == MVT::f64;
31800 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31801 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31802 // the results are returned via SRet in memory.
31803 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31804 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31805 const char *LibcallName = TLI.getLibcallName(LC);
31806 SDValue Callee =
31807 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31808
31809 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31810 : (Type *)FixedVectorType::get(ArgTy, 4);
31811
31812 TargetLowering::CallLoweringInfo CLI(DAG);
31813 CLI.setDebugLoc(dl)
31814 .setChain(DAG.getEntryNode())
31815 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31816
31817 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31818
31819 if (isF64)
31820 // Returned in xmm0 and xmm1.
31821 return CallResult.first;
31822
31823 // Returned in bits 0:31 and 32:63 of xmm0.
31824 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31825 CallResult.first, DAG.getIntPtrConstant(0, dl));
31826 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31827 CallResult.first, DAG.getIntPtrConstant(1, dl));
31828 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31829 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31830}
31831
31832/// Widen a vector input to a vector of NVT. The
31833/// input vector must have the same element type as NVT.
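/// For example, widening a v4i32 input to v16i32 keeps the input in lanes
/// [0,3] and fills lanes [4,15] with undef, or with zeroes when
/// FillWithZeroes is set.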
31834 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31835 bool FillWithZeroes = false) {
31836 // Check if InOp already has the right width.
31837 MVT InVT = InOp.getSimpleValueType();
31838 if (InVT == NVT)
31839 return InOp;
31840
31841 if (InOp.isUndef())
31842 return DAG.getUNDEF(NVT);
31843
31844 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31845 "input and widen element type must match");
31846
31847 unsigned InNumElts = InVT.getVectorNumElements();
31848 unsigned WidenNumElts = NVT.getVectorNumElements();
31849 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31850 "Unexpected request for vector widening");
31851
31852 SDLoc dl(InOp);
31853 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31854 InOp.getNumOperands() == 2) {
31855 SDValue N1 = InOp.getOperand(1);
31856 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31857 N1.isUndef()) {
31858 InOp = InOp.getOperand(0);
31859 InVT = InOp.getSimpleValueType();
31860 InNumElts = InVT.getVectorNumElements();
31861 }
31862 }
31863 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31864 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31865 SmallVector<SDValue, 16> Ops;
31866 for (unsigned i = 0; i < InNumElts; ++i)
31867 Ops.push_back(InOp.getOperand(i));
31868
31869 EVT EltVT = InOp.getOperand(0).getValueType();
31870
31871 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31872 DAG.getUNDEF(EltVT);
31873 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31874 Ops.push_back(FillVal);
31875 return DAG.getBuildVector(NVT, dl, Ops);
31876 }
31877 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31878 DAG.getUNDEF(NVT);
31879 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31880 InOp, DAG.getIntPtrConstant(0, dl));
31881}
31882
31883 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31884 SelectionDAG &DAG) {
31885 assert(Subtarget.hasAVX512() &&
31886 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31887
31888 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31889 SDValue Src = N->getValue();
31890 MVT VT = Src.getSimpleValueType();
31891 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31892 SDLoc dl(Op);
31893
31894 SDValue Scale = N->getScale();
31895 SDValue Index = N->getIndex();
31896 SDValue Mask = N->getMask();
31897 SDValue Chain = N->getChain();
31898 SDValue BasePtr = N->getBasePtr();
31899
31900 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31901 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31902 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31903 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31904 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31905 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31906 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31907 SDVTList VTs = DAG.getVTList(MVT::Other);
31908 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31909 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31910 N->getMemoryVT(), N->getMemOperand());
31911 }
31912 return SDValue();
31913 }
31914
31915 MVT IndexVT = Index.getSimpleValueType();
31916
31917 // If the index is v2i32, we're being called by type legalization and we
31918 // should just let the default handling take care of it.
31919 if (IndexVT == MVT::v2i32)
31920 return SDValue();
31921
31922 // If we don't have VLX and neither the passthru nor the index is 512 bits,
31923 // we need to widen until one is.
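// For example (one illustrative case): a v4f32 scatter with a v4i64 index
// widens to v8f32 data and a v8i64 index, and the v4i1 mask is padded with
// zeroes to v8i1 so the extra lanes never store.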
31924 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31925 !Index.getSimpleValueType().is512BitVector()) {
31926 // Determine how much we need to widen by to get a 512-bit type.
31927 unsigned Factor = std::min(512/VT.getSizeInBits(),
31928 512/IndexVT.getSizeInBits());
31929 unsigned NumElts = VT.getVectorNumElements() * Factor;
31930
31931 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31932 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31933 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31934
31935 Src = ExtendToType(Src, VT, DAG);
31936 Index = ExtendToType(Index, IndexVT, DAG);
31937 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31938 }
31939
31940 SDVTList VTs = DAG.getVTList(MVT::Other);
31941 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31942 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31943 N->getMemoryVT(), N->getMemOperand());
31944}
31945
31946static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31947 SelectionDAG &DAG) {
31948
31949 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31950 MVT VT = Op.getSimpleValueType();
31951 MVT ScalarVT = VT.getScalarType();
31952 SDValue Mask = N->getMask();
31953 MVT MaskVT = Mask.getSimpleValueType();
31954 SDValue PassThru = N->getPassThru();
31955 SDLoc dl(Op);
31956
31957 // Handle AVX masked loads which don't support passthru other than 0.
31958 if (MaskVT.getVectorElementType() != MVT::i1) {
31959 // We also allow undef in the isel pattern.
31960 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31961 return Op;
31962
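// The AVX VMASKMOV family zeroes the disabled lanes, so a non-trivial
// passthru is emulated here: load with a zero passthru, then blend the
// requested passthru back into the masked-off lanes below.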
31963 SDValue NewLoad = DAG.getMaskedLoad(
31964 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31965 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31966 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31967 N->isExpandingLoad());
31968 // Emit a blend.
31969 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31970 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31971 }
31972
31973 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31974 "Expanding masked load is supported on AVX-512 target only!");
31975
31976 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31977 "Expanding masked load is supported for 32 and 64-bit types only!");
31978
31979 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31980 "Cannot lower masked load op.");
31981
31982 assert((ScalarVT.getSizeInBits() >= 32 ||
31983 (Subtarget.hasBWI() &&
31984 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31985 "Unsupported masked load op.");
31986
31987 // This operation is legal for targets with VLX, but without
31988 // VLX the vector should be widened to 512 bits.
31989 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31990 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31991 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31992
31993 // Mask element has to be i1.
31994 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31995 "Unexpected mask type");
31996
31997 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31998
31999 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32000 SDValue NewLoad = DAG.getMaskedLoad(
32001 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32002 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32003 N->getExtensionType(), N->isExpandingLoad());
32004
32005 SDValue Extract =
32006 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32007 DAG.getIntPtrConstant(0, dl));
32008 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32009 return DAG.getMergeValues(RetOps, dl);
32010}
32011
32012static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32013 SelectionDAG &DAG) {
32014 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32015 SDValue DataToStore = N->getValue();
32016 MVT VT = DataToStore.getSimpleValueType();
32017 MVT ScalarVT = VT.getScalarType();
32018 SDValue Mask = N->getMask();
32019 SDLoc dl(Op);
32020
32021 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32022 "Compressing masked store is supported on AVX-512 targets only!");
32023
32024 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32025 "Compressing masked store is supported for 32 and 64-bit types only!");
32026
32027 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32028 "Cannot lower masked store op.");
32029
32030 assert((ScalarVT.getSizeInBits() >= 32 ||
32031 (Subtarget.hasBWI() &&
32032 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32033 "Unsupported masked store op.");
32034
32035 // This operation is legal for targets with VLX, but without
32036 // VLX the vector should be widened to 512 bits.
32037 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32038 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32039
32040 // Mask element has to be i1.
32041 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32042 "Unexpected mask type");
32043
32044 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32045
32046 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32047 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32048 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32049 N->getOffset(), Mask, N->getMemoryVT(),
32050 N->getMemOperand(), N->getAddressingMode(),
32051 N->isTruncatingStore(), N->isCompressingStore());
32052}
32053
32054static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32055 SelectionDAG &DAG) {
32056 assert(Subtarget.hasAVX2() &&
32057 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32058
32059 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32060 SDLoc dl(Op);
32061 MVT VT = Op.getSimpleValueType();
32062 SDValue Index = N->getIndex();
32063 SDValue Mask = N->getMask();
32064 SDValue PassThru = N->getPassThru();
32065 MVT IndexVT = Index.getSimpleValueType();
32066
32067 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32068
32069 // If the index is v2i32, we're being called by type legalization.
32070 if (IndexVT == MVT::v2i32)
32071 return SDValue();
32072
32073 // If we don't have VLX and neither the passthru nor the index is 512 bits,
32074 // we need to widen until one is.
32075 MVT OrigVT = VT;
32076 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32077 !IndexVT.is512BitVector()) {
32078 // Determine how much we need to widen by to get a 512-bit type.
32079 unsigned Factor = std::min(512/VT.getSizeInBits(),
32080 512/IndexVT.getSizeInBits());
32081
32082 unsigned NumElts = VT.getVectorNumElements() * Factor;
32083
32084 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32085 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32086 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32087
32088 PassThru = ExtendToType(PassThru, VT, DAG);
32089 Index = ExtendToType(Index, IndexVT, DAG);
32090 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32091 }
32092
32093 // Break dependency on the data register.
32094 if (PassThru.isUndef())
32095 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32096
32097 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32098 N->getScale() };
32099 SDValue NewGather = DAG.getMemIntrinsicNode(
32100 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32101 N->getMemOperand());
32102 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32103 NewGather, DAG.getIntPtrConstant(0, dl));
32104 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32105}
32106
32107 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32108 SDLoc dl(Op);
32109 SDValue Src = Op.getOperand(0);
32110 MVT DstVT = Op.getSimpleValueType();
32111
32112 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32113 unsigned SrcAS = N->getSrcAddressSpace();
32114
32115 assert(SrcAS != N->getDestAddressSpace() &&
32116 "addrspacecast must be between different address spaces");
32117
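// X86AS::PTR32_SPTR/PTR32_UPTR/PTR64 model the MSVC __ptr32 __sptr/__uptr
// and __ptr64 mixed-pointer extensions: a __uptr 32-bit pointer zero-extends
// to 64 bits, the signed 32-bit flavour sign-extends, and 64->32 truncates.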
32118 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32119 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32120 } else if (DstVT == MVT::i64) {
32121 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32122 } else if (DstVT == MVT::i32) {
32123 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32124 } else {
32125 report_fatal_error("Bad address space in addrspacecast");
32126 }
32127 return Op;
32128}
32129
32130SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32131 SelectionDAG &DAG) const {
32132 // TODO: Eventually, the lowering of these nodes should be informed by or
32133 // deferred to the GC strategy for the function in which they appear. For
32134 // now, however, they must be lowered to something. Since they are logically
32135 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32136 // require special handling for these nodes), lower them as literal NOOPs for
32137 // the time being.
32138 SmallVector<SDValue, 2> Ops;
32139 Ops.push_back(Op.getOperand(0));
32140 if (Op->getGluedNode())
32141 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32142
32143 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32144 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32145}
32146
32147// Custom split CVTPS2PH with wide types.
32148 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32149 SDLoc dl(Op);
32150 EVT VT = Op.getValueType();
32151 SDValue Lo, Hi;
32152 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32153 EVT LoVT, HiVT;
32154 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32155 SDValue RC = Op.getOperand(1);
32156 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32157 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32158 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32159}
32160
32161 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32162 SelectionDAG &DAG) {
32163 unsigned IsData = Op.getConstantOperandVal(4);
32164
32165 // We don't support non-data prefetch without PREFETCHI.
32166 // Just preserve the chain.
32167 if (!IsData && !Subtarget.hasPREFETCHI())
32168 return Op.getOperand(0);
32169
32170 return Op;
32171}
32172
32173 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32174 unsigned OpNo) {
32175 const APInt Operand(32, OpNo);
32176 std::string OpNoStr = llvm::toString(Operand, 10, false);
32177 std::string Str(" $");
32178
32179 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32180 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32181
32182 auto I = StringRef::npos;
32183 for (auto &AsmStr : AsmStrs) {
32184 // Match the OpNo string exactly, so that a sub-string does not match,
32185 // e.g. "$12" contains "$1" but is not operand 1.
32186 if (AsmStr.ends_with(OpNoStr1))
32187 I = AsmStr.size() - OpNoStr1.size();
32188
32189 // Get the index of operand in AsmStr.
32190 if (I == StringRef::npos)
32191 I = AsmStr.find(OpNoStr1 + ",");
32192 if (I == StringRef::npos)
32193 I = AsmStr.find(OpNoStr2);
32194
32195 if (I == StringRef::npos)
32196 continue;
32197
32198 assert(I > 0 && "Unexpected inline asm string!");
32199 // Remove the operand string and label (if it exists).
32200 // For example:
32201 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32202 // ==>
32203 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32204 // ==>
32205 // "call dword ptr "
32206 auto TmpStr = AsmStr.substr(0, I);
32207 I = TmpStr.rfind(':');
32208 if (I != StringRef::npos)
32209 TmpStr = TmpStr.substr(I + 1);
32210 return TmpStr.take_while(llvm::isAlpha);
32211 }
32212
32213 return StringRef();
32214}
32215
32216 bool X86TargetLowering::isInlineAsmTargetBranch(
32217 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32218 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32219 // changed from indirect TargetLowering::C_Memory to direct
32220 // TargetLowering::C_Address.
32221 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32222 // location.
32223 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32224 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32225}
32226
32227/// Provide custom lowering hooks for some operations.
32228 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32229 switch (Op.getOpcode()) {
32230 // clang-format off
32231 default: llvm_unreachable("Should not custom lower this!");
32232 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32233 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
32234 return LowerCMP_SWAP(Op, Subtarget, DAG);
32235 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
32236 case ISD::ATOMIC_LOAD_ADD:
32237 case ISD::ATOMIC_LOAD_SUB:
32238 case ISD::ATOMIC_LOAD_OR:
32239 case ISD::ATOMIC_LOAD_XOR:
32240 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
32241 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32242 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32243 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32244 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32245 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32246 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32247 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32248 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32249 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32250 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32251 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32252 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32253 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32254 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32255 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32256 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32257 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32258 case ISD::SHL_PARTS:
32259 case ISD::SRA_PARTS:
32260 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32261 case ISD::FSHL:
32262 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32263 case ISD::STRICT_SINT_TO_FP:
32264 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
32265 case ISD::STRICT_UINT_TO_FP:
32266 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
32267 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32268 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32269 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32270 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32271 case ISD::ZERO_EXTEND_VECTOR_INREG:
32272 case ISD::SIGN_EXTEND_VECTOR_INREG:
32273 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32274 case ISD::FP_TO_SINT:
32275 case ISD::STRICT_FP_TO_SINT:
32276 case ISD::FP_TO_UINT:
32277 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
32278 case ISD::FP_TO_SINT_SAT:
32279 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
32280 case ISD::FP_EXTEND:
32281 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32282 case ISD::FP_ROUND:
32283 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32284 case ISD::FP16_TO_FP:
32285 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32286 case ISD::FP_TO_FP16:
32287 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32288 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32289 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32290 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32291 case ISD::FADD:
32292 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32293 case ISD::FROUND: return LowerFROUND(Op, DAG);
32294 case ISD::FABS:
32295 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32296 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32297 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32298 case ISD::LRINT:
32299 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32300 case ISD::SETCC:
32301 case ISD::STRICT_FSETCC:
32302 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32303 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32304 case ISD::SELECT: return LowerSELECT(Op, DAG);
32305 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32306 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32307 case ISD::VASTART: return LowerVASTART(Op, DAG);
32308 case ISD::VAARG: return LowerVAARG(Op, DAG);
32309 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32310 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32311 case ISD::INTRINSIC_VOID:
32312 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32313 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32314 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32315 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32316 case ISD::FRAME_TO_ARGS_OFFSET:
32317 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32318 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32319 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32320 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32321 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32322 case ISD::EH_SJLJ_SETUP_DISPATCH:
32323 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32324 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
32325 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
32326 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
32327 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32328 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32329 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32330 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32331 case ISD::CTLZ:
32332 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32333 case ISD::CTTZ:
32334 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32335 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32336 case ISD::MULHS:
32337 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32338 case ISD::ROTL:
32339 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32340 case ISD::SRA:
32341 case ISD::SRL:
32342 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32343 case ISD::SADDO:
32344 case ISD::UADDO:
32345 case ISD::SSUBO:
32346 case ISD::USUBO: return LowerXALUO(Op, DAG);
32347 case ISD::SMULO:
32348 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32349 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32350 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32351 case ISD::SADDO_CARRY:
32352 case ISD::SSUBO_CARRY:
32353 case ISD::UADDO_CARRY:
32354 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32355 case ISD::ADD:
32356 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32357 case ISD::UADDSAT:
32358 case ISD::SADDSAT:
32359 case ISD::USUBSAT:
32360 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32361 case ISD::SMAX:
32362 case ISD::SMIN:
32363 case ISD::UMAX:
32364 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32365 case ISD::FMINIMUM:
32366 case ISD::FMAXIMUM:
32367 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32368 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32369 case ISD::ABDS:
32370 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32371 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32372 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32373 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32374 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32375 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32376 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32377 case ISD::GC_TRANSITION_START:
32378 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
32379 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32380 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32381 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32382 // clang-format on
32383 }
32384}
32385
32386/// Replace a node with an illegal result type with a new node built out of
32387/// custom code.
32388 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32389 SmallVectorImpl<SDValue> &Results,
32390 SelectionDAG &DAG) const {
32391 SDLoc dl(N);
32392 switch (N->getOpcode()) {
32393 default:
32394#ifndef NDEBUG
32395 dbgs() << "ReplaceNodeResults: ";
32396 N->dump(&DAG);
32397#endif
32398 llvm_unreachable("Do not know how to custom type legalize this operation!");
32399 case X86ISD::CVTPH2PS: {
32400 EVT VT = N->getValueType(0);
32401 SDValue Lo, Hi;
32402 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32403 EVT LoVT, HiVT;
32404 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32405 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32406 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32407 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32408 Results.push_back(Res);
32409 return;
32410 }
32411 case X86ISD::STRICT_CVTPH2PS: {
32412 EVT VT = N->getValueType(0);
32413 SDValue Lo, Hi;
32414 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32415 EVT LoVT, HiVT;
32416 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32417 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32418 {N->getOperand(0), Lo});
32419 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32420 {N->getOperand(0), Hi});
32421 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32422 Lo.getValue(1), Hi.getValue(1));
32423 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32424 Results.push_back(Res);
32425 Results.push_back(Chain);
32426 return;
32427 }
32428 case X86ISD::CVTPS2PH:
32429 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32430 return;
32431 case ISD::CTPOP: {
32432 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32433 // If we have at most 32 active bits, then perform as i32 CTPOP.
32434 // TODO: Perform this in generic legalizer?
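// (LZ + TZ) >= 32 means every set bit lies in a 32-bit window starting at
// bit TZ, so shifting right by TZ and truncating to i32 keeps the same
// population count.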
32435 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32436 unsigned LZ = Known.countMinLeadingZeros();
32437 unsigned TZ = Known.countMinTrailingZeros();
32438 if ((LZ + TZ) >= 32) {
32439 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32440 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32441 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32442 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32443 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32444 Results.push_back(Op);
32445 return;
32446 }
32447 // Use a v2i64 if possible.
32448 bool NoImplicitFloatOps =
32449 DAG.getMachineFunction().getFunction().hasFnAttribute(
32450 Attribute::NoImplicitFloat);
32451 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32452 SDValue Wide =
32453 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32454 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32455 // Bit count should fit in 32-bits, extract it as that and then zero
32456 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32457 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32458 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32459 DAG.getIntPtrConstant(0, dl));
32460 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32461 Results.push_back(Wide);
32462 }
32463 return;
32464 }
32465 case ISD::MUL: {
32466 EVT VT = N->getValueType(0);
32467 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32468 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32469 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32470 // elements are needed.
32471 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32472 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32473 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32474 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32475 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32476 unsigned NumConcats = 16 / VT.getVectorNumElements();
32477 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32478 ConcatOps[0] = Res;
32479 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32480 Results.push_back(Res);
32481 return;
32482 }
32483 case ISD::SMULO:
32484 case ISD::UMULO: {
32485 EVT VT = N->getValueType(0);
32486 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32487 VT == MVT::v2i32 && "Unexpected VT!");
32488 bool IsSigned = N->getOpcode() == ISD::SMULO;
32489 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32490 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32491 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32492 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32493 // Extract the high 32 bits from each result using PSHUFD.
32494 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
32495 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32496 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32497 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32498 DAG.getIntPtrConstant(0, dl));
32499
32500 // Truncate the low bits of the result. This will become PSHUFD.
32501 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32502
32503 SDValue HiCmp;
32504 if (IsSigned) {
32505 // SMULO overflows if the high bits don't match the sign of the low.
32506 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32507 } else {
32508 // UMULO overflows if the high bits are non-zero.
32509 HiCmp = DAG.getConstant(0, dl, VT);
32510 }
32511 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32512
32513 // Widen the result by padding with undef.
32514 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32515 DAG.getUNDEF(VT));
32516 Results.push_back(Res);
32517 Results.push_back(Ovf);
32518 return;
32519 }
32520 case X86ISD::VPMADDWD: {
32521 // Legalize types for X86ISD::VPMADDWD by widening.
32522 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32523
32524 EVT VT = N->getValueType(0);
32525 EVT InVT = N->getOperand(0).getValueType();
32526 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32527 "Expected a VT that divides into 128 bits.");
32528 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32529 "Unexpected type action!");
32530 unsigned NumConcat = 128 / InVT.getSizeInBits();
32531
32532 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32533 InVT.getVectorElementType(),
32534 NumConcat * InVT.getVectorNumElements());
32535 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32536 VT.getVectorElementType(),
32537 NumConcat * VT.getVectorNumElements());
32538
32539 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32540 Ops[0] = N->getOperand(0);
32541 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32542 Ops[0] = N->getOperand(1);
32543 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32544
32545 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32546 Results.push_back(Res);
32547 return;
32548 }
32549 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32550 case X86ISD::FMINC:
32551 case X86ISD::FMIN:
32552 case X86ISD::FMAXC:
32553 case X86ISD::FMAX: {
32554 EVT VT = N->getValueType(0);
32555 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32556 SDValue UNDEF = DAG.getUNDEF(VT);
32557 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32558 N->getOperand(0), UNDEF);
32559 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32560 N->getOperand(1), UNDEF);
32561 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32562 return;
32563 }
32564 case ISD::SDIV:
32565 case ISD::UDIV:
32566 case ISD::SREM:
32567 case ISD::UREM: {
32568 EVT VT = N->getValueType(0);
32569 if (VT.isVector()) {
32570 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32571 "Unexpected type action!");
32572 // If this RHS is a constant splat vector we can widen this and let
32573 // division/remainder by constant optimize it.
32574 // TODO: Can we do something for non-splat?
32575 APInt SplatVal;
32576 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32577 unsigned NumConcats = 128 / VT.getSizeInBits();
32578 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32579 Ops0[0] = N->getOperand(0);
32580 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32581 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32582 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32583 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32584 Results.push_back(Res);
32585 }
32586 return;
32587 }
32588
32589 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32590 Results.push_back(V);
32591 return;
32592 }
32593 case ISD::TRUNCATE: {
32594 MVT VT = N->getSimpleValueType(0);
32595 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32596 return;
32597
32598 // The generic legalizer will try to widen the input type to the same
32599 // number of elements as the widened result type. But this isn't always
32600 // the best thing so do some custom legalization to avoid some cases.
32601 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32602 SDValue In = N->getOperand(0);
32603 EVT InVT = In.getValueType();
32604 EVT InEltVT = InVT.getVectorElementType();
32605 EVT EltVT = VT.getVectorElementType();
32606 unsigned MinElts = VT.getVectorNumElements();
32607 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32608 unsigned InBits = InVT.getSizeInBits();
32609
32610 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32611 unsigned PackOpcode;
32612 if (SDValue Src =
32613 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32614 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32615 dl, DAG, Subtarget)) {
32616 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32617 Results.push_back(Res);
32618 return;
32619 }
32620 }
32621
32622 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32623 // 128 bit and smaller inputs should avoid truncate altogether and
32624 // use a shuffle.
32625 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32626 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32627 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32628 for (unsigned I = 0; I < MinElts; ++I)
32629 TruncMask[I] = Scale * I;
32630 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32631 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32632 "Illegal vector type in truncation");
32633 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32634 Results.push_back(
32635 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32636 return;
32637 }
32638 }
32639
32640 // With AVX512 there are some cases that can use a target specific
32641 // truncate node to go from 256/512 to less than 128 with zeros in the
32642 // upper elements of the 128 bit result.
32643 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32644 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
32645 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32646 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32647 return;
32648 }
32649 // There's one case we can widen to 512 bits and use VTRUNC.
32650 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32651 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32652 DAG.getUNDEF(MVT::v4i64));
32653 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32654 return;
32655 }
32656 }
32657 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32658 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32659 isTypeLegal(MVT::v4i64)) {
32660 // Input needs to be split and output needs to widened. Let's use two
32661 // VTRUNCs, and shuffle their results together into the wider type.
32662 SDValue Lo, Hi;
32663 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32664
32665 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32666 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32667 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32668 { 0, 1, 2, 3, 16, 17, 18, 19,
32669 -1, -1, -1, -1, -1, -1, -1, -1 });
32670 Results.push_back(Res);
32671 return;
32672 }
32673
32674 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32675 // this via type legalization.
32676 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32677 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32678 (!Subtarget.hasSSSE3() ||
32679 (!isTypeLegal(InVT) &&
32680 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32681 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32682 InEltVT.getSizeInBits() * WidenNumElts);
32683 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32684 return;
32685 }
32686
32687 return;
32688 }
32689 case ISD::ANY_EXTEND:
32690 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32691 // It's intended to custom handle the input type.
32692 assert(N->getValueType(0) == MVT::v8i8 &&
32693 "Do not know how to legalize this Node");
32694 return;
32695 case ISD::SIGN_EXTEND:
32696 case ISD::ZERO_EXTEND: {
32697 EVT VT = N->getValueType(0);
32698 SDValue In = N->getOperand(0);
32699 EVT InVT = In.getValueType();
32700 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32701 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32702 assert(getTypeAction(*DAG.getContext(), InVT) == TypePromoteInteger &&
32703 "Unexpected type action!");
32704 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32705 // Custom split this so we can extend i8/i16->i32 invec. This is better
32706 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
32707 // then an extend from i32 to i64 using pcmpgt. By custom splitting
32708 // we allow the sra from the extend to i32 to be shared by the split.
32709 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32710
32711 // Fill a vector with sign bits for each element.
32712 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32713 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32714
32715 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32716 // to v2i64.
32717 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32718 {0, 4, 1, 5});
32719 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32720 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32721 {2, 6, 3, 7});
32722 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32723
32724 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32725 Results.push_back(Res);
32726 return;
32727 }
32728
32729 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32730 if (!InVT.is128BitVector()) {
32731 // Not a 128 bit vector, but maybe type legalization will promote
32732 // it to 128 bits.
32733 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32734 return;
32735 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32736 if (!InVT.is128BitVector())
32737 return;
32738
32739 // Promote the input to 128 bits. Type legalization will turn this into
32740 // zext_inreg/sext_inreg.
32741 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32742 }
32743
32744 // Perform custom splitting instead of the two stage extend we would get
32745 // by default.
32746 EVT LoVT, HiVT;
32747 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32748 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32749
32750 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32751
32752 // We need to shift the input over by half the number of elements.
32753 unsigned NumElts = InVT.getVectorNumElements();
32754 unsigned HalfNumElts = NumElts / 2;
32755 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32756 for (unsigned i = 0; i != HalfNumElts; ++i)
32757 ShufMask[i] = i + HalfNumElts;
32758
32759 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32760 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32761
32762 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32763 Results.push_back(Res);
32764 }
32765 return;
32766 }
32767 case ISD::FP_TO_SINT:
32768 case ISD::STRICT_FP_TO_SINT:
32769 case ISD::FP_TO_UINT:
32770 case ISD::STRICT_FP_TO_UINT: {
32771 bool IsStrict = N->isStrictFPOpcode();
32772 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32773 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32774 EVT VT = N->getValueType(0);
32775 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32776 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32777 EVT SrcVT = Src.getValueType();
32778
32779 SDValue Res;
32780 if (isSoftF16(SrcVT, Subtarget)) {
32781 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32782 if (IsStrict) {
32783 Res =
32784 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32785 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32786 {NVT, MVT::Other}, {Chain, Src})});
32787 Chain = Res.getValue(1);
32788 } else {
32789 Res = DAG.getNode(N->getOpcode(), dl, VT,
32790 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32791 }
32792 Results.push_back(Res);
32793 if (IsStrict)
32794 Results.push_back(Chain);
32795
32796 return;
32797 }
32798
32799 if (VT.isVector() && Subtarget.hasFP16() &&
32800 SrcVT.getVectorElementType() == MVT::f16) {
32801 EVT EleVT = VT.getVectorElementType();
32802 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32803
32804 if (SrcVT != MVT::v8f16) {
32805 SDValue Tmp =
32806 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32807 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32808 Ops[0] = Src;
32809 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32810 }
32811
32812 if (IsStrict) {
32813 unsigned Opc =
32814 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32815 Res =
32816 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32817 Chain = Res.getValue(1);
32818 } else {
32819 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32820 Res = DAG.getNode(Opc, dl, ResVT, Src);
32821 }
32822
32823 // TODO: Need to add exception check code for strict FP.
32824 if (EleVT.getSizeInBits() < 16) {
32825 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32826 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32827
32828 // Now widen to 128 bits.
32829 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32830 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32831 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32832 ConcatOps[0] = Res;
32833 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32834 }
32835
32836 Results.push_back(Res);
32837 if (IsStrict)
32838 Results.push_back(Chain);
32839
32840 return;
32841 }
32842
32843 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32844 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32845 "Unexpected type action!");
32846
32847 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32848 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32849 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32850 VT.getVectorNumElements());
32851 SDValue Res;
32852 SDValue Chain;
32853 if (IsStrict) {
32854 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32855 {N->getOperand(0), Src});
32856 Chain = Res.getValue(1);
32857 } else
32858 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32859
32860 // Preserve what we know about the size of the original result. If the
32861 // result is v2i32, we have to manually widen the assert.
32862 if (PromoteVT == MVT::v2i32)
32863 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32864 DAG.getUNDEF(MVT::v2i32));
32865
32866 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32867 Res.getValueType(), Res,
32868 DAG.getValueType(VT.getVectorElementType()));
32869
32870 if (PromoteVT == MVT::v2i32)
32871 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32872 DAG.getIntPtrConstant(0, dl));
32873
32874 // Truncate back to the original width.
32875 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32876
32877 // Now widen to 128 bits.
32878 unsigned NumConcats = 128 / VT.getSizeInBits();
32879 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32880 VT.getVectorNumElements() * NumConcats);
32881 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32882 ConcatOps[0] = Res;
32883 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32884 Results.push_back(Res);
32885 if (IsStrict)
32886 Results.push_back(Chain);
32887 return;
32888 }
32889
32890
32891 if (VT == MVT::v2i32) {
32892 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32893 "Strict unsigned conversion requires AVX512");
32894 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32895 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32896 "Unexpected type action!");
32897 if (Src.getValueType() == MVT::v2f64) {
32898 if (!IsSigned && !Subtarget.hasAVX512()) {
32899 SDValue Res =
32900 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32901 Results.push_back(Res);
32902 return;
32903 }
32904
32905 unsigned Opc;
32906 if (IsStrict)
32907 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32908 else
32909 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32910
32911 // If we have VLX we can emit a target specific FP_TO_UINT node.
32912 if (!IsSigned && !Subtarget.hasVLX()) {
32913 // Otherwise we can defer to the generic legalizer which will widen
32914 // the input as well. This will be further widened during op
32915 // legalization to v8i32<-v8f64.
32916 // For strict nodes we'll need to widen ourselves.
32917 // FIXME: Fix the type legalizer to safely widen strict nodes?
32918 if (!IsStrict)
32919 return;
32920 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32921 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32922 Opc = N->getOpcode();
32923 }
32924 SDValue Res;
32925 SDValue Chain;
32926 if (IsStrict) {
32927 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32928 {N->getOperand(0), Src});
32929 Chain = Res.getValue(1);
32930 } else {
32931 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32932 }
32933 Results.push_back(Res);
32934 if (IsStrict)
32935 Results.push_back(Chain);
32936 return;
32937 }
32938
32939 // Custom widen strict v2f32->v2i32 by padding with zeros.
32940 // FIXME: Should generic type legalizer do this?
32941 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32942 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32943 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32944 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32945 {N->getOperand(0), Src});
32946 Results.push_back(Res);
32947 Results.push_back(Res.getValue(1));
32948 return;
32949 }
32950
32951 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32952 // so early out here.
32953 return;
32954 }
32955
32956 assert(!VT.isVector() && "Vectors should have been handled above!");
32957
32958 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32959 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32960 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32961 assert(!Subtarget.is64Bit() && "i64 should be legal");
32962 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32963 // If we use a 128-bit result we might need to use a target specific node.
32964 unsigned SrcElts =
32965 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32966 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32967 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32968 unsigned Opc = N->getOpcode();
32969 if (NumElts != SrcElts) {
32970 if (IsStrict)
32971 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32972 else
32973 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32974 }
32975
32976 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32977 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32978 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32979 ZeroIdx);
32980 SDValue Chain;
32981 if (IsStrict) {
32982 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32983 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32984 Chain = Res.getValue(1);
32985 } else
32986 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32987 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32988 Results.push_back(Res);
32989 if (IsStrict)
32990 Results.push_back(Chain);
32991 return;
32992 }
32993
32994 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32995 SDValue Chain;
32996 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32997 Results.push_back(V);
32998 if (IsStrict)
32999 Results.push_back(Chain);
33000 return;
33001 }
33002
33003 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
33004 Results.push_back(V);
33005 if (IsStrict)
33006 Results.push_back(Chain);
33007 }
33008 return;
33009 }
33010 case ISD::LRINT:
33011 case ISD::LLRINT: {
33012 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
33013 Results.push_back(V);
33014 return;
33015 }
33016
33017 case ISD::SINT_TO_FP:
33018 case ISD::STRICT_SINT_TO_FP:
33019 case ISD::UINT_TO_FP:
33020 case ISD::STRICT_UINT_TO_FP: {
33021 bool IsStrict = N->isStrictFPOpcode();
33022 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33023 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33024 EVT VT = N->getValueType(0);
33025 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33026 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
33027 Subtarget.hasVLX()) {
33028 if (Src.getValueType().getVectorElementType() == MVT::i16)
33029 return;
33030
33031 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
33032 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33033 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
33034 : DAG.getUNDEF(MVT::v2i32));
33035 if (IsStrict) {
33036 unsigned Opc =
33037 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
33038 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
33039 {N->getOperand(0), Src});
33040 Results.push_back(Res);
33041 Results.push_back(Res.getValue(1));
33042 } else {
33043 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33044 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
33045 }
33046 return;
33047 }
33048 if (VT != MVT::v2f32)
33049 return;
33050 EVT SrcVT = Src.getValueType();
33051 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
33052 if (IsStrict) {
33053 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
33054 : X86ISD::STRICT_CVTUI2P;
33055 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33056 {N->getOperand(0), Src});
33057 Results.push_back(Res);
33058 Results.push_back(Res.getValue(1));
33059 } else {
33060 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33061 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33062 }
33063 return;
33064 }
33065 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33066 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
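// Lanes with the sign bit set don't fit a signed conversion, so they are
// halved with round-to-odd ((x >> 1) | (x & 1)), converted as signed,
// doubled again with an FADD, and then selected per lane against the plain
// signed conversion used for the small values.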
33067 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33068 SDValue One = DAG.getConstant(1, dl, SrcVT);
33069 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33070 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33071 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33072 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33073 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33074 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33075 for (int i = 0; i != 2; ++i) {
33076 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33077 SignSrc, DAG.getIntPtrConstant(i, dl));
33078 if (IsStrict)
33079 SignCvts[i] =
33080 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33081 {N->getOperand(0), Elt});
33082 else
33083 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33084 };
33085 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33086 SDValue Slow, Chain;
33087 if (IsStrict) {
33088 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33089 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33090 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33091 {Chain, SignCvt, SignCvt});
33092 Chain = Slow.getValue(1);
33093 } else {
33094 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33095 }
33096 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33097 IsNeg =
33098 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33099 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33100 Results.push_back(Cvt);
33101 if (IsStrict)
33102 Results.push_back(Chain);
33103 return;
33104 }
33105
33106 if (SrcVT != MVT::v2i32)
33107 return;
33108
33109 if (IsSigned || Subtarget.hasAVX512()) {
33110 if (!IsStrict)
33111 return;
33112
33113 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33114 // FIXME: Should generic type legalizer do this?
33115 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33116 DAG.getConstant(0, dl, MVT::v2i32));
33117 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33118 {N->getOperand(0), Src});
33119 Results.push_back(Res);
33120 Results.push_back(Res.getValue(1));
33121 return;
33122 }
33123
33124 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
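    // NOTE (annotation for this listing, not in the upstream source): the
    // code below is the standard 2^52 bias trick for u32 -> f64. The double
    // constant 0x4330000000000000 is exactly 2^52, and OR-ing a 32-bit value
    // into its low mantissa bits produces the double 2^52 + value with no
    // rounding. Subtracting VBias (2^52) then yields the value exactly in
    // f64, and the final round narrows it to f32. Worked example, value = 7:
    //   bit_cast<double>(0x4330000000000007) == 4503599627370503.0
    //   4503599627370503.0 - 4503599627370496.0 == 7.0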
33125 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33126 SDValue VBias = DAG.getConstantFP(
33127 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
33128 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33129 DAG.getBitcast(MVT::v2i64, VBias));
33130 Or = DAG.getBitcast(MVT::v2f64, Or);
33131 if (IsStrict) {
33132 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33133 {N->getOperand(0), Or, VBias});
33134      SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33135                                {MVT::v4f32, MVT::Other},
33136 {Sub.getValue(1), Sub});
33137 Results.push_back(Res);
33138 Results.push_back(Res.getValue(1));
33139 } else {
33140 // TODO: Are there any fast-math-flags to propagate here?
33141 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33142 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33143 }
33144 return;
33145 }
33146  case ISD::STRICT_FP_ROUND:
33147  case ISD::FP_ROUND: {
33148 bool IsStrict = N->isStrictFPOpcode();
33149 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33150 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33151 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33152 EVT SrcVT = Src.getValueType();
33153 EVT VT = N->getValueType(0);
33154 SDValue V;
33155 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33156 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33157 : DAG.getUNDEF(MVT::v2f32);
33158 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33159 }
33160 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33161 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33162 if (SrcVT.getVectorElementType() != MVT::f32)
33163 return;
33164
33165 if (IsStrict)
33166 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33167 {Chain, Src, Rnd});
33168 else
33169 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33170
33171 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33172 if (IsStrict)
33173 Results.push_back(V.getValue(1));
33174 return;
33175 }
33176 if (!isTypeLegal(Src.getValueType()))
33177 return;
33178 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33179 if (IsStrict)
33180 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33181 {Chain, Src});
33182 else
33183 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33184 Results.push_back(V);
33185 if (IsStrict)
33186 Results.push_back(V.getValue(1));
33187 return;
33188 }
33189 case ISD::FP_EXTEND:
33190 case ISD::STRICT_FP_EXTEND: {
33191 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33192 // No other ValueType for FP_EXTEND should reach this point.
33193 assert(N->getValueType(0) == MVT::v2f32 &&
33194 "Do not know how to legalize this Node");
33195 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33196 return;
33197 bool IsStrict = N->isStrictFPOpcode();
33198 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33199 if (Src.getValueType().getVectorElementType() != MVT::f16)
33200 return;
33201 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33202 : DAG.getUNDEF(MVT::v2f16);
33203 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33204 if (IsStrict)
33205 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33206 {N->getOperand(0), V});
33207 else
33208 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33209 Results.push_back(V);
33210 if (IsStrict)
33211 Results.push_back(V.getValue(1));
33212 return;
33213 }
33214  case ISD::INTRINSIC_W_CHAIN: {
33215    unsigned IntNo = N->getConstantOperandVal(1);
33216 switch (IntNo) {
33217 default : llvm_unreachable("Do not know how to custom type "
33218 "legalize this intrinsic operation!");
33219 case Intrinsic::x86_rdtsc:
33220 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33221 Results);
33222 case Intrinsic::x86_rdtscp:
33223 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33224 Results);
33225 case Intrinsic::x86_rdpmc:
33226 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33227 Results);
33228 return;
33229 case Intrinsic::x86_rdpru:
33230 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33231 Results);
33232 return;
33233 case Intrinsic::x86_xgetbv:
33234 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33235 Results);
33236 return;
33237 }
33238 }
33239 case ISD::READCYCLECOUNTER: {
33240 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33241 }
33242  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
33243    EVT T = N->getValueType(0);
33244 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33245 bool Regs64bit = T == MVT::i128;
33246 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33247 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33248 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
33249 SDValue cpInL, cpInH;
33250 std::tie(cpInL, cpInH) =
33251 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33252 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33253 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33254 cpInH =
33255 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33256 cpInH, cpInL.getValue(1));
33257 SDValue swapInL, swapInH;
33258 std::tie(swapInL, swapInH) =
33259 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33260 swapInH =
33261 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33262 swapInH, cpInH.getValue(1));
33263
33264 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33265 // until later. So we keep the RBX input in a vreg and use a custom
33266 // inserter.
33267 // Since RBX will be a reserved register the register allocator will not
33268 // make sure its value will be properly saved and restored around this
33269 // live-range.
33270 SDValue Result;
33271 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33272 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33273 if (Regs64bit) {
33274 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33275 swapInH.getValue(1)};
33276 Result =
33277 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33278 } else {
33279 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33280 swapInH.getValue(1));
33281 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33282 swapInL.getValue(1)};
33283 Result =
33284 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33285 }
33286
33287 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33288 Regs64bit ? X86::RAX : X86::EAX,
33289 HalfT, Result.getValue(1));
33290 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33291 Regs64bit ? X86::RDX : X86::EDX,
33292 HalfT, cpOutL.getValue(2));
33293 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33294
33295 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33296 MVT::i32, cpOutH.getValue(2));
33297 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33298 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33299
33300 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33301 Results.push_back(Success);
33302 Results.push_back(EFLAGS.getValue(1));
33303 return;
33304 }
33305 case ISD::ATOMIC_LOAD: {
33306 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33307 bool NoImplicitFloatOps =
33308        DAG.getMachineFunction().getFunction().hasFnAttribute(
33309            Attribute::NoImplicitFloat);
33310 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33311 auto *Node = cast<AtomicSDNode>(N);
33312 if (Subtarget.hasSSE1()) {
33313 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33314 // Then extract the lower 64-bits.
33315 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33316 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33317 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33318 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33319 MVT::i64, Node->getMemOperand());
33320 if (Subtarget.hasSSE2()) {
33321 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33322 DAG.getIntPtrConstant(0, dl));
33323 Results.push_back(Res);
33324 Results.push_back(Ld.getValue(1));
33325 return;
33326 }
33327 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33328 // then casts to i64. This avoids a 128-bit stack temporary being
33329 // created by type legalization if we were to cast v4f32->v2i64.
33330 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33331 DAG.getIntPtrConstant(0, dl));
33332 Res = DAG.getBitcast(MVT::i64, Res);
33333 Results.push_back(Res);
33334 Results.push_back(Ld.getValue(1));
33335 return;
33336 }
33337 if (Subtarget.hasX87()) {
33338 // First load this into an 80-bit X87 register. This will put the whole
33339 // integer into the significand.
33340 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33341 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33342        SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33343                                                 dl, Tys, Ops, MVT::i64,
33344 Node->getMemOperand());
33345 SDValue Chain = Result.getValue(1);
33346
33347 // Now store the X87 register to a stack temporary and convert to i64.
33348 // This store is not atomic and doesn't need to be.
33349 // FIXME: We don't need a stack temporary if the result of the load
33350 // is already being stored. We could just directly store there.
33351 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33352 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33353 MachinePointerInfo MPI =
33354            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33355        SDValue StoreOps[] = { Chain, Result, StackPtr };
33356 Chain = DAG.getMemIntrinsicNode(
33357 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33358 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33359
33360 // Finally load the value back from the stack temporary and return it.
33361 // This load is not atomic and doesn't need to be.
33362 // This load will be further type legalized.
33363 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33364 Results.push_back(Result);
33365 Results.push_back(Result.getValue(1));
33366 return;
33367 }
33368 }
33369 // TODO: Use MOVLPS when SSE1 is available?
33370 // Delegate to generic TypeLegalization. Situations we can really handle
33371 // should have already been dealt with by AtomicExpandPass.cpp.
33372 break;
33373 }
33374 case ISD::ATOMIC_SWAP:
33385 // Delegate to generic TypeLegalization. Situations we can really handle
33386 // should have already been dealt with by AtomicExpandPass.cpp.
33387 break;
33388
33389 case ISD::BITCAST: {
33390 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33391 EVT DstVT = N->getValueType(0);
33392 EVT SrcVT = N->getOperand(0).getValueType();
33393
33394 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33395 // we can split using the k-register rather than memory.
33396 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33397 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33398 SDValue Lo, Hi;
33399 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33400 Lo = DAG.getBitcast(MVT::i32, Lo);
33401 Hi = DAG.getBitcast(MVT::i32, Hi);
33402 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33403 Results.push_back(Res);
33404 return;
33405 }
33406
33407 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33408 // FIXME: Use v4f32 for SSE1?
33409 assert(Subtarget.hasSSE2() && "Requires SSE2");
33410 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33411 "Unexpected type action!");
33412 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33413 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33414 N->getOperand(0));
33415 Res = DAG.getBitcast(WideVT, Res);
33416 Results.push_back(Res);
33417 return;
33418 }
33419
33420 return;
33421 }
33422 case ISD::MGATHER: {
33423 EVT VT = N->getValueType(0);
33424 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33425 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33426 auto *Gather = cast<MaskedGatherSDNode>(N);
33427 SDValue Index = Gather->getIndex();
33428 if (Index.getValueType() != MVT::v2i64)
33429 return;
33431 "Unexpected type action!");
33432 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33433 SDValue Mask = Gather->getMask();
33434 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33435 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33436 Gather->getPassThru(),
33437 DAG.getUNDEF(VT));
33438 if (!Subtarget.hasVLX()) {
33439 // We need to widen the mask, but the instruction will only use 2
33440 // of its elements. So we can use undef.
33441 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33442 DAG.getUNDEF(MVT::v2i1));
33443 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33444 }
33445 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33446 Gather->getBasePtr(), Index, Gather->getScale() };
33447 SDValue Res = DAG.getMemIntrinsicNode(
33448 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33449 Gather->getMemoryVT(), Gather->getMemOperand());
33450 Results.push_back(Res);
33451 Results.push_back(Res.getValue(1));
33452 return;
33453 }
33454 return;
33455 }
33456 case ISD::LOAD: {
33457 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33458    // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
33459 // cast since type legalization will try to use an i64 load.
33460 MVT VT = N->getSimpleValueType(0);
33461 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33463 "Unexpected type action!");
33464 if (!ISD::isNON_EXTLoad(N))
33465 return;
33466 auto *Ld = cast<LoadSDNode>(N);
33467 if (Subtarget.hasSSE2()) {
33468 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33469 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33470 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33471 Ld->getMemOperand()->getFlags());
33472 SDValue Chain = Res.getValue(1);
33473 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33474 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33475 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33476 Res = DAG.getBitcast(WideVT, Res);
33477 Results.push_back(Res);
33478 Results.push_back(Chain);
33479 return;
33480 }
33481 assert(Subtarget.hasSSE1() && "Expected SSE");
33482 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33483 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33484 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33485 MVT::i64, Ld->getMemOperand());
33486 Results.push_back(Res);
33487 Results.push_back(Res.getValue(1));
33488 return;
33489 }
33490 case ISD::ADDRSPACECAST: {
33491 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33492 Results.push_back(V);
33493 return;
33494 }
33495 case ISD::BITREVERSE: {
33496 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33497 assert(Subtarget.hasXOP() && "Expected XOP");
33498 // We can use VPPERM by copying to a vector register and back. We'll need
33499 // to move the scalar in two i32 pieces.
33500 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33501 return;
33502 }
33503  case ISD::EXTRACT_VECTOR_ELT: {
33504    // f16 = extract vXf16 %vec, i64 %idx
33505 assert(N->getSimpleValueType(0) == MVT::f16 &&
33506 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33507 assert(Subtarget.hasFP16() && "Expected FP16");
33508 SDValue VecOp = N->getOperand(0);
33509    EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33510    SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33511 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33512 N->getOperand(1));
33513 Split = DAG.getBitcast(MVT::f16, Split);
33514 Results.push_back(Split);
33515 return;
33516 }
33517 }
33518}
33519
33520const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33521 switch ((X86ISD::NodeType)Opcode) {
33522 case X86ISD::FIRST_NUMBER: break;
33523#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33524 NODE_NAME_CASE(BSF)
33525 NODE_NAME_CASE(BSR)
33526 NODE_NAME_CASE(FSHL)
33527 NODE_NAME_CASE(FSHR)
33528 NODE_NAME_CASE(FAND)
33529 NODE_NAME_CASE(FANDN)
33530 NODE_NAME_CASE(FOR)
33531 NODE_NAME_CASE(FXOR)
33532 NODE_NAME_CASE(FILD)
33533 NODE_NAME_CASE(FIST)
33534 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33535 NODE_NAME_CASE(FLD)
33536 NODE_NAME_CASE(FST)
33537 NODE_NAME_CASE(CALL)
33538 NODE_NAME_CASE(CALL_RVMARKER)
33540 NODE_NAME_CASE(CMP)
33541 NODE_NAME_CASE(FCMP)
33542 NODE_NAME_CASE(STRICT_FCMP)
33543 NODE_NAME_CASE(STRICT_FCMPS)
33545 NODE_NAME_CASE(UCOMI)
33546 NODE_NAME_CASE(CMPM)
33547 NODE_NAME_CASE(CMPMM)
33548 NODE_NAME_CASE(STRICT_CMPM)
33549 NODE_NAME_CASE(CMPMM_SAE)
33550 NODE_NAME_CASE(SETCC)
33551 NODE_NAME_CASE(SETCC_CARRY)
33552 NODE_NAME_CASE(FSETCC)
33553 NODE_NAME_CASE(FSETCCM)
33554 NODE_NAME_CASE(FSETCCM_SAE)
33555 NODE_NAME_CASE(CMOV)
33556 NODE_NAME_CASE(BRCOND)
33557 NODE_NAME_CASE(RET_GLUE)
33558 NODE_NAME_CASE(IRET)
33559 NODE_NAME_CASE(REP_STOS)
33560 NODE_NAME_CASE(REP_MOVS)
33561 NODE_NAME_CASE(GlobalBaseReg)
33563 NODE_NAME_CASE(WrapperRIP)
33564 NODE_NAME_CASE(MOVQ2DQ)
33565 NODE_NAME_CASE(MOVDQ2Q)
33566 NODE_NAME_CASE(MMX_MOVD2W)
33567 NODE_NAME_CASE(MMX_MOVW2D)
33568 NODE_NAME_CASE(PEXTRB)
33569 NODE_NAME_CASE(PEXTRW)
33570 NODE_NAME_CASE(INSERTPS)
33571 NODE_NAME_CASE(PINSRB)
33572 NODE_NAME_CASE(PINSRW)
33573 NODE_NAME_CASE(PSHUFB)
33574 NODE_NAME_CASE(ANDNP)
33575 NODE_NAME_CASE(BLENDI)
33577 NODE_NAME_CASE(HADD)
33578 NODE_NAME_CASE(HSUB)
33579 NODE_NAME_CASE(FHADD)
33580 NODE_NAME_CASE(FHSUB)
33581 NODE_NAME_CASE(CONFLICT)
33582 NODE_NAME_CASE(FMAX)
33583 NODE_NAME_CASE(FMAXS)
33584 NODE_NAME_CASE(FMAX_SAE)
33585 NODE_NAME_CASE(FMAXS_SAE)
33586 NODE_NAME_CASE(FMIN)
33587 NODE_NAME_CASE(FMINS)
33588 NODE_NAME_CASE(FMIN_SAE)
33589 NODE_NAME_CASE(FMINS_SAE)
33590 NODE_NAME_CASE(FMAXC)
33591 NODE_NAME_CASE(FMINC)
33592 NODE_NAME_CASE(FRSQRT)
33593 NODE_NAME_CASE(FRCP)
33594 NODE_NAME_CASE(EXTRQI)
33595 NODE_NAME_CASE(INSERTQI)
33596 NODE_NAME_CASE(TLSADDR)
33597 NODE_NAME_CASE(TLSBASEADDR)
33598 NODE_NAME_CASE(TLSCALL)
33599 NODE_NAME_CASE(TLSDESC)
33600 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33601 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33602 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33603 NODE_NAME_CASE(EH_RETURN)
33604 NODE_NAME_CASE(TC_RETURN)
33605 NODE_NAME_CASE(FNSTCW16m)
33606 NODE_NAME_CASE(FLDCW16m)
33607 NODE_NAME_CASE(FNSTENVm)
33608 NODE_NAME_CASE(FLDENVm)
33609 NODE_NAME_CASE(LCMPXCHG_DAG)
33610 NODE_NAME_CASE(LCMPXCHG8_DAG)
33611 NODE_NAME_CASE(LCMPXCHG16_DAG)
33612 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33613 NODE_NAME_CASE(LADD)
33614 NODE_NAME_CASE(LSUB)
33615 NODE_NAME_CASE(LOR)
33616 NODE_NAME_CASE(LXOR)
33617 NODE_NAME_CASE(LAND)
33618 NODE_NAME_CASE(LBTS)
33619 NODE_NAME_CASE(LBTC)
33620 NODE_NAME_CASE(LBTR)
33621 NODE_NAME_CASE(LBTS_RM)
33622 NODE_NAME_CASE(LBTC_RM)
33623 NODE_NAME_CASE(LBTR_RM)
33624 NODE_NAME_CASE(AADD)
33625 NODE_NAME_CASE(AOR)
33626 NODE_NAME_CASE(AXOR)
33627 NODE_NAME_CASE(AAND)
33628 NODE_NAME_CASE(VZEXT_MOVL)
33629 NODE_NAME_CASE(VZEXT_LOAD)
33630 NODE_NAME_CASE(VEXTRACT_STORE)
33631 NODE_NAME_CASE(VTRUNC)
33632 NODE_NAME_CASE(VTRUNCS)
33633 NODE_NAME_CASE(VTRUNCUS)
33634 NODE_NAME_CASE(VMTRUNC)
33635 NODE_NAME_CASE(VMTRUNCS)
33636 NODE_NAME_CASE(VMTRUNCUS)
33637 NODE_NAME_CASE(VTRUNCSTORES)
33638 NODE_NAME_CASE(VTRUNCSTOREUS)
33639 NODE_NAME_CASE(VMTRUNCSTORES)
33640 NODE_NAME_CASE(VMTRUNCSTOREUS)
33641 NODE_NAME_CASE(VFPEXT)
33642 NODE_NAME_CASE(STRICT_VFPEXT)
33643 NODE_NAME_CASE(VFPEXT_SAE)
33644 NODE_NAME_CASE(VFPEXTS)
33645 NODE_NAME_CASE(VFPEXTS_SAE)
33646 NODE_NAME_CASE(VFPROUND)
33647 NODE_NAME_CASE(STRICT_VFPROUND)
33648 NODE_NAME_CASE(VMFPROUND)
33649 NODE_NAME_CASE(VFPROUND_RND)
33650 NODE_NAME_CASE(VFPROUNDS)
33651 NODE_NAME_CASE(VFPROUNDS_RND)
33652 NODE_NAME_CASE(VSHLDQ)
33653 NODE_NAME_CASE(VSRLDQ)
33654 NODE_NAME_CASE(VSHL)
33655 NODE_NAME_CASE(VSRL)
33656 NODE_NAME_CASE(VSRA)
33657 NODE_NAME_CASE(VSHLI)
33658 NODE_NAME_CASE(VSRLI)
33659 NODE_NAME_CASE(VSRAI)
33660 NODE_NAME_CASE(VSHLV)
33661 NODE_NAME_CASE(VSRLV)
33662 NODE_NAME_CASE(VSRAV)
33663 NODE_NAME_CASE(VROTLI)
33664 NODE_NAME_CASE(VROTRI)
33665 NODE_NAME_CASE(VPPERM)
33666 NODE_NAME_CASE(CMPP)
33667 NODE_NAME_CASE(STRICT_CMPP)
33668 NODE_NAME_CASE(PCMPEQ)
33669 NODE_NAME_CASE(PCMPGT)
33670 NODE_NAME_CASE(PHMINPOS)
33671 NODE_NAME_CASE(ADD)
33672 NODE_NAME_CASE(SUB)
33673 NODE_NAME_CASE(ADC)
33674 NODE_NAME_CASE(SBB)
33675 NODE_NAME_CASE(SMUL)
33676 NODE_NAME_CASE(UMUL)
33677 NODE_NAME_CASE(OR)
33678 NODE_NAME_CASE(XOR)
33679 NODE_NAME_CASE(AND)
33680 NODE_NAME_CASE(BEXTR)
33682 NODE_NAME_CASE(BZHI)
33683 NODE_NAME_CASE(PDEP)
33684 NODE_NAME_CASE(PEXT)
33685 NODE_NAME_CASE(MUL_IMM)
33686 NODE_NAME_CASE(MOVMSK)
33687 NODE_NAME_CASE(PTEST)
33688 NODE_NAME_CASE(TESTP)
33689 NODE_NAME_CASE(KORTEST)
33690 NODE_NAME_CASE(KTEST)
33691 NODE_NAME_CASE(KADD)
33692 NODE_NAME_CASE(KSHIFTL)
33693 NODE_NAME_CASE(KSHIFTR)
33694 NODE_NAME_CASE(PACKSS)
33695 NODE_NAME_CASE(PACKUS)
33696 NODE_NAME_CASE(PALIGNR)
33697 NODE_NAME_CASE(VALIGN)
33698 NODE_NAME_CASE(VSHLD)
33699 NODE_NAME_CASE(VSHRD)
33700 NODE_NAME_CASE(VSHLDV)
33701 NODE_NAME_CASE(VSHRDV)
33702 NODE_NAME_CASE(PSHUFD)
33703 NODE_NAME_CASE(PSHUFHW)
33704 NODE_NAME_CASE(PSHUFLW)
33705 NODE_NAME_CASE(SHUFP)
33706 NODE_NAME_CASE(SHUF128)
33707 NODE_NAME_CASE(MOVLHPS)
33708 NODE_NAME_CASE(MOVHLPS)
33709 NODE_NAME_CASE(MOVDDUP)
33710 NODE_NAME_CASE(MOVSHDUP)
33711 NODE_NAME_CASE(MOVSLDUP)
33712 NODE_NAME_CASE(MOVSD)
33713 NODE_NAME_CASE(MOVSS)
33714 NODE_NAME_CASE(MOVSH)
33715 NODE_NAME_CASE(UNPCKL)
33716 NODE_NAME_CASE(UNPCKH)
33717 NODE_NAME_CASE(VBROADCAST)
33718 NODE_NAME_CASE(VBROADCAST_LOAD)
33719 NODE_NAME_CASE(VBROADCASTM)
33720 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33721 NODE_NAME_CASE(VPERMILPV)
33722 NODE_NAME_CASE(VPERMILPI)
33723 NODE_NAME_CASE(VPERM2X128)
33724 NODE_NAME_CASE(VPERMV)
33725 NODE_NAME_CASE(VPERMV3)
33726 NODE_NAME_CASE(VPERMI)
33727 NODE_NAME_CASE(VPTERNLOG)
33728 NODE_NAME_CASE(VFIXUPIMM)
33729 NODE_NAME_CASE(VFIXUPIMM_SAE)
33730 NODE_NAME_CASE(VFIXUPIMMS)
33731 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33732 NODE_NAME_CASE(VRANGE)
33733 NODE_NAME_CASE(VRANGE_SAE)
33734 NODE_NAME_CASE(VRANGES)
33735 NODE_NAME_CASE(VRANGES_SAE)
33736 NODE_NAME_CASE(PMULUDQ)
33737 NODE_NAME_CASE(PMULDQ)
33738 NODE_NAME_CASE(PSADBW)
33739 NODE_NAME_CASE(DBPSADBW)
33740 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33741 NODE_NAME_CASE(VAARG_64)
33742 NODE_NAME_CASE(VAARG_X32)
33743 NODE_NAME_CASE(DYN_ALLOCA)
33744 NODE_NAME_CASE(MFENCE)
33745 NODE_NAME_CASE(SEG_ALLOCA)
33746 NODE_NAME_CASE(PROBED_ALLOCA)
33749 NODE_NAME_CASE(RDPKRU)
33750 NODE_NAME_CASE(WRPKRU)
33751 NODE_NAME_CASE(VPMADDUBSW)
33752 NODE_NAME_CASE(VPMADDWD)
33753 NODE_NAME_CASE(VPSHA)
33754 NODE_NAME_CASE(VPSHL)
33755 NODE_NAME_CASE(VPCOM)
33756 NODE_NAME_CASE(VPCOMU)
33757 NODE_NAME_CASE(VPERMIL2)
33758  NODE_NAME_CASE(FMSUB)
33759  NODE_NAME_CASE(STRICT_FMSUB)
33760  NODE_NAME_CASE(FNMADD)
33761  NODE_NAME_CASE(STRICT_FNMADD)
33762  NODE_NAME_CASE(FNMSUB)
33763  NODE_NAME_CASE(STRICT_FNMSUB)
33764 NODE_NAME_CASE(FMADDSUB)
33765 NODE_NAME_CASE(FMSUBADD)
33766 NODE_NAME_CASE(FMADD_RND)
33767 NODE_NAME_CASE(FNMADD_RND)
33768 NODE_NAME_CASE(FMSUB_RND)
33769 NODE_NAME_CASE(FNMSUB_RND)
33770 NODE_NAME_CASE(FMADDSUB_RND)
33771 NODE_NAME_CASE(FMSUBADD_RND)
33772 NODE_NAME_CASE(VFMADDC)
33773 NODE_NAME_CASE(VFMADDC_RND)
33774 NODE_NAME_CASE(VFCMADDC)
33775 NODE_NAME_CASE(VFCMADDC_RND)
33776 NODE_NAME_CASE(VFMULC)
33777 NODE_NAME_CASE(VFMULC_RND)
33778 NODE_NAME_CASE(VFCMULC)
33779 NODE_NAME_CASE(VFCMULC_RND)
33780 NODE_NAME_CASE(VFMULCSH)
33781 NODE_NAME_CASE(VFMULCSH_RND)
33782 NODE_NAME_CASE(VFCMULCSH)
33783 NODE_NAME_CASE(VFCMULCSH_RND)
33784 NODE_NAME_CASE(VFMADDCSH)
33785 NODE_NAME_CASE(VFMADDCSH_RND)
33786 NODE_NAME_CASE(VFCMADDCSH)
33787 NODE_NAME_CASE(VFCMADDCSH_RND)
33788 NODE_NAME_CASE(VPMADD52H)
33789 NODE_NAME_CASE(VPMADD52L)
33790 NODE_NAME_CASE(VRNDSCALE)
33791 NODE_NAME_CASE(STRICT_VRNDSCALE)
33792 NODE_NAME_CASE(VRNDSCALE_SAE)
33793 NODE_NAME_CASE(VRNDSCALES)
33794 NODE_NAME_CASE(VRNDSCALES_SAE)
33795 NODE_NAME_CASE(VREDUCE)
33796 NODE_NAME_CASE(VREDUCE_SAE)
33797 NODE_NAME_CASE(VREDUCES)
33798 NODE_NAME_CASE(VREDUCES_SAE)
33799 NODE_NAME_CASE(VGETMANT)
33800 NODE_NAME_CASE(VGETMANT_SAE)
33801 NODE_NAME_CASE(VGETMANTS)
33802 NODE_NAME_CASE(VGETMANTS_SAE)
33803 NODE_NAME_CASE(PCMPESTR)
33804 NODE_NAME_CASE(PCMPISTR)
33806 NODE_NAME_CASE(COMPRESS)
33808 NODE_NAME_CASE(SELECTS)
33809 NODE_NAME_CASE(ADDSUB)
33810 NODE_NAME_CASE(RCP14)
33811 NODE_NAME_CASE(RCP14S)
33812 NODE_NAME_CASE(RCP28)
33813 NODE_NAME_CASE(RCP28_SAE)
33814 NODE_NAME_CASE(RCP28S)
33815 NODE_NAME_CASE(RCP28S_SAE)
33816 NODE_NAME_CASE(EXP2)
33817 NODE_NAME_CASE(EXP2_SAE)
33818 NODE_NAME_CASE(RSQRT14)
33819 NODE_NAME_CASE(RSQRT14S)
33820 NODE_NAME_CASE(RSQRT28)
33821 NODE_NAME_CASE(RSQRT28_SAE)
33822 NODE_NAME_CASE(RSQRT28S)
33823 NODE_NAME_CASE(RSQRT28S_SAE)
33824 NODE_NAME_CASE(FADD_RND)
33825 NODE_NAME_CASE(FADDS)
33826 NODE_NAME_CASE(FADDS_RND)
33827 NODE_NAME_CASE(FSUB_RND)
33828 NODE_NAME_CASE(FSUBS)
33829 NODE_NAME_CASE(FSUBS_RND)
33830 NODE_NAME_CASE(FMUL_RND)
33831 NODE_NAME_CASE(FMULS)
33832 NODE_NAME_CASE(FMULS_RND)
33833 NODE_NAME_CASE(FDIV_RND)
33834 NODE_NAME_CASE(FDIVS)
33835 NODE_NAME_CASE(FDIVS_RND)
33836 NODE_NAME_CASE(FSQRT_RND)
33837 NODE_NAME_CASE(FSQRTS)
33838 NODE_NAME_CASE(FSQRTS_RND)
33839 NODE_NAME_CASE(FGETEXP)
33840 NODE_NAME_CASE(FGETEXP_SAE)
33841 NODE_NAME_CASE(FGETEXPS)
33842 NODE_NAME_CASE(FGETEXPS_SAE)
33843 NODE_NAME_CASE(SCALEF)
33844 NODE_NAME_CASE(SCALEF_RND)
33845 NODE_NAME_CASE(SCALEFS)
33846 NODE_NAME_CASE(SCALEFS_RND)
33847 NODE_NAME_CASE(MULHRS)
33848 NODE_NAME_CASE(SINT_TO_FP_RND)
33849 NODE_NAME_CASE(UINT_TO_FP_RND)
33850 NODE_NAME_CASE(CVTTP2SI)
33851 NODE_NAME_CASE(CVTTP2UI)
33852 NODE_NAME_CASE(STRICT_CVTTP2SI)
33853 NODE_NAME_CASE(STRICT_CVTTP2UI)
33854 NODE_NAME_CASE(MCVTTP2SI)
33855 NODE_NAME_CASE(MCVTTP2UI)
33856 NODE_NAME_CASE(CVTTP2SI_SAE)
33857 NODE_NAME_CASE(CVTTP2UI_SAE)
33858 NODE_NAME_CASE(CVTTS2SI)
33859 NODE_NAME_CASE(CVTTS2UI)
33860 NODE_NAME_CASE(CVTTS2SI_SAE)
33861 NODE_NAME_CASE(CVTTS2UI_SAE)
33862 NODE_NAME_CASE(CVTSI2P)
33863 NODE_NAME_CASE(CVTUI2P)
33864 NODE_NAME_CASE(STRICT_CVTSI2P)
33865 NODE_NAME_CASE(STRICT_CVTUI2P)
33866 NODE_NAME_CASE(MCVTSI2P)
33867 NODE_NAME_CASE(MCVTUI2P)
33868 NODE_NAME_CASE(VFPCLASS)
33869 NODE_NAME_CASE(VFPCLASSS)
33870 NODE_NAME_CASE(MULTISHIFT)
33871 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33872 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33873 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33874 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33875 NODE_NAME_CASE(CVTPS2PH)
33876 NODE_NAME_CASE(STRICT_CVTPS2PH)
33877 NODE_NAME_CASE(CVTPS2PH_SAE)
33878 NODE_NAME_CASE(MCVTPS2PH)
33879 NODE_NAME_CASE(MCVTPS2PH_SAE)
33880 NODE_NAME_CASE(CVTPH2PS)
33881 NODE_NAME_CASE(STRICT_CVTPH2PS)
33882 NODE_NAME_CASE(CVTPH2PS_SAE)
33883 NODE_NAME_CASE(CVTP2SI)
33884 NODE_NAME_CASE(CVTP2UI)
33885 NODE_NAME_CASE(MCVTP2SI)
33886 NODE_NAME_CASE(MCVTP2UI)
33887 NODE_NAME_CASE(CVTP2SI_RND)
33888 NODE_NAME_CASE(CVTP2UI_RND)
33889 NODE_NAME_CASE(CVTS2SI)
33890 NODE_NAME_CASE(CVTS2UI)
33891 NODE_NAME_CASE(CVTS2SI_RND)
33892 NODE_NAME_CASE(CVTS2UI_RND)
33893 NODE_NAME_CASE(CVTNE2PS2BF16)
33894 NODE_NAME_CASE(CVTNEPS2BF16)
33895 NODE_NAME_CASE(MCVTNEPS2BF16)
33896 NODE_NAME_CASE(DPBF16PS)
33897 NODE_NAME_CASE(LWPINS)
33898 NODE_NAME_CASE(MGATHER)
33899 NODE_NAME_CASE(MSCATTER)
33900 NODE_NAME_CASE(VPDPBUSD)
33901 NODE_NAME_CASE(VPDPBUSDS)
33902 NODE_NAME_CASE(VPDPWSSD)
33903 NODE_NAME_CASE(VPDPWSSDS)
33904 NODE_NAME_CASE(VPSHUFBITQMB)
33905 NODE_NAME_CASE(GF2P8MULB)
33906 NODE_NAME_CASE(GF2P8AFFINEQB)
33907 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33908 NODE_NAME_CASE(NT_CALL)
33909 NODE_NAME_CASE(NT_BRIND)
33910 NODE_NAME_CASE(UMWAIT)
33911 NODE_NAME_CASE(TPAUSE)
33912 NODE_NAME_CASE(ENQCMD)
33913 NODE_NAME_CASE(ENQCMDS)
33914 NODE_NAME_CASE(VP2INTERSECT)
33915 NODE_NAME_CASE(VPDPBSUD)
33916 NODE_NAME_CASE(VPDPBSUDS)
33917 NODE_NAME_CASE(VPDPBUUD)
33918 NODE_NAME_CASE(VPDPBUUDS)
33919 NODE_NAME_CASE(VPDPBSSD)
33920 NODE_NAME_CASE(VPDPBSSDS)
33921 NODE_NAME_CASE(AESENC128KL)
33922 NODE_NAME_CASE(AESDEC128KL)
33923 NODE_NAME_CASE(AESENC256KL)
33924 NODE_NAME_CASE(AESDEC256KL)
33925 NODE_NAME_CASE(AESENCWIDE128KL)
33926 NODE_NAME_CASE(AESDECWIDE128KL)
33927 NODE_NAME_CASE(AESENCWIDE256KL)
33928 NODE_NAME_CASE(AESDECWIDE256KL)
33929 NODE_NAME_CASE(CMPCCXADD)
33930 NODE_NAME_CASE(TESTUI)
33931 NODE_NAME_CASE(FP80_ADD)
33932 NODE_NAME_CASE(STRICT_FP80_ADD)
33933 }
33934 return nullptr;
33935#undef NODE_NAME_CASE
33936}
33937
33938/// Return true if the addressing mode represented by AM is legal for this
33939/// target, for a load/store of the specified type.
33940bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33941                                              const AddrMode &AM, Type *Ty,
33942 unsigned AS,
33943 Instruction *I) const {
33944 // X86 supports extremely general addressing modes.
33945  CodeModel::Model M = getTargetMachine().getCodeModel();
33946
33947 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33948 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33949 return false;
33950
33951 if (AM.BaseGV) {
33952 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33953
33954 // If a reference to this global requires an extra load, we can't fold it.
33955 if (isGlobalStubReference(GVFlags))
33956 return false;
33957
33958 // If BaseGV requires a register for the PIC base, we cannot also have a
33959 // BaseReg specified.
33960 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33961 return false;
33962
33963 // If lower 4G is not available, then we must use rip-relative addressing.
33964 if ((M != CodeModel::Small || isPositionIndependent()) &&
33965 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33966 return false;
33967 }
33968
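  // NOTE (annotation for this listing, not in the upstream source): the
  // scales below mirror the hardware addressing mode
  //   Base + Index * Scale + Disp32, with Scale in {1, 2, 4, 8}.
  // Scales 3, 5 and 9 are only expressible by reusing the index register as
  // the base, e.g. "lea (%rax,%rax,2), %rcx" computes 3 * %rax, which is why
  // they are rejected whenever an explicit base register is already present.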
33969 switch (AM.Scale) {
33970 case 0:
33971 case 1:
33972 case 2:
33973 case 4:
33974 case 8:
33975 // These scales always work.
33976 break;
33977 case 3:
33978 case 5:
33979 case 9:
33980 // These scales are formed with basereg+scalereg. Only accept if there is
33981 // no basereg yet.
33982 if (AM.HasBaseReg)
33983 return false;
33984 break;
33985 default: // Other stuff never works.
33986 return false;
33987 }
33988
33989 return true;
33990}
33991
33992bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33993  unsigned Bits = Ty->getScalarSizeInBits();
33994
33995 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33996 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33997 if (Subtarget.hasXOP() &&
33998 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33999 return false;
34000
34001 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
34002 // shifts just as cheap as scalar ones.
34003 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
34004 return false;
34005
34006 // AVX512BW has shifts such as vpsllvw.
34007 if (Subtarget.hasBWI() && Bits == 16)
34008 return false;
34009
34010 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
34011  // fully general vector amount.
34012 return true;
34013}
34014
34015bool X86TargetLowering::isBinOp(unsigned Opcode) const {
34016 switch (Opcode) {
34017 // These are non-commutative binops.
34018 // TODO: Add more X86ISD opcodes once we have test coverage.
34019 case X86ISD::ANDNP:
34020 case X86ISD::PCMPGT:
34021 case X86ISD::FMAX:
34022 case X86ISD::FMIN:
34023 case X86ISD::FANDN:
34024 case X86ISD::VPSHA:
34025 case X86ISD::VPSHL:
34026 case X86ISD::VSHLV:
34027 case X86ISD::VSRLV:
34028 case X86ISD::VSRAV:
34029 return true;
34030 }
34031
34032 return TargetLoweringBase::isBinOp(Opcode);
34033}
34034
34035bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
34036 switch (Opcode) {
34037 // TODO: Add more X86ISD opcodes once we have test coverage.
34038 case X86ISD::PCMPEQ:
34039 case X86ISD::PMULDQ:
34040 case X86ISD::PMULUDQ:
34041 case X86ISD::FMAXC:
34042 case X86ISD::FMINC:
34043 case X86ISD::FAND:
34044 case X86ISD::FOR:
34045 case X86ISD::FXOR:
34046 return true;
34047 }
34048
34049  return TargetLoweringBase::isCommutativeBinOp(Opcode);
34050}
34051
34052bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
34053  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34054 return false;
34055 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34056 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34057 return NumBits1 > NumBits2;
34058}
34059
34060bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34061  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34062 return false;
34063
34064 if (!isTypeLegal(EVT::getEVT(Ty1)))
34065 return false;
34066
34067 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34068
34069 // Assuming the caller doesn't have a zeroext or signext return parameter,
34070 // truncation all the way down to i1 is valid.
34071 return true;
34072}
34073
34074bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34075  return isInt<32>(Imm);
34076}
34077
34078bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34079  // Can also use sub to handle negated immediates.
34080 return isInt<32>(Imm);
34081}
34082
34083bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34084  return isInt<32>(Imm);
34085}
34086
34087bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34088  if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34089 return false;
34090 unsigned NumBits1 = VT1.getSizeInBits();
34091 unsigned NumBits2 = VT2.getSizeInBits();
34092 return NumBits1 > NumBits2;
34093}
34094
34095bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34096  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34097 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34098}
34099
34100bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34101  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34102 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34103}
34104
34105bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34106  EVT VT1 = Val.getValueType();
34107 if (isZExtFree(VT1, VT2))
34108 return true;
34109
34110 if (Val.getOpcode() != ISD::LOAD)
34111 return false;
34112
34113 if (!VT1.isSimple() || !VT1.isInteger() ||
34114 !VT2.isSimple() || !VT2.isInteger())
34115 return false;
34116
34117 switch (VT1.getSimpleVT().SimpleTy) {
34118 default: break;
34119 case MVT::i8:
34120 case MVT::i16:
34121 case MVT::i32:
34122 // X86 has 8, 16, and 32-bit zero-extending loads.
34123 return true;
34124 }
34125
34126 return false;
34127}
34128
34129bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34130                                           SmallVectorImpl<Use *> &Ops) const {
34131 using namespace llvm::PatternMatch;
34132
34133 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34134 if (!VTy)
34135 return false;
34136
34137 if (I->getOpcode() == Instruction::Mul &&
34138 VTy->getElementType()->isIntegerTy(64)) {
34139 for (auto &Op : I->operands()) {
34140 // Make sure we are not already sinking this operand
34141 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34142 continue;
34143
34144 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34145 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
34146 if (Subtarget.hasSSE41() &&
34147 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34148 m_SpecificInt(32)))) {
34149 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34150 Ops.push_back(&Op);
34151 } else if (Subtarget.hasSSE2() &&
34152 match(Op.get(),
34153 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34154 Ops.push_back(&Op);
34155 }
34156 }
34157
34158 return !Ops.empty();
34159 }
34160
34161 // A uniform shift amount in a vector shift or funnel shift may be much
34162 // cheaper than a generic variable vector shift, so make that pattern visible
34163 // to SDAG by sinking the shuffle instruction next to the shift.
34164 int ShiftAmountOpNum = -1;
34165 if (I->isShift())
34166 ShiftAmountOpNum = 1;
34167 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34168 if (II->getIntrinsicID() == Intrinsic::fshl ||
34169 II->getIntrinsicID() == Intrinsic::fshr)
34170 ShiftAmountOpNum = 2;
34171 }
34172
34173 if (ShiftAmountOpNum == -1)
34174 return false;
34175
34176 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34177 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34178 isVectorShiftByScalarCheap(I->getType())) {
34179 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34180 return true;
34181 }
34182
34183 return false;
34184}
34185
34186bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34187  if (!Subtarget.is64Bit())
34188 return false;
34189  return TargetLoweringBase::shouldConvertPhiType(From, To);
34190}
34191
34192bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34193  if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34194 return false;
34195
34196 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34197
34198 // There is no extending load for vXi1.
34199 if (SrcVT.getScalarType() == MVT::i1)
34200 return false;
34201
34202 return true;
34203}
34204
34205bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34206                                                   EVT VT) const {
34207 if (!Subtarget.hasAnyFMA())
34208 return false;
34209
34210 VT = VT.getScalarType();
34211
34212 if (!VT.isSimple())
34213 return false;
34214
34215 switch (VT.getSimpleVT().SimpleTy) {
34216 case MVT::f16:
34217 return Subtarget.hasFP16();
34218 case MVT::f32:
34219 case MVT::f64:
34220 return true;
34221 default:
34222 break;
34223 }
34224
34225 return false;
34226}
34227
34228bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34229  // i16 instructions are longer (0x66 prefix) and potentially slower.
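  // NOTE (annotation for this listing, not in the upstream source): e.g.
  // "add %bx, %ax" encodes as 66 01 D8 (operand-size prefix + opcode +
  // ModRM) while "add %ebx, %eax" is just 01 D8; 16-bit forms can also hit
  // length-changing-prefix stalls on some cores, so narrowing i32 -> i16 is
  // reported as unprofitable.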
34230 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34231}
34232
34234 EVT VT) const {
34235 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34236 // benefit. The transform may also be profitable for scalar code.
34237 if (!Subtarget.hasAVX512())
34238 return false;
34239 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34240 return false;
34241 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34242 return false;
34243
34244 return true;
34245}
34246
34247/// Targets can use this to indicate that they only support *some*
34248/// VECTOR_SHUFFLE operations, those with specific masks.
34249/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34250/// are assumed to be legal.
34251bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34252  if (!VT.isSimple())
34253 return false;
34254
34255 // Not for i1 vectors
34256 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34257 return false;
34258
34259 // Very little shuffling can be done for 64-bit vectors right now.
34260 if (VT.getSimpleVT().getSizeInBits() == 64)
34261 return false;
34262
34263 // We only care that the types being shuffled are legal. The lowering can
34264 // handle any possible shuffle mask that results.
34265 return isTypeLegal(VT.getSimpleVT());
34266}
34267
34268bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34269                                               EVT VT) const {
34270 // Don't convert an 'and' into a shuffle that we don't directly support.
34271 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34272 if (!Subtarget.hasAVX2())
34273 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34274 return false;
34275
34276 // Just delegate to the generic legality, clear masks aren't special.
34277 return isShuffleMaskLegal(Mask, VT);
34278}
34279
34280bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34281  // If the subtarget is using thunks, we need to not generate jump tables.
34282 if (Subtarget.useIndirectThunkBranches())
34283 return false;
34284
34285 // Otherwise, fallback on the generic logic.
34286  return TargetLowering::areJTsAllowed(Fn);
34287}
34288
34289MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34290                                                       EVT ConditionVT) const {
34291 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34292 // zero-extensions.
34293 if (ConditionVT.getSizeInBits() < 32)
34294 return MVT::i32;
34295  return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34296                                                              ConditionVT);
34297}
34298
34299//===----------------------------------------------------------------------===//
34300// X86 Scheduler Hooks
34301//===----------------------------------------------------------------------===//
34302
34303// Returns true if EFLAG is consumed after this iterator in the rest of the
34304// basic block or any successors of the basic block.
34305static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34306                              MachineBasicBlock *BB) {
34307 // Scan forward through BB for a use/def of EFLAGS.
34308 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34309 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34310 return true;
34311 // If we found a def, we can stop searching.
34312 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34313 return false;
34314 }
34315
34316 // If we hit the end of the block, check whether EFLAGS is live into a
34317 // successor.
34318 for (MachineBasicBlock *Succ : BB->successors())
34319 if (Succ->isLiveIn(X86::EFLAGS))
34320 return true;
34321
34322 return false;
34323}
34324
34325/// Utility function to emit xbegin specifying the start of an RTM region.
34326static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34327                                     const TargetInstrInfo *TII) {
34328 const MIMetadata MIMD(MI);
34329
34330 const BasicBlock *BB = MBB->getBasicBlock();
34331  MachineFunction::iterator I = ++MBB->getIterator();
34332
34333 // For the v = xbegin(), we generate
34334 //
34335 // thisMBB:
34336 // xbegin sinkMBB
34337 //
34338 // mainMBB:
34339 // s0 = -1
34340 //
34341 // fallBB:
34342 // eax = # XABORT_DEF
34343 // s1 = eax
34344 //
34345 // sinkMBB:
34346 // v = phi(s0/mainBB, s1/fallBB)
34347
34348 MachineBasicBlock *thisMBB = MBB;
34349 MachineFunction *MF = MBB->getParent();
34350 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34351 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34352 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34353 MF->insert(I, mainMBB);
34354 MF->insert(I, fallMBB);
34355 MF->insert(I, sinkMBB);
34356
34357 if (isEFLAGSLiveAfter(MI, MBB)) {
34358 mainMBB->addLiveIn(X86::EFLAGS);
34359 fallMBB->addLiveIn(X86::EFLAGS);
34360 sinkMBB->addLiveIn(X86::EFLAGS);
34361 }
34362
34363 // Transfer the remainder of BB and its successor edges to sinkMBB.
34364 sinkMBB->splice(sinkMBB->begin(), MBB,
34365 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34366  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34367
34368  MachineRegisterInfo &MRI = MF->getRegInfo();
34369  Register DstReg = MI.getOperand(0).getReg();
34370 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34371 Register mainDstReg = MRI.createVirtualRegister(RC);
34372 Register fallDstReg = MRI.createVirtualRegister(RC);
34373
34374 // thisMBB:
34375 // xbegin fallMBB
34376 // # fallthrough to mainMBB
34377  // # on abort, branch to fallMBB
34378 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34379 thisMBB->addSuccessor(mainMBB);
34380 thisMBB->addSuccessor(fallMBB);
34381
34382 // mainMBB:
34383 // mainDstReg := -1
34384 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34385 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34386 mainMBB->addSuccessor(sinkMBB);
34387
34388 // fallMBB:
34389 // ; pseudo instruction to model hardware's definition from XABORT
34390 // EAX := XABORT_DEF
34391 // fallDstReg := EAX
34392 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34393 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34394 .addReg(X86::EAX);
34395 fallMBB->addSuccessor(sinkMBB);
34396
34397 // sinkMBB:
34398 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34399 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34400 .addReg(mainDstReg).addMBB(mainMBB)
34401 .addReg(fallDstReg).addMBB(fallMBB);
34402
34403 MI.eraseFromParent();
34404 return sinkMBB;
34405}
34406
34407MachineBasicBlock *
34408X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34409 MachineBasicBlock *MBB) const {
34410 // Emit va_arg instruction on X86-64.
34411
34412 // Operands to this pseudo-instruction:
34413 // 0 ) Output : destination address (reg)
34414 // 1-5) Input : va_list address (addr, i64mem)
34415 // 6 ) ArgSize : Size (in bytes) of vararg type
34416 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34417 // 8 ) Align : Alignment of type
34418 // 9 ) EFLAGS (implicit-def)
34419
34420 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34421 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34422
34423 Register DestReg = MI.getOperand(0).getReg();
34424 MachineOperand &Base = MI.getOperand(1);
34425 MachineOperand &Scale = MI.getOperand(2);
34426 MachineOperand &Index = MI.getOperand(3);
34427 MachineOperand &Disp = MI.getOperand(4);
34428 MachineOperand &Segment = MI.getOperand(5);
34429 unsigned ArgSize = MI.getOperand(6).getImm();
34430 unsigned ArgMode = MI.getOperand(7).getImm();
34431 Align Alignment = Align(MI.getOperand(8).getImm());
34432
34433 MachineFunction *MF = MBB->getParent();
34434
34435 // Memory Reference
34436 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34437
34438 MachineMemOperand *OldMMO = MI.memoperands().front();
34439
34440 // Clone the MMO into two separate MMOs for loading and storing
34441 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34442 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34443 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34444 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34445
34446 // Machine Information
34447 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34448  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
34449  const TargetRegisterClass *AddrRegClass =
34450      getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
34451  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34452 const MIMetadata MIMD(MI);
34453
34454 // struct va_list {
34455 // i32 gp_offset
34456 // i32 fp_offset
34457 // i64 overflow_area (address)
34458 // i64 reg_save_area (address)
34459 // }
34460 // sizeof(va_list) = 24
34461 // alignment(va_list) = 8
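  // NOTE (annotation for this listing, not in the upstream source): the field
  // offsets implied by the layout above are what the displacements below
  // address off the va_list pointer:
  //   +0 gp_offset, +4 fp_offset, +8 overflow_area,
  //   +16 reg_save_area on LP64 (+12 on the 32-bit-pointer X32 ABI),
  // hence addDisp(Disp, UseFPOffset ? 4 : 0), addDisp(Disp, 8) and
  // addDisp(Disp, isTarget64BitLP64() ? 16 : 12) in the code that follows.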
34462
34463 unsigned TotalNumIntRegs = 6;
34464 unsigned TotalNumXMMRegs = 8;
34465 bool UseGPOffset = (ArgMode == 1);
34466 bool UseFPOffset = (ArgMode == 2);
34467 unsigned MaxOffset = TotalNumIntRegs * 8 +
34468 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
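  // NOTE (annotation for this listing, not in the upstream source): worked
  // numbers for the bound check further down. The register save area holds
  // 6 GP registers * 8 bytes = 48 bytes followed by 8 XMM registers * 16
  // bytes = 128 bytes, so MaxOffset is 48 when using gp_offset and 176 when
  // using fp_offset. The CMP against (MaxOffset + 8 - ArgSizeA8) then sends
  // us to the overflow path exactly when the current offset no longer leaves
  // a full slot of the required kind in the save area.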
34469
34470 /* Align ArgSize to a multiple of 8 */
34471 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
34472 bool NeedsAlign = (Alignment > 8);
34473
34474 MachineBasicBlock *thisMBB = MBB;
34475 MachineBasicBlock *overflowMBB;
34476 MachineBasicBlock *offsetMBB;
34477 MachineBasicBlock *endMBB;
34478
34479 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34480 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34481 unsigned OffsetReg = 0;
34482
34483 if (!UseGPOffset && !UseFPOffset) {
34484 // If we only pull from the overflow region, we don't create a branch.
34485 // We don't need to alter control flow.
34486 OffsetDestReg = 0; // unused
34487 OverflowDestReg = DestReg;
34488
34489 offsetMBB = nullptr;
34490 overflowMBB = thisMBB;
34491 endMBB = thisMBB;
34492 } else {
34493 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34494 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34495 // If not, pull from overflow_area. (branch to overflowMBB)
34496 //
34497 // thisMBB
34498 // | .
34499 // | .
34500 // offsetMBB overflowMBB
34501 // | .
34502 // | .
34503 // endMBB
34504
34505 // Registers for the PHI in endMBB
34506 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34507 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34508
34509 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34510 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34511 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34512 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34513
34514    MachineFunction::iterator MBBIter = ++MBB->getIterator();
34515
34516 // Insert the new basic blocks
34517 MF->insert(MBBIter, offsetMBB);
34518 MF->insert(MBBIter, overflowMBB);
34519 MF->insert(MBBIter, endMBB);
34520
34521 // Transfer the remainder of MBB and its successor edges to endMBB.
34522 endMBB->splice(endMBB->begin(), thisMBB,
34523 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34524 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34525
34526 // Make offsetMBB and overflowMBB successors of thisMBB
34527 thisMBB->addSuccessor(offsetMBB);
34528 thisMBB->addSuccessor(overflowMBB);
34529
34530 // endMBB is a successor of both offsetMBB and overflowMBB
34531 offsetMBB->addSuccessor(endMBB);
34532 overflowMBB->addSuccessor(endMBB);
34533
34534 // Load the offset value into a register
34535 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34536 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34537 .add(Base)
34538 .add(Scale)
34539 .add(Index)
34540 .addDisp(Disp, UseFPOffset ? 4 : 0)
34541 .add(Segment)
34542 .setMemRefs(LoadOnlyMMO);
34543
34544 // Check if there is enough room left to pull this argument.
34545 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34546 .addReg(OffsetReg)
34547 .addImm(MaxOffset + 8 - ArgSizeA8);
34548
34549 // Branch to "overflowMBB" if offset >= max
34550 // Fall through to "offsetMBB" otherwise
34551 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34552 .addMBB(overflowMBB).addImm(X86::COND_AE);
34553 }
34554
34555 // In offsetMBB, emit code to use the reg_save_area.
34556 if (offsetMBB) {
34557 assert(OffsetReg != 0);
34558
34559 // Read the reg_save_area address.
34560 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34561 BuildMI(
34562 offsetMBB, MIMD,
34563 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34564 RegSaveReg)
34565 .add(Base)
34566 .add(Scale)
34567 .add(Index)
34568 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34569 .add(Segment)
34570 .setMemRefs(LoadOnlyMMO);
34571
34572 if (Subtarget.isTarget64BitLP64()) {
34573 // Zero-extend the offset
34574 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34575 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34576 .addImm(0)
34577 .addReg(OffsetReg)
34578 .addImm(X86::sub_32bit);
34579
34580 // Add the offset to the reg_save_area to get the final address.
34581 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34582 .addReg(OffsetReg64)
34583 .addReg(RegSaveReg);
34584 } else {
34585 // Add the offset to the reg_save_area to get the final address.
34586 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34587 .addReg(OffsetReg)
34588 .addReg(RegSaveReg);
34589 }
34590
34591 // Compute the offset for the next argument
34592 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34593 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34594 .addReg(OffsetReg)
34595 .addImm(UseFPOffset ? 16 : 8);
34596
34597 // Store it back into the va_list.
34598 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34599 .add(Base)
34600 .add(Scale)
34601 .add(Index)
34602 .addDisp(Disp, UseFPOffset ? 4 : 0)
34603 .add(Segment)
34604 .addReg(NextOffsetReg)
34605 .setMemRefs(StoreOnlyMMO);
34606
34607 // Jump to endMBB
34608 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34609 .addMBB(endMBB);
34610 }
34611
34612 //
34613 // Emit code to use overflow area
34614 //
34615
34616 // Load the overflow_area address into a register.
34617 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34618 BuildMI(overflowMBB, MIMD,
34619 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34620 OverflowAddrReg)
34621 .add(Base)
34622 .add(Scale)
34623 .add(Index)
34624 .addDisp(Disp, 8)
34625 .add(Segment)
34626 .setMemRefs(LoadOnlyMMO);
34627
34628 // If we need to align it, do so. Otherwise, just copy the address
34629 // to OverflowDestReg.
34630 if (NeedsAlign) {
34631 // Align the overflow address
34632 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34633
34634 // aligned_addr = (addr + (align-1)) & ~(align-1)
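    // NOTE (annotation for this listing, not in the upstream source): worked
    // example of the rounding below for addr = 0x1003, align = 16:
    //   (0x1003 + 15) & ~15 == 0x1012 & ~0xF == 0x1010,
    // i.e. the next 16-byte boundary at or above addr.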
34635 BuildMI(
34636 overflowMBB, MIMD,
34637 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34638 TmpReg)
34639 .addReg(OverflowAddrReg)
34640 .addImm(Alignment.value() - 1);
34641
34642 BuildMI(
34643 overflowMBB, MIMD,
34644 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34645 OverflowDestReg)
34646 .addReg(TmpReg)
34647 .addImm(~(uint64_t)(Alignment.value() - 1));
34648 } else {
34649 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34650 .addReg(OverflowAddrReg);
34651 }
34652
34653 // Compute the next overflow address after this argument.
34654 // (the overflow address should be kept 8-byte aligned)
34655 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34656 BuildMI(
34657 overflowMBB, MIMD,
34658 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34659 NextAddrReg)
34660 .addReg(OverflowDestReg)
34661 .addImm(ArgSizeA8);
34662
34663 // Store the new overflow address.
34664 BuildMI(overflowMBB, MIMD,
34665 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34666 .add(Base)
34667 .add(Scale)
34668 .add(Index)
34669 .addDisp(Disp, 8)
34670 .add(Segment)
34671 .addReg(NextAddrReg)
34672 .setMemRefs(StoreOnlyMMO);
34673
34674 // If we branched, emit the PHI to the front of endMBB.
34675 if (offsetMBB) {
34676 BuildMI(*endMBB, endMBB->begin(), MIMD,
34677 TII->get(X86::PHI), DestReg)
34678 .addReg(OffsetDestReg).addMBB(offsetMBB)
34679 .addReg(OverflowDestReg).addMBB(overflowMBB);
34680 }
34681
34682 // Erase the pseudo instruction
34683 MI.eraseFromParent();
34684
34685 return endMBB;
34686}
34687
34688// The EFLAGS operand of SelectItr might be missing a kill marker
34689// because there were multiple uses of EFLAGS, and ISel didn't know
34690// which to mark. Figure out whether SelectItr should have had a
34691// kill marker, and set it if it should. Returns the correct kill
34692// marker value.
34693static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34694                                     MachineBasicBlock* BB,
34695                                     const TargetRegisterInfo* TRI) {
34696 if (isEFLAGSLiveAfter(SelectItr, BB))
34697 return false;
34698
34699 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34700 // out. SelectMI should have a kill flag on EFLAGS.
34701 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34702 return true;
34703}
34704
34705// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34706// together with other CMOV pseudo-opcodes into a single basic-block with
34707// conditional jump around it.
34708static bool isCMOVPseudo(MachineInstr &MI) {
34709  switch (MI.getOpcode()) {
34710 case X86::CMOV_FR16:
34711 case X86::CMOV_FR16X:
34712 case X86::CMOV_FR32:
34713 case X86::CMOV_FR32X:
34714 case X86::CMOV_FR64:
34715 case X86::CMOV_FR64X:
34716 case X86::CMOV_GR8:
34717 case X86::CMOV_GR16:
34718 case X86::CMOV_GR32:
34719 case X86::CMOV_RFP32:
34720 case X86::CMOV_RFP64:
34721 case X86::CMOV_RFP80:
34722 case X86::CMOV_VR64:
34723 case X86::CMOV_VR128:
34724 case X86::CMOV_VR128X:
34725 case X86::CMOV_VR256:
34726 case X86::CMOV_VR256X:
34727 case X86::CMOV_VR512:
34728 case X86::CMOV_VK1:
34729 case X86::CMOV_VK2:
34730 case X86::CMOV_VK4:
34731 case X86::CMOV_VK8:
34732 case X86::CMOV_VK16:
34733 case X86::CMOV_VK32:
34734 case X86::CMOV_VK64:
34735 return true;
34736
34737 default:
34738 return false;
34739 }
34740}
34741
34742// Helper function, which inserts PHI functions into SinkMBB:
34743// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34744// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
34745// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
34746// the last PHI function inserted.
34747static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34748    MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34749    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34750 MachineBasicBlock *SinkMBB) {
34751 MachineFunction *MF = TrueMBB->getParent();
34752  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34753  const MIMetadata MIMD(*MIItBegin);
34754
34755 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34756  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34757
34758 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34759
34760 // As we are creating the PHIs, we have to be careful if there is more than
34761 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34762 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34763 // That also means that PHI construction must work forward from earlier to
34764  // later, and that the code must maintain a mapping from each earlier PHI's
34765  // destination register to the registers that went into that PHI.
34766  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34767  MachineInstrBuilder MIB;
34768
34769 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34770 Register DestReg = MIIt->getOperand(0).getReg();
34771 Register Op1Reg = MIIt->getOperand(1).getReg();
34772 Register Op2Reg = MIIt->getOperand(2).getReg();
34773
34774 // If this CMOV we are generating is the opposite condition from
34775 // the jump we generated, then we have to swap the operands for the
34776 // PHI that is going to be generated.
34777 if (MIIt->getOperand(3).getImm() == OppCC)
34778 std::swap(Op1Reg, Op2Reg);
34779
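// If an operand was itself produced by an earlier CMOV in this run, it is now
// defined by a PHI in SinkMBB, so substitute the value that PHI receives on the
// corresponding edge: .first for the FalseMBB input, .second for the TrueMBB
// input.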
34780 if (RegRewriteTable.contains(Op1Reg))
34781 Op1Reg = RegRewriteTable[Op1Reg].first;
34782
34783 if (RegRewriteTable.contains(Op2Reg))
34784 Op2Reg = RegRewriteTable[Op2Reg].second;
34785
34786 MIB =
34787 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34788 .addReg(Op1Reg)
34789 .addMBB(FalseMBB)
34790 .addReg(Op2Reg)
34791 .addMBB(TrueMBB);
34792
34793 // Add this PHI to the rewrite table.
34794 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34795 }
34796
34797 return MIB;
34798}
34799
34800// Lower cascaded selects in the form of (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
34801MachineBasicBlock *
34802X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34803 MachineInstr &SecondCascadedCMOV,
34804 MachineBasicBlock *ThisMBB) const {
34805 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34806 const MIMetadata MIMD(FirstCMOV);
34807
34808 // We lower cascaded CMOVs such as
34809 //
34810 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34811 //
34812 // to two successive branches.
34813 //
34814 // Without this, we would add a PHI between the two jumps, which ends up
34815 // creating a few copies all around. For instance, for
34816 //
34817 // (sitofp (zext (fcmp une)))
34818 //
34819 // we would generate:
34820 //
34821 // ucomiss %xmm1, %xmm0
34822 // movss <1.0f>, %xmm0
34823 // movaps %xmm0, %xmm1
34824 // jne .LBB5_2
34825 // xorps %xmm1, %xmm1
34826 // .LBB5_2:
34827 // jp .LBB5_4
34828 // movaps %xmm1, %xmm0
34829 // .LBB5_4:
34830 // retq
34831 //
34832 // because this custom-inserter would have generated:
34833 //
34834 // A
34835 // | \
34836 // | B
34837 // | /
34838 // C
34839 // | \
34840 // | D
34841 // | /
34842 // E
34843 //
34844 // A: X = ...; Y = ...
34845 // B: empty
34846 // C: Z = PHI [X, A], [Y, B]
34847 // D: empty
34848 // E: PHI [X, C], [Z, D]
34849 //
34850 // If we lower both CMOVs in a single step, we can instead generate:
34851 //
34852 // A
34853 // | \
34854 // | C
34855 // | /|
34856 // |/ |
34857 // | |
34858 // | D
34859 // | /
34860 // E
34861 //
34862 // A: X = ...; Y = ...
34863 // D: empty
34864 // E: PHI [X, A], [X, C], [Y, D]
34865 //
34866 // Which, in our sitofp/fcmp example, gives us something like:
34867 //
34868 // ucomiss %xmm1, %xmm0
34869 // movss <1.0f>, %xmm0
34870 // jne .LBB5_4
34871 // jp .LBB5_4
34872 // xorps %xmm0, %xmm0
34873 // .LBB5_4:
34874 // retq
34875 //
34876
34877 // We lower cascaded CMOV into two successive branches to the same block.
34878 // EFLAGS is used by both, so mark it as live in the second.
34879 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34880 MachineFunction *F = ThisMBB->getParent();
34881 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34882 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34883 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34884
34885 MachineFunction::iterator It = ++ThisMBB->getIterator();
34886 F->insert(It, FirstInsertedMBB);
34887 F->insert(It, SecondInsertedMBB);
34888 F->insert(It, SinkMBB);
34889
34890 // For a cascaded CMOV, we lower it to two successive branches to
34891 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34892 // the FirstInsertedMBB.
34893 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34894
34895 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34896 // live into the sink and copy blocks.
34897 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34898 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34899 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34900 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34901 SinkMBB->addLiveIn(X86::EFLAGS);
34902 }
34903
34904 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34905 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34906 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34907 ThisMBB->end());
34908 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34909
34910 // Fallthrough block for ThisMBB.
34911 ThisMBB->addSuccessor(FirstInsertedMBB);
34912 // The true block target of the first branch is always SinkMBB.
34913 ThisMBB->addSuccessor(SinkMBB);
34914 // Fallthrough block for FirstInsertedMBB.
34915 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34916 // The true block for the branch of FirstInsertedMBB.
34917 FirstInsertedMBB->addSuccessor(SinkMBB);
34918 // This is fallthrough.
34919 SecondInsertedMBB->addSuccessor(SinkMBB);
34920
34921 // Create the conditional branch instructions.
34922 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34923 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34924
34925 X86::CondCode SecondCC =
34926 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34927 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34928 .addMBB(SinkMBB)
34929 .addImm(SecondCC);
34930
34931 // SinkMBB:
34932 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34933 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34934 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34935 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34936 MachineInstrBuilder MIB =
34937 BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34938 .addReg(Op1Reg)
34939 .addMBB(SecondInsertedMBB)
34940 .addReg(Op2Reg)
34941 .addMBB(ThisMBB);
34942
34943 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
34944 // (the True operand of the SELECT_CC/CMOV nodes).
34945 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34946
34947 // Now remove the CMOVs.
34948 FirstCMOV.eraseFromParent();
34949 SecondCascadedCMOV.eraseFromParent();
34950
34951 return SinkMBB;
34952}
34953
34954MachineBasicBlock *
34955X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34956 MachineBasicBlock *ThisMBB) const {
34957 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34958 const MIMetadata MIMD(MI);
34959
34960 // To "insert" a SELECT_CC instruction, we actually have to insert the
34961 // diamond control-flow pattern. The incoming instruction knows the
34962 // destination vreg to set, the condition code register to branch on, the
34963 // true/false values to select between and a branch opcode to use.
34964
34965 // ThisMBB:
34966 // ...
34967 // TrueVal = ...
34968 // cmpTY ccX, r1, r2
34969 // bCC copy1MBB
34970 // fallthrough --> FalseMBB
34971
34972 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34973 // as described above, by inserting a BB, and then making a PHI at the join
34974 // point to select the true and false operands of the CMOV in the PHI.
34975 //
34976 // The code also handles two different cases of multiple CMOV opcodes
34977 // in a row.
34978 //
34979 // Case 1:
34980 // In this case, there are multiple CMOVs in a row, all of which are based on
34981 // the same condition setting (or the exact opposite condition setting).
34982 // In this case we can lower all the CMOVs using a single inserted BB, and
34983 // then make a number of PHIs at the join point to model the CMOVs. The only
34984 // trickiness here, is that in a case like:
34985 //
34986 // t2 = CMOV cond1 t1, f1
34987 // t3 = CMOV cond1 t2, f2
34988 //
34989 // when rewriting this into PHIs, we have to perform some renaming on the
34990 // temps since you cannot have a PHI operand refer to a PHI result earlier
34991 // in the same block. The "simple" but wrong lowering would be:
34992 //
34993 // t2 = PHI t1(BB1), f1(BB2)
34994 // t3 = PHI t2(BB1), f2(BB2)
34995 //
34996 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34997 // renaming is to note that on the path through BB1, t2 is really just a
34998 // copy of t1, and do that renaming, properly generating:
34999 //
35000 // t2 = PHI t1(BB1), f1(BB2)
35001 // t3 = PHI t1(BB1), f2(BB2)
35002 //
35003 // Case 2:
35004 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35005 // function - EmitLoweredCascadedSelect.
35006
35007 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
35008 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35009 MachineInstr *LastCMOV = &MI;
35010 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
35011
35012 // Check for case 1, where there are multiple CMOVs with the same condition
35013 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
35014 // number of jumps the most.
35015
35016 if (isCMOVPseudo(MI)) {
35017 // See if we have a string of CMOVS with the same condition. Skip over
35018 // intervening debug insts.
35019 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35020 (NextMIIt->getOperand(3).getImm() == CC ||
35021 NextMIIt->getOperand(3).getImm() == OppCC)) {
35022 LastCMOV = &*NextMIIt;
35023 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35024 }
35025 }
35026
35027 // This checks for case 2, but we only do so if we didn't already find
35028 // case 1, as indicated by LastCMOV == &MI.
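// The conditions below require that the second CMOV has the same opcode and
// the same operand-2 value as the first, and that it consumes the first CMOV's
// result through a killed use (i.e. this is that result's only use).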
35029 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35030 NextMIIt->getOpcode() == MI.getOpcode() &&
35031 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35032 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35033 NextMIIt->getOperand(1).isKill()) {
35034 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
35035 }
35036
35037 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35038 MachineFunction *F = ThisMBB->getParent();
35039 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35040 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35041
35042 MachineFunction::iterator It = ++ThisMBB->getIterator();
35043 F->insert(It, FalseMBB);
35044 F->insert(It, SinkMBB);
35045
35046 // Set the call frame size on entry to the new basic blocks.
35047 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
35048 FalseMBB->setCallFrameSize(CallFrameSize);
35049 SinkMBB->setCallFrameSize(CallFrameSize);
35050
35051 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35052 // live into the sink and copy blocks.
35053 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35054 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
35055 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
35056 FalseMBB->addLiveIn(X86::EFLAGS);
35057 SinkMBB->addLiveIn(X86::EFLAGS);
35058 }
35059
35060 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35061 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35062 MachineBasicBlock::iterator(LastCMOV));
35063 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35064 if (MI.isDebugInstr())
35065 SinkMBB->push_back(MI.removeFromParent());
35066
35067 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35068 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35069 std::next(MachineBasicBlock::iterator(LastCMOV)),
35070 ThisMBB->end());
35071 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35072
35073 // Fallthrough block for ThisMBB.
35074 ThisMBB->addSuccessor(FalseMBB);
35075 // The true block target of the first (or only) branch is always a SinkMBB.
35076 ThisMBB->addSuccessor(SinkMBB);
35077 // Fallthrough block for FalseMBB.
35078 FalseMBB->addSuccessor(SinkMBB);
35079
35080 // Create the conditional branch instruction.
35081 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35082
35083 // SinkMBB:
35084 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35085 // ...
35086 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35087 MachineBasicBlock::iterator MIItEnd =
35088 std::next(MachineBasicBlock::iterator(LastCMOV));
35089 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35090
35091 // Now remove the CMOV(s).
35092 ThisMBB->erase(MIItBegin, MIItEnd);
35093
35094 return SinkMBB;
35095}
35096
35097static unsigned getSUBriOpcode(bool IsLP64) {
35098 if (IsLP64)
35099 return X86::SUB64ri32;
35100 else
35101 return X86::SUB32ri;
35102}
35103
35104MachineBasicBlock *
35105X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35106 MachineBasicBlock *MBB) const {
35107 MachineFunction *MF = MBB->getParent();
35108 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35109 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35110 const MIMetadata MIMD(MI);
35111 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35112
35113 const unsigned ProbeSize = getStackProbeSize(*MF);
35114
35115 MachineRegisterInfo &MRI = MF->getRegInfo();
35116 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35117 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35118 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35119
35120 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35121 MF->insert(MBBIter, testMBB);
35122 MF->insert(MBBIter, blockMBB);
35123 MF->insert(MBBIter, tailMBB);
35124
35125 Register sizeVReg = MI.getOperand(1).getReg();
35126
35127 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35128
35129 Register TmpStackPtr = MRI.createVirtualRegister(
35130 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35131 Register FinalStackPtr = MRI.createVirtualRegister(
35132 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35133
35134 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35135 .addReg(physSPReg);
35136 {
35137 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35138 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35139 .addReg(TmpStackPtr)
35140 .addReg(sizeVReg);
35141 }
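// FinalStackPtr now holds the value the stack pointer will have once the whole
// allocation is done; the loop below moves physSPReg down toward it one
// ProbeSize chunk at a time, touching each page on the way.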
35142
35143 // test rsp size
35144
35145 BuildMI(testMBB, MIMD,
35146 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35147 .addReg(FinalStackPtr)
35148 .addReg(physSPReg);
35149
35150 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35151 .addMBB(tailMBB)
35152 .addImm(X86::COND_GE);
35153 testMBB->addSuccessor(blockMBB);
35154 testMBB->addSuccessor(tailMBB);
35155
35156 // Touch the block, then extend it. This is the opposite order from the
35157 // static probe, where we allocate and then touch; it avoids having to probe
35158 // the tail of the static alloca. Possible scenarios are:
35159 //
35160 // + ---- <- ------------ <- ------------- <- ------------ +
35161 // | |
35162 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35163 // | |
35164 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35165 //
35166 // The property we want to enforce is to never have more than one [page alloc] between two probes.
35167
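// Probe the current page with an idempotent read-modify-write: XOR-ing 0 into
// the word at the stack pointer leaves memory unchanged but touches the page
// before the pointer moves further down.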
35168 const unsigned XORMIOpc =
35169 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35170 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35171 .addImm(0);
35172
35173 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35174 physSPReg)
35175 .addReg(physSPReg)
35176 .addImm(ProbeSize);
35177
35178 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35179 blockMBB->addSuccessor(testMBB);
35180
35181 // Replace original instruction by the expected stack ptr
35182 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35183 MI.getOperand(0).getReg())
35184 .addReg(FinalStackPtr);
35185
35186 tailMBB->splice(tailMBB->end(), MBB,
35187 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35188 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35189 MBB->addSuccessor(testMBB);
35190
35191 // Delete the original pseudo instruction.
35192 MI.eraseFromParent();
35193
35194 // And we're done.
35195 return tailMBB;
35196}
35197
35198MachineBasicBlock *
35199X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35200 MachineBasicBlock *BB) const {
35201 MachineFunction *MF = BB->getParent();
35202 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35203 const MIMetadata MIMD(MI);
35204 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35205
35206 assert(MF->shouldSplitStack());
35207
35208 const bool Is64Bit = Subtarget.is64Bit();
35209 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35210
35211 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35212 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
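// The split-stack runtime keeps the current stacklet's limit at a fixed offset
// in the TLS block (0x70 for LP64, 0x40 for x32, 0x30 for 32-bit); the compare
// below checks the requested new stack pointer against that limit.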
35213
35214 // BB:
35215 // ... [Till the alloca]
35216 // If stacklet is not large enough, jump to mallocMBB
35217 //
35218 // bumpMBB:
35219 // Allocate by subtracting from RSP
35220 // Jump to continueMBB
35221 //
35222 // mallocMBB:
35223 // Allocate by call to runtime
35224 //
35225 // continueMBB:
35226 // ...
35227 // [rest of original BB]
35228 //
35229
35230 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35231 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35232 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35233
35234 MachineRegisterInfo &MRI = MF->getRegInfo();
35235 const TargetRegisterClass *AddrRegClass =
35236 getRegClassFor(getPointerTy(MF->getDataLayout()));
35237
35238 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35239 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35240 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35241 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35242 sizeVReg = MI.getOperand(1).getReg(),
35243 physSPReg =
35244 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35245
35246 MachineFunction::iterator MBBIter = ++BB->getIterator();
35247
35248 MF->insert(MBBIter, bumpMBB);
35249 MF->insert(MBBIter, mallocMBB);
35250 MF->insert(MBBIter, continueMBB);
35251
35252 continueMBB->splice(continueMBB->begin(), BB,
35253 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35254 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35255
35256 // Add code to the main basic block to check if the stack limit has been hit,
35257 // and if so, jump to mallocMBB otherwise to bumpMBB.
35258 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35259 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35260 .addReg(tmpSPVReg).addReg(sizeVReg);
35261 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35262 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35263 .addReg(SPLimitVReg);
35264 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35265
35266 // bumpMBB simply decreases the stack pointer, since we know the current
35267 // stacklet has enough space.
35268 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35269 .addReg(SPLimitVReg);
35270 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35271 .addReg(SPLimitVReg);
35272 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35273
35274 // Calls into a routine in libgcc to allocate more space from the heap.
35275 const uint32_t *RegMask =
35276 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35277 if (IsLP64) {
35278 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35279 .addReg(sizeVReg);
35280 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35281 .addExternalSymbol("__morestack_allocate_stack_space")
35282 .addRegMask(RegMask)
35283 .addReg(X86::RDI, RegState::Implicit)
35284 .addReg(X86::RAX, RegState::ImplicitDefine);
35285 } else if (Is64Bit) {
35286 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35287 .addReg(sizeVReg);
35288 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35289 .addExternalSymbol("__morestack_allocate_stack_space")
35290 .addRegMask(RegMask)
35291 .addReg(X86::EDI, RegState::Implicit)
35292 .addReg(X86::EAX, RegState::ImplicitDefine);
35293 } else {
35294 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35295 .addImm(12);
35296 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35297 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35298 .addExternalSymbol("__morestack_allocate_stack_space")
35299 .addRegMask(RegMask)
35300 .addReg(X86::EAX, RegState::ImplicitDefine);
35301 }
35302
35303 if (!Is64Bit)
35304 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35305 .addImm(16);
35306
35307 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35308 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35309 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35310
35311 // Set up the CFG correctly.
35312 BB->addSuccessor(bumpMBB);
35313 BB->addSuccessor(mallocMBB);
35314 mallocMBB->addSuccessor(continueMBB);
35315 bumpMBB->addSuccessor(continueMBB);
35316
35317 // Take care of the PHI nodes.
35318 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35319 MI.getOperand(0).getReg())
35320 .addReg(mallocPtrVReg)
35321 .addMBB(mallocMBB)
35322 .addReg(bumpSPPtrVReg)
35323 .addMBB(bumpMBB);
35324
35325 // Delete the original pseudo instruction.
35326 MI.eraseFromParent();
35327
35328 // And we're done.
35329 return continueMBB;
35330}
35331
35332MachineBasicBlock *
35333X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35334 MachineBasicBlock *BB) const {
35335 MachineFunction *MF = BB->getParent();
35336 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35337 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35338 const MIMetadata MIMD(MI);
35339
35342 "SEH does not use catchret!");
35343
35344 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35345 if (!Subtarget.is32Bit())
35346 return BB;
35347
35348 // C++ EH creates a new target block to hold the restore code, and wires up
35349 // the new block to the return destination with a normal JMP_4.
35350 MachineBasicBlock *RestoreMBB =
35351 MF->CreateMachineBasicBlock(BB->getBasicBlock());
35352 assert(BB->succ_size() == 1);
35353 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35354 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35355 BB->addSuccessor(RestoreMBB);
35356 MI.getOperand(0).setMBB(RestoreMBB);
35357
35358 // Marking this as an EH pad but not a funclet entry block causes PEI to
35359 // restore stack pointers in the block.
35360 RestoreMBB->setIsEHPad(true);
35361
35362 auto RestoreMBBI = RestoreMBB->begin();
35363 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35364 return BB;
35365}
35366
35367MachineBasicBlock *
35368X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35369 MachineBasicBlock *BB) const {
35370 // So, here we replace TLSADDR with the sequence:
35371 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35372 // We need this because TLSADDR is lowered into a call inside MC;
35373 // without the two markers, shrink-wrapping may push the
35374 // prologue/epilogue past them.
35375 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35376 const MIMetadata MIMD(MI);
35377 MachineFunction &MF = *BB->getParent();
35378
35379 // Emit CALLSEQ_START right before the instruction.
35380 BB->getParent()->getFrameInfo().setAdjustsStack(true);
35381 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35382 MachineInstrBuilder CallseqStart =
35383 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35384 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35385
35386 // Emit CALLSEQ_END right after the instruction.
35387 // We don't call erase from parent because we want to keep the
35388 // original instruction around.
35389 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35390 MachineInstrBuilder CallseqEnd =
35391 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35392 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35393
35394 return BB;
35395}
35396
35397MachineBasicBlock *
35398X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35399 MachineBasicBlock *BB) const {
35400 // This is pretty easy. We're taking the value that we received from
35401 // our load from the relocation, sticking it in either RDI (x86-64)
35402 // or EAX and doing an indirect call. The return value will then
35403 // be in the normal return register.
35404 MachineFunction *F = BB->getParent();
35405 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35406 const MIMetadata MIMD(MI);
35407
35408 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35409 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35410
35411 // Get a register mask for the lowered call.
35412 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35413 // proper register mask.
35414 const uint32_t *RegMask =
35415 Subtarget.is64Bit() ?
35416 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35417 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35418 if (Subtarget.is64Bit()) {
35419 MachineInstrBuilder MIB =
35420 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35421 .addReg(X86::RIP)
35422 .addImm(0)
35423 .addReg(0)
35424 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35425 MI.getOperand(3).getTargetFlags())
35426 .addReg(0);
35427 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35428 addDirectMem(MIB, X86::RDI);
35429 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35430 } else if (!isPositionIndependent()) {
35431 MachineInstrBuilder MIB =
35432 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35433 .addReg(0)
35434 .addImm(0)
35435 .addReg(0)
35436 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35437 MI.getOperand(3).getTargetFlags())
35438 .addReg(0);
35439 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35440 addDirectMem(MIB, X86::EAX);
35441 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35442 } else {
35443 MachineInstrBuilder MIB =
35444 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35445 .addReg(TII->getGlobalBaseReg(F))
35446 .addImm(0)
35447 .addReg(0)
35448 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35449 MI.getOperand(3).getTargetFlags())
35450 .addReg(0);
35451 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35452 addDirectMem(MIB, X86::EAX);
35453 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35454 }
35455
35456 MI.eraseFromParent(); // The pseudo instruction is gone now.
35457 return BB;
35458}
35459
35460static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35461 switch (RPOpc) {
35462 case X86::INDIRECT_THUNK_CALL32:
35463 return X86::CALLpcrel32;
35464 case X86::INDIRECT_THUNK_CALL64:
35465 return X86::CALL64pcrel32;
35466 case X86::INDIRECT_THUNK_TCRETURN32:
35467 return X86::TCRETURNdi;
35468 case X86::INDIRECT_THUNK_TCRETURN64:
35469 return X86::TCRETURNdi64;
35470 }
35471 llvm_unreachable("not indirect thunk opcode");
35472}
35473
35474static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35475 unsigned Reg) {
35476 if (Subtarget.useRetpolineExternalThunk()) {
35477 // When using an external thunk for retpolines, we pick names that match the
35478 // names GCC happens to use as well. This helps simplify the implementation
35479 // of the thunks for kernels where they have no easy ability to create
35480 // aliases and are doing non-trivial configuration of the thunk's body. For
35481 // example, the Linux kernel will do boot-time hot patching of the thunk
35482 // bodies and cannot easily export aliases of these to loaded modules.
35483 //
35484 // Note that at any point in the future, we may need to change the semantics
35485 // of how we implement retpolines and at that time will likely change the
35486 // name of the called thunk. Essentially, there is no hard guarantee that
35487 // LLVM will generate calls to specific thunks, we merely make a best-effort
35488 // attempt to help out kernels and other systems where duplicating the
35489 // thunks is costly.
35490 switch (Reg) {
35491 case X86::EAX:
35492 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35493 return "__x86_indirect_thunk_eax";
35494 case X86::ECX:
35495 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35496 return "__x86_indirect_thunk_ecx";
35497 case X86::EDX:
35498 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35499 return "__x86_indirect_thunk_edx";
35500 case X86::EDI:
35501 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35502 return "__x86_indirect_thunk_edi";
35503 case X86::R11:
35504 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35505 return "__x86_indirect_thunk_r11";
35506 }
35507 llvm_unreachable("unexpected reg for external indirect thunk");
35508 }
35509
35510 if (Subtarget.useRetpolineIndirectCalls() ||
35511 Subtarget.useRetpolineIndirectBranches()) {
35512 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35513 switch (Reg) {
35514 case X86::EAX:
35515 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35516 return "__llvm_retpoline_eax";
35517 case X86::ECX:
35518 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35519 return "__llvm_retpoline_ecx";
35520 case X86::EDX:
35521 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35522 return "__llvm_retpoline_edx";
35523 case X86::EDI:
35524 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35525 return "__llvm_retpoline_edi";
35526 case X86::R11:
35527 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35528 return "__llvm_retpoline_r11";
35529 }
35530 llvm_unreachable("unexpected reg for retpoline");
35531 }
35532
35533 if (Subtarget.useLVIControlFlowIntegrity()) {
35534 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35535 return "__llvm_lvi_thunk_r11";
35536 }
35537 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35538}
35539
35540MachineBasicBlock *
35541X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35542 MachineBasicBlock *BB) const {
35543 // Copy the virtual register into the R11 physical register and
35544 // call the retpoline thunk.
35545 const MIMetadata MIMD(MI);
35546 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35547 Register CalleeVReg = MI.getOperand(0).getReg();
35548 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35549
35550 // Find an available scratch register to hold the callee. On 64-bit, we can
35551 // just use R11, but we scan for uses anyway to ensure we don't generate
35552 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35553 // already a register use operand to the call to hold the callee. If none
35554 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35555 // register and ESI is the base pointer to realigned stack frames with VLAs.
35556 SmallVector<unsigned, 3> AvailableRegs;
35557 if (Subtarget.is64Bit())
35558 AvailableRegs.push_back(X86::R11);
35559 else
35560 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35561
35562 // Zero out any registers that are already used.
35563 for (const auto &MO : MI.operands()) {
35564 if (MO.isReg() && MO.isUse())
35565 for (unsigned &Reg : AvailableRegs)
35566 if (Reg == MO.getReg())
35567 Reg = 0;
35568 }
35569
35570 // Choose the first remaining non-zero available register.
35571 unsigned AvailableReg = 0;
35572 for (unsigned MaybeReg : AvailableRegs) {
35573 if (MaybeReg) {
35574 AvailableReg = MaybeReg;
35575 break;
35576 }
35577 }
35578 if (!AvailableReg)
35579 report_fatal_error("calling convention incompatible with retpoline, no "
35580 "available registers");
35581
35582 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35583
35584 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35585 .addReg(CalleeVReg);
35586 MI.getOperand(0).ChangeToES(Symbol);
35587 MI.setDesc(TII->get(Opc));
35588 MachineInstrBuilder(*BB->getParent(), &MI)
35589 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35590 return BB;
35591}
35592
35593/// A SetJmp call implies that control flow will change later, when the
35594/// corresponding LongJmp is called.
35595/// Instead of using the 'return' instruction, the long jump fixes the stack and
35596/// performs an indirect branch. To do so it uses the registers that were stored
35597/// in the jump buffer (when calling SetJmp).
35598/// In case the shadow stack is enabled we need to fix it as well, because some
35599/// return addresses will be skipped.
35600/// The function will save the SSP for future fixing in the function
35601/// emitLongJmpShadowStackFix.
35602/// \sa emitLongJmpShadowStackFix
35603/// \param [in] MI The temporary Machine Instruction for the builtin.
35604/// \param [in] MBB The Machine Basic Block that will be modified.
35605void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35606 MachineBasicBlock *MBB) const {
35607 const MIMetadata MIMD(MI);
35608 MachineFunction *MF = MBB->getParent();
35609 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35610 MachineRegisterInfo &MRI = MF->getRegInfo();
35611 MachineInstrBuilder MIB;
35612
35613 // Memory Reference.
35614 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35615 MI.memoperands_end());
35616
35617 // Initialize a register with zero.
35618 MVT PVT = getPointerTy(MF->getDataLayout());
35619 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35620 Register ZReg = MRI.createVirtualRegister(PtrRC);
35621 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35622 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35623 .addDef(ZReg)
35624 .addReg(ZReg, RegState::Undef)
35625 .addReg(ZReg, RegState::Undef);
35626
35627 // Read the current SSP Register value to the zeroed register.
35628 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35629 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35630 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35631
35632 // Write the SSP register value to offset 3 in input memory buffer.
35633 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35634 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35635 const int64_t SSPOffset = 3 * PVT.getStoreSize();
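// Slot layout assumed by this lowering (matching the reloads in
// emitEHSjLjLongJmp): slot 0 = frame pointer, slot 1 = resume address,
// slot 2 = stack pointer, slot 3 = shadow stack pointer.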
35636 const unsigned MemOpndSlot = 1;
35637 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35638 if (i == X86::AddrDisp)
35639 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35640 else
35641 MIB.add(MI.getOperand(MemOpndSlot + i));
35642 }
35643 MIB.addReg(SSPCopyReg);
35644 MIB.setMemRefs(MMOs);
35645}
35646
35647MachineBasicBlock *
35648X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35649 MachineBasicBlock *MBB) const {
35650 const MIMetadata MIMD(MI);
35651 MachineFunction *MF = MBB->getParent();
35652 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35653 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35654 MachineRegisterInfo &MRI = MF->getRegInfo();
35655
35656 const BasicBlock *BB = MBB->getBasicBlock();
35657 MachineFunction::iterator I = ++MBB->getIterator();
35658
35659 // Memory Reference
35660 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35661 MI.memoperands_end());
35662
35663 unsigned DstReg;
35664 unsigned MemOpndSlot = 0;
35665
35666 unsigned CurOp = 0;
35667
35668 DstReg = MI.getOperand(CurOp++).getReg();
35669 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35670 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35671 (void)TRI;
35672 Register mainDstReg = MRI.createVirtualRegister(RC);
35673 Register restoreDstReg = MRI.createVirtualRegister(RC);
35674
35675 MemOpndSlot = CurOp;
35676
35677 MVT PVT = getPointerTy(MF->getDataLayout());
35678 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35679 "Invalid Pointer Size!");
35680
35681 // For v = setjmp(buf), we generate
35682 //
35683 // thisMBB:
35684 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35685 // SjLjSetup restoreMBB
35686 //
35687 // mainMBB:
35688 // v_main = 0
35689 //
35690 // sinkMBB:
35691 // v = phi(main, restore)
35692 //
35693 // restoreMBB:
35694 // if base pointer being used, load it from frame
35695 // v_restore = 1
35696
35697 MachineBasicBlock *thisMBB = MBB;
35698 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35699 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35700 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35701 MF->insert(I, mainMBB);
35702 MF->insert(I, sinkMBB);
35703 MF->push_back(restoreMBB);
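// restoreMBB is entered via an indirect branch from longjmp through the
// address stored in the buffer, so mark its block address as taken; later
// passes must then assume it has predecessors they cannot see.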
35704 restoreMBB->setMachineBlockAddressTaken();
35705
35706 MachineInstrBuilder MIB;
35707
35708 // Transfer the remainder of BB and its successor edges to sinkMBB.
35709 sinkMBB->splice(sinkMBB->begin(), MBB,
35710 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35711 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35712
35713 // thisMBB:
35714 unsigned PtrStoreOpc = 0;
35715 unsigned LabelReg = 0;
35716 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35717 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35718 !isPositionIndependent();
35719
35720 // Prepare IP either in reg or imm.
35721 if (!UseImmLabel) {
35722 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35723 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35724 LabelReg = MRI.createVirtualRegister(PtrRC);
35725 if (Subtarget.is64Bit()) {
35726 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35727 .addReg(X86::RIP)
35728 .addImm(0)
35729 .addReg(0)
35730 .addMBB(restoreMBB)
35731 .addReg(0);
35732 } else {
35733 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35734 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35735 .addReg(XII->getGlobalBaseReg(MF))
35736 .addImm(0)
35737 .addReg(0)
35738 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35739 .addReg(0);
35740 }
35741 } else
35742 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35743 // Store IP
35744 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35745 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35746 if (i == X86::AddrDisp)
35747 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35748 else
35749 MIB.add(MI.getOperand(MemOpndSlot + i));
35750 }
35751 if (!UseImmLabel)
35752 MIB.addReg(LabelReg);
35753 else
35754 MIB.addMBB(restoreMBB);
35755 MIB.setMemRefs(MMOs);
35756
35757 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35758 emitSetJmpShadowStackFix(MI, thisMBB);
35759 }
35760
35761 // Setup
35762 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35763 .addMBB(restoreMBB);
35764
35765 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35766 MIB.addRegMask(RegInfo->getNoPreservedMask());
35767 thisMBB->addSuccessor(mainMBB);
35768 thisMBB->addSuccessor(restoreMBB);
35769
35770 // mainMBB:
35771 // EAX = 0
35772 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35773 mainMBB->addSuccessor(sinkMBB);
35774
35775 // sinkMBB:
35776 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35777 .addReg(mainDstReg)
35778 .addMBB(mainMBB)
35779 .addReg(restoreDstReg)
35780 .addMBB(restoreMBB);
35781
35782 // restoreMBB:
35783 if (RegInfo->hasBasePointer(*MF)) {
35784 const bool Uses64BitFramePtr =
35785 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35786 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35787 X86FI->setRestoreBasePointer(MF);
35788 Register FramePtr = RegInfo->getFrameRegister(*MF);
35789 Register BasePtr = RegInfo->getBaseRegister();
35790 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35791 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35792 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35794 }
35795 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35796 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35797 restoreMBB->addSuccessor(sinkMBB);
35798
35799 MI.eraseFromParent();
35800 return sinkMBB;
35801}
35802
35803/// Fix the shadow stack using the previously saved SSP pointer.
35804/// \sa emitSetJmpShadowStackFix
35805/// \param [in] MI The temporary Machine Instruction for the builtin.
35806/// \param [in] MBB The Machine Basic Block that will be modified.
35807/// \return The sink MBB that will perform the future indirect branch.
35808MachineBasicBlock *
35809X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35810 MachineBasicBlock *MBB) const {
35811 const MIMetadata MIMD(MI);
35812 MachineFunction *MF = MBB->getParent();
35813 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35814 MachineRegisterInfo &MRI = MF->getRegInfo();
35815
35816 // Memory Reference
35817 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35818 MI.memoperands_end());
35819
35820 MVT PVT = getPointerTy(MF->getDataLayout());
35821 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35822
35823 // checkSspMBB:
35824 // xor vreg1, vreg1
35825 // rdssp vreg1
35826 // test vreg1, vreg1
35827 // je sinkMBB # Jump if Shadow Stack is not supported
35828 // fallMBB:
35829 // mov buf+24/12(%rip), vreg2
35830 // sub vreg1, vreg2
35831 // jbe sinkMBB # No need to fix the Shadow Stack
35832 // fixShadowMBB:
35833 // shr 3/2, vreg2
35834 // incssp vreg2 # fix the SSP according to the lower 8 bits
35835 // shr 8, vreg2
35836 // je sinkMBB
35837 // fixShadowLoopPrepareMBB:
35838 // shl vreg2
35839 // mov 128, vreg3
35840 // fixShadowLoopMBB:
35841 // incssp vreg3
35842 // dec vreg2
35843 // jne fixShadowLoopMBB # Iterate until you finish fixing
35844 // # the Shadow Stack
35845 // sinkMBB:
35846
35847 MachineFunction::iterator I = ++MBB->getIterator();
35848 const BasicBlock *BB = MBB->getBasicBlock();
35849
35850 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35851 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35852 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35853 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35854 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35855 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35856 MF->insert(I, checkSspMBB);
35857 MF->insert(I, fallMBB);
35858 MF->insert(I, fixShadowMBB);
35859 MF->insert(I, fixShadowLoopPrepareMBB);
35860 MF->insert(I, fixShadowLoopMBB);
35861 MF->insert(I, sinkMBB);
35862
35863 // Transfer the remainder of BB and its successor edges to sinkMBB.
35864 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35865 MBB->end());
35866 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35867
35868 MBB->addSuccessor(checkSspMBB);
35869
35870 // Initialize a register with zero.
35871 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35872 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35873
35874 if (PVT == MVT::i64) {
35875 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35876 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35877 .addImm(0)
35878 .addReg(ZReg)
35879 .addImm(X86::sub_32bit);
35880 ZReg = TmpZReg;
35881 }
35882
35883 // Read the current SSP Register value to the zeroed register.
35884 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35885 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35886 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35887
35888 // Check whether the result of the SSP register is zero and jump directly
35889 // to the sink.
35890 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35891 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35892 .addReg(SSPCopyReg)
35893 .addReg(SSPCopyReg);
35894 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35895 .addMBB(sinkMBB)
35896 .addImm(X86::COND_E);
35897 checkSspMBB->addSuccessor(sinkMBB);
35898 checkSspMBB->addSuccessor(fallMBB);
35899
35900 // Reload the previously saved SSP register value.
35901 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35902 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35903 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35904 MachineInstrBuilder MIB =
35905 BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35906 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35907 const MachineOperand &MO = MI.getOperand(i);
35908 if (i == X86::AddrDisp)
35909 MIB.addDisp(MO, SPPOffset);
35910 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35911 // preserve kill flags.
35912 MIB.addReg(MO.getReg());
35913 else
35914 MIB.add(MO);
35915 }
35916 MIB.setMemRefs(MMOs);
35917
35918 // Subtract the current SSP from the previous SSP.
35919 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35920 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35921 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35922 .addReg(PrevSSPReg)
35923 .addReg(SSPCopyReg);
35924
35925 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35926 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35927 .addMBB(sinkMBB)
35928 .addImm(X86::COND_BE);
35929 fallMBB->addSuccessor(sinkMBB);
35930 fallMBB->addSuccessor(fixShadowMBB);
35931
35932 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
35933 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35934 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35935 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35936 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35937 .addReg(SspSubReg)
35938 .addImm(Offset);
35939
35940 // Increase the SSP, looking only at the lower 8 bits of the delta.
35941 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35942 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35943
35944 // Shift out the lower 8 bits that incssp just consumed.
35945 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35946 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35947 .addReg(SspFirstShrReg)
35948 .addImm(8);
35949
35950 // Jump if the result of the shift is zero.
35951 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35952 .addMBB(sinkMBB)
35953 .addImm(X86::COND_E);
35954 fixShadowMBB->addSuccessor(sinkMBB);
35955 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35956
35957 // Do a single shift left.
35958 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35959 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35960 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35961 .addReg(SspSecondShrReg)
35962 .addImm(1);
35963
35964 // Save the value 128 to a register (will be used next with incssp).
35965 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35966 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35967 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35968 .addImm(128);
35969 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35970
35971 // Since incssp only looks at the lower 8 bits, we might need to do several
35972 // iterations of incssp until we finish fixing the shadow stack.
35973 Register DecReg = MRI.createVirtualRegister(PtrRC);
35974 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35975 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35976 .addReg(SspAfterShlReg)
35977 .addMBB(fixShadowLoopPrepareMBB)
35978 .addReg(DecReg)
35979 .addMBB(fixShadowLoopMBB);
35980
35981 // Every iteration we increase the SSP by 128.
35982 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35983
35984 // Every iteration we decrement the counter by 1.
35985 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35986 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35987
35988 // Jump if the counter is not zero yet.
35989 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35990 .addMBB(fixShadowLoopMBB)
35991 .addImm(X86::COND_NE);
35992 fixShadowLoopMBB->addSuccessor(sinkMBB);
35993 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35994
35995 return sinkMBB;
35996}
35997
35998MachineBasicBlock *
35999X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36000 MachineBasicBlock *MBB) const {
36001 const MIMetadata MIMD(MI);
36002 MachineFunction *MF = MBB->getParent();
36003 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36004 MachineRegisterInfo &MRI = MF->getRegInfo();
36005
36006 // Memory Reference
36007 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36008 MI.memoperands_end());
36009
36010 MVT PVT = getPointerTy(MF->getDataLayout());
36011 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36012 "Invalid Pointer Size!");
36013
36014 const TargetRegisterClass *RC =
36015 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36016 Register Tmp = MRI.createVirtualRegister(RC);
36017 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36018 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36019 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36020 Register SP = RegInfo->getStackRegister();
36021
36022 MachineInstrBuilder MIB;
36023
36024 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36025 const int64_t SPOffset = 2 * PVT.getStoreSize();
36026
36027 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36028 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36029
36030 MachineBasicBlock *thisMBB = MBB;
36031
36032 // When CET shadow stacks are enabled, we need to fix the Shadow Stack.
36033 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36034 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36035 }
36036
36037 // Reload FP
36038 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
36039 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36040 const MachineOperand &MO = MI.getOperand(i);
36041 if (MO.isReg()) // Don't add the whole operand, we don't want to
36042 // preserve kill flags.
36043 MIB.addReg(MO.getReg());
36044 else
36045 MIB.add(MO);
36046 }
36047 MIB.setMemRefs(MMOs);
36048
36049 // Reload IP
36050 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
36051 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36052 const MachineOperand &MO = MI.getOperand(i);
36053 if (i == X86::AddrDisp)
36054 MIB.addDisp(MO, LabelOffset);
36055 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36056 // preserve kill flags.
36057 MIB.addReg(MO.getReg());
36058 else
36059 MIB.add(MO);
36060 }
36061 MIB.setMemRefs(MMOs);
36062
36063 // Reload SP
36064 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36065 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36066 if (i == X86::AddrDisp)
36067 MIB.addDisp(MI.getOperand(i), SPOffset);
36068 else
36069 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36070 // the last instruction of the expansion.
36071 }
36072 MIB.setMemRefs(MMOs);
36073
36074 // Jump
36075 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36076
36077 MI.eraseFromParent();
36078 return thisMBB;
36079}
36080
36081void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36082 MachineBasicBlock *MBB,
36083 MachineBasicBlock *DispatchBB,
36084 int FI) const {
36085 const MIMetadata MIMD(MI);
36086 MachineFunction *MF = MBB->getParent();
36087 MachineRegisterInfo *MRI = &MF->getRegInfo();
36088 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36089
36090 MVT PVT = getPointerTy(MF->getDataLayout());
36091 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36092
36093 unsigned Op = 0;
36094 unsigned VR = 0;
36095
36096 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36097 !isPositionIndependent();
36098
36099 if (UseImmLabel) {
36100 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36101 } else {
36102 const TargetRegisterClass *TRC =
36103 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36104 VR = MRI->createVirtualRegister(TRC);
36105 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36106
36107 if (Subtarget.is64Bit())
36108 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36109 .addReg(X86::RIP)
36110 .addImm(1)
36111 .addReg(0)
36112 .addMBB(DispatchBB)
36113 .addReg(0);
36114 else
36115 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36116 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36117 .addImm(1)
36118 .addReg(0)
36119 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36120 .addReg(0);
36121 }
36122
36123 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36124 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
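// Store the address that the SjLj runtime should dispatch through into the
// function context spilled at frame index FI; the 56/36 byte offset is
// presumably the context's resume-address slot on 64-bit/32-bit targets.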
36125 if (UseImmLabel)
36126 MIB.addMBB(DispatchBB);
36127 else
36128 MIB.addReg(VR);
36129}
36130
36131MachineBasicBlock *
36132X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36133 MachineBasicBlock *BB) const {
36134 const MIMetadata MIMD(MI);
36135 MachineFunction *MF = BB->getParent();
36136 MachineRegisterInfo *MRI = &MF->getRegInfo();
36137 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36138 int FI = MF->getFrameInfo().getFunctionContextIndex();
36139
36140 // Get a mapping of the call site numbers to all of the landing pads they're
36141 // associated with.
36142 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36143 unsigned MaxCSNum = 0;
36144 for (auto &MBB : *MF) {
36145 if (!MBB.isEHPad())
36146 continue;
36147
36148 MCSymbol *Sym = nullptr;
36149 for (const auto &MI : MBB) {
36150 if (MI.isDebugInstr())
36151 continue;
36152
36153 assert(MI.isEHLabel() && "expected EH_LABEL");
36154 Sym = MI.getOperand(0).getMCSymbol();
36155 break;
36156 }
36157
36158 if (!MF->hasCallSiteLandingPad(Sym))
36159 continue;
36160
36161 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36162 CallSiteNumToLPad[CSI].push_back(&MBB);
36163 MaxCSNum = std::max(MaxCSNum, CSI);
36164 }
36165 }
36166
36167 // Get an ordered list of the machine basic blocks for the jump table.
36168 std::vector<MachineBasicBlock *> LPadList;
36169 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
36170 LPadList.reserve(CallSiteNumToLPad.size());
36171
36172 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36173 for (auto &LP : CallSiteNumToLPad[CSI]) {
36174 LPadList.push_back(LP);
36175 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36176 }
36177 }
36178
36179 assert(!LPadList.empty() &&
36180 "No landing pad destinations for the dispatch jump table!");
36181
36182 // Create the MBBs for the dispatch code.
36183
36184 // Shove the dispatch's address into the return slot in the function context.
36185 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36186 DispatchBB->setIsEHPad(true);
36187
36188 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36189 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36190 DispatchBB->addSuccessor(TrapBB);
36191
36192 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36193 DispatchBB->addSuccessor(DispContBB);
36194
36195 // Insert MBBs.
36196 MF->push_back(DispatchBB);
36197 MF->push_back(DispContBB);
36198 MF->push_back(TrapBB);
36199
36200 // Insert code into the entry block that creates and registers the function
36201 // context.
36202 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36203
36204 // Create the jump table and associated information
36205 unsigned JTE = getJumpTableEncoding();
36206 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36207 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36208
36209 const X86RegisterInfo &RI = TII->getRegisterInfo();
36210 // Add a register mask with no preserved registers. This results in all
36211 // registers being marked as clobbered.
36212 if (RI.hasBasePointer(*MF)) {
36213 const bool FPIs64Bit =
36214 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36215 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36216 MFI->setRestoreBasePointer(MF);
36217
36218 Register FP = RI.getFrameRegister(*MF);
36219 Register BP = RI.getBaseRegister();
36220 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36221 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36222 MFI->getRestoreBasePointerOffset())
36223 .addRegMask(RI.getNoPreservedMask());
36224 } else {
36225 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36226 .addRegMask(RI.getNoPreservedMask());
36227 }
36228
36229 // IReg is used as an index in a memory operand and therefore can't be SP
36230 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36231 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36232 Subtarget.is64Bit() ? 8 : 4);
36233 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36234 .addReg(IReg)
36235 .addImm(LPadList.size());
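// Any call-site index at or above LPadList.size() is out of range and is sent
// to the trap block by the unsigned compare-and-branch below.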
36236 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36237 .addMBB(TrapBB)
36238 .addImm(X86::COND_AE);
36239
36240 if (Subtarget.is64Bit()) {
36241 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36242 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36243
36244 // leaq .LJTI0_0(%rip), BReg
36245 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36246 .addReg(X86::RIP)
36247 .addImm(1)
36248 .addReg(0)
36249 .addJumpTableIndex(MJTI)
36250 .addReg(0);
36251 // movzx IReg64, IReg
36252 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36253 .addImm(0)
36254 .addReg(IReg)
36255 .addImm(X86::sub_32bit);
36256
36257 switch (JTE) {
36258 case MachineJumpTableInfo::EK_BlockAddress:
36259 // jmpq *(BReg,IReg64,8)
36260 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36261 .addReg(BReg)
36262 .addImm(8)
36263 .addReg(IReg64)
36264 .addImm(0)
36265 .addReg(0);
36266 break;
36267 case MachineJumpTableInfo::EK_LabelDifference32: {
36268 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36269 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36270 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36271
36272 // movl (BReg,IReg64,4), OReg
36273 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36274 .addReg(BReg)
36275 .addImm(4)
36276 .addReg(IReg64)
36277 .addImm(0)
36278 .addReg(0);
36279 // movsx OReg64, OReg
36280 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36281 .addReg(OReg);
36282 // addq BReg, OReg64, TReg
36283 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36284 .addReg(OReg64)
36285 .addReg(BReg);
36286 // jmpq *TReg
36287 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36288 break;
36289 }
36290 default:
36291 llvm_unreachable("Unexpected jump table encoding");
36292 }
36293 } else {
36294 // jmpl *.LJTI0_0(,IReg,4)
36295 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36296 .addReg(0)
36297 .addImm(4)
36298 .addReg(IReg)
36299 .addJumpTableIndex(MJTI)
36300 .addReg(0);
36301 }
36302
36303 // Add the jump table entries as successors to the MBB.
36304 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36305 for (auto &LP : LPadList)
36306 if (SeenMBBs.insert(LP).second)
36307 DispContBB->addSuccessor(LP);
36308
36309 // N.B. the order the invoke BBs are processed in doesn't matter here.
36310 SmallVector<MachineBasicBlock *, 64> MBBLPads;
36311 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36312 for (MachineBasicBlock *MBB : InvokeBBs) {
36313 // Remove the landing pad successor from the invoke block and replace it
36314 // with the new dispatch block.
36315 // Keep a copy of Successors since it's modified inside the loop.
36316     SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
36317 MBB->succ_rend());
36318 // FIXME: Avoid quadratic complexity.
36319 for (auto *MBBS : Successors) {
36320 if (MBBS->isEHPad()) {
36321 MBB->removeSuccessor(MBBS);
36322 MBBLPads.push_back(MBBS);
36323 }
36324 }
36325
36326 MBB->addSuccessor(DispatchBB);
36327
36328     // Find the invoke call and mark all of the callee-saved registers as
36329     // 'implicitly defined' so that they're spilled. This prevents code from
36330     // moving instructions to before the EH block, where they will never be
36331     // executed.
36332 for (auto &II : reverse(*MBB)) {
36333 if (!II.isCall())
36334 continue;
36335
36336       DenseMap<unsigned, bool> DefRegs;
36337 for (auto &MOp : II.operands())
36338 if (MOp.isReg())
36339 DefRegs[MOp.getReg()] = true;
36340
36341 MachineInstrBuilder MIB(*MF, &II);
36342 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36343 unsigned Reg = SavedRegs[RegIdx];
36344 if (!DefRegs[Reg])
36345           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36346 }
36347
36348 break;
36349 }
36350 }
36351
36352 // Mark all former landing pads as non-landing pads. The dispatch is the only
36353 // landing pad now.
36354 for (auto &LP : MBBLPads)
36355 LP->setIsEHPad(false);
36356
36357 // The instruction is gone now.
36358 MI.eraseFromParent();
36359 return BB;
36360}
36361
36362 MachineBasicBlock *
36363 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36364 MachineBasicBlock *BB) const {
36365 MachineFunction *MF = BB->getParent();
36366 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36367 const MIMetadata MIMD(MI);
36368
36369 auto TMMImmToTMMReg = [](unsigned Imm) {
36370 assert (Imm < 8 && "Illegal tmm index");
36371 return X86::TMM0 + Imm;
36372 };
36373 switch (MI.getOpcode()) {
36374 default: llvm_unreachable("Unexpected instr type to insert");
36375 case X86::TLS_addr32:
36376 case X86::TLS_addr64:
36377 case X86::TLS_addrX32:
36378 case X86::TLS_base_addr32:
36379 case X86::TLS_base_addr64:
36380 case X86::TLS_base_addrX32:
36381 case X86::TLS_desc32:
36382 case X86::TLS_desc64:
36383 return EmitLoweredTLSAddr(MI, BB);
36384 case X86::INDIRECT_THUNK_CALL32:
36385 case X86::INDIRECT_THUNK_CALL64:
36386 case X86::INDIRECT_THUNK_TCRETURN32:
36387 case X86::INDIRECT_THUNK_TCRETURN64:
36388 return EmitLoweredIndirectThunk(MI, BB);
36389 case X86::CATCHRET:
36390 return EmitLoweredCatchRet(MI, BB);
36391 case X86::SEG_ALLOCA_32:
36392 case X86::SEG_ALLOCA_64:
36393 return EmitLoweredSegAlloca(MI, BB);
36394 case X86::PROBED_ALLOCA_32:
36395 case X86::PROBED_ALLOCA_64:
36396 return EmitLoweredProbedAlloca(MI, BB);
36397 case X86::TLSCall_32:
36398 case X86::TLSCall_64:
36399 return EmitLoweredTLSCall(MI, BB);
36400 case X86::CMOV_FR16:
36401 case X86::CMOV_FR16X:
36402 case X86::CMOV_FR32:
36403 case X86::CMOV_FR32X:
36404 case X86::CMOV_FR64:
36405 case X86::CMOV_FR64X:
36406 case X86::CMOV_GR8:
36407 case X86::CMOV_GR16:
36408 case X86::CMOV_GR32:
36409 case X86::CMOV_RFP32:
36410 case X86::CMOV_RFP64:
36411 case X86::CMOV_RFP80:
36412 case X86::CMOV_VR64:
36413 case X86::CMOV_VR128:
36414 case X86::CMOV_VR128X:
36415 case X86::CMOV_VR256:
36416 case X86::CMOV_VR256X:
36417 case X86::CMOV_VR512:
36418 case X86::CMOV_VK1:
36419 case X86::CMOV_VK2:
36420 case X86::CMOV_VK4:
36421 case X86::CMOV_VK8:
36422 case X86::CMOV_VK16:
36423 case X86::CMOV_VK32:
36424 case X86::CMOV_VK64:
36425 return EmitLoweredSelect(MI, BB);
36426
36427 case X86::FP80_ADDr:
36428 case X86::FP80_ADDm32: {
36429 // Change the floating point control register to use double extended
36430 // precision when performing the addition.
36431 int OrigCWFrameIdx =
36432 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36433 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36434 OrigCWFrameIdx);
36435
36436 // Load the old value of the control word...
36437 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36438 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36439 OrigCWFrameIdx);
36440
36441     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
36442 // precision.
36443 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36444 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36445 .addReg(OldCW, RegState::Kill)
36446 .addImm(0x300);
36447
36448 // Extract to 16 bits.
36449 Register NewCW16 =
36450 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36451 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36452 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36453
36454 // Prepare memory for FLDCW.
36455 int NewCWFrameIdx =
36456 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36457 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36458 NewCWFrameIdx)
36459 .addReg(NewCW16, RegState::Kill);
36460
36461 // Reload the modified control word now...
36462 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36463 NewCWFrameIdx);
36464
36465 // Do the addition.
36466 if (MI.getOpcode() == X86::FP80_ADDr) {
36467 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36468 .add(MI.getOperand(0))
36469 .add(MI.getOperand(1))
36470 .add(MI.getOperand(2));
36471 } else {
36472 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36473 .add(MI.getOperand(0))
36474 .add(MI.getOperand(1))
36475 .add(MI.getOperand(2))
36476 .add(MI.getOperand(3))
36477 .add(MI.getOperand(4))
36478 .add(MI.getOperand(5))
36479 .add(MI.getOperand(6));
36480 }
36481
36482 // Reload the original control word now.
36483 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36484 OrigCWFrameIdx);
36485
36486 MI.eraseFromParent(); // The pseudo instruction is gone now.
36487 return BB;
36488 }
36489
36490 case X86::FP32_TO_INT16_IN_MEM:
36491 case X86::FP32_TO_INT32_IN_MEM:
36492 case X86::FP32_TO_INT64_IN_MEM:
36493 case X86::FP64_TO_INT16_IN_MEM:
36494 case X86::FP64_TO_INT32_IN_MEM:
36495 case X86::FP64_TO_INT64_IN_MEM:
36496 case X86::FP80_TO_INT16_IN_MEM:
36497 case X86::FP80_TO_INT32_IN_MEM:
36498 case X86::FP80_TO_INT64_IN_MEM: {
36499 // Change the floating point control register to use "round towards zero"
36500 // mode when truncating to an integer value.
36501 int OrigCWFrameIdx =
36502 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36503 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36504 OrigCWFrameIdx);
36505
36506 // Load the old value of the control word...
36507 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36508 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36509 OrigCWFrameIdx);
36510
36511     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
36512 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36513 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36514 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36515
36516 // Extract to 16 bits.
36517 Register NewCW16 =
36518 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36519 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36520 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36521
36522 // Prepare memory for FLDCW.
36523 int NewCWFrameIdx =
36524 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36525 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36526 NewCWFrameIdx)
36527 .addReg(NewCW16, RegState::Kill);
36528
36529 // Reload the modified control word now...
36530 addFrameReference(BuildMI(*BB, MI, MIMD,
36531 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36532
36533 // Get the X86 opcode to use.
36534 unsigned Opc;
36535 switch (MI.getOpcode()) {
36536 // clang-format off
36537 default: llvm_unreachable("illegal opcode!");
36538 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36539 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36540 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36541 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36542 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36543 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36544 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36545 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36546 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36547 // clang-format on
36548 }
36549
36550     X86AddressMode AM = getAddressFromInstr(&MI, 0);
36551 addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36552 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36553
36554 // Reload the original control word now.
36555 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36556 OrigCWFrameIdx);
36557
36558 MI.eraseFromParent(); // The pseudo instruction is gone now.
36559 return BB;
36560 }
36561
36562 // xbegin
36563 case X86::XBEGIN:
36564 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36565
36566 case X86::VAARG_64:
36567 case X86::VAARG_X32:
36568 return EmitVAARGWithCustomInserter(MI, BB);
36569
36570 case X86::EH_SjLj_SetJmp32:
36571 case X86::EH_SjLj_SetJmp64:
36572 return emitEHSjLjSetJmp(MI, BB);
36573
36574 case X86::EH_SjLj_LongJmp32:
36575 case X86::EH_SjLj_LongJmp64:
36576 return emitEHSjLjLongJmp(MI, BB);
36577
36578 case X86::Int_eh_sjlj_setup_dispatch:
36579 return EmitSjLjDispatchBlock(MI, BB);
36580
36581 case TargetOpcode::STATEPOINT:
36582 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36583 // this point in the process. We diverge later.
36584 return emitPatchPoint(MI, BB);
36585
36586 case TargetOpcode::STACKMAP:
36587 case TargetOpcode::PATCHPOINT:
36588 return emitPatchPoint(MI, BB);
36589
36590 case TargetOpcode::PATCHABLE_EVENT_CALL:
36591 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36592 return BB;
36593
36594 case X86::LCMPXCHG8B: {
36595 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36596     // In addition to the four E[ABCD] registers implied by the encoding,
36597     // CMPXCHG8B requires a memory operand. If the current architecture is
36598     // i686 and the current function needs a base pointer - which is ESI on
36599     // i686 - the register allocator would not be able to allocate registers
36600     // for an address of the form X(%reg, %reg, Y): there would never be
36601     // enough unreserved registers during regalloc (without the base pointer
36602     // the only option would be X(%edi, %esi, Y)).
36603     // We give the register allocator a hand by precomputing the address in
36604     // a new vreg using LEA.
36605
36606 // If it is not i686 or there is no base pointer - nothing to do here.
36607 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36608 return BB;
36609
36610     // Even though this code does not necessarily need the base pointer to
36611     // be ESI, we check for that. The reason: if this assert fails, something
36612     // has changed in the compiler's base pointer handling, and that change
36613     // most likely needs to be addressed here as well.
36614 assert(TRI->getBaseRegister() == X86::ESI &&
36615 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36616 "base pointer in mind");
36617
36618     MachineRegisterInfo &MRI = MF->getRegInfo();
36619 MVT SPTy = getPointerTy(MF->getDataLayout());
36620 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36621 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36622
36623     X86AddressMode AM = getAddressFromInstr(&MI, 0);
36624 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36625 // does not use index register.
36626 if (AM.IndexReg == X86::NoRegister)
36627 return BB;
36628
36629 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36630 // four operand definitions that are E[ABCD] registers. We skip them and
36631 // then insert the LEA.
36632 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36633 while (RMBBI != BB->rend() &&
36634 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36635 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36636 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36637 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36638 ++RMBBI;
36639 }
36640     MachineBasicBlock::iterator MBBI(RMBBI);
36641     addFullAddress(
36642 BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36643
36644 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36645
36646 return BB;
36647 }
36648 case X86::LCMPXCHG16B_NO_RBX: {
36649 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36650 Register BasePtr = TRI->getBaseRegister();
36651 if (TRI->hasBasePointer(*MF) &&
36652 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36653 if (!BB->isLiveIn(BasePtr))
36654 BB->addLiveIn(BasePtr);
36655 // Save RBX into a virtual register.
36656 Register SaveRBX =
36657 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36658 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36659 .addReg(X86::RBX);
36660 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36661       MachineInstrBuilder MIB =
36662 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36663 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36664 MIB.add(MI.getOperand(Idx));
36665 MIB.add(MI.getOperand(X86::AddrNumOperands));
36666 MIB.addReg(SaveRBX);
36667 } else {
36668 // Simple case, just copy the virtual register to RBX.
36669 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36670 .add(MI.getOperand(X86::AddrNumOperands));
36671       MachineInstrBuilder MIB =
36672 BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36673 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36674 MIB.add(MI.getOperand(Idx));
36675 }
36676 MI.eraseFromParent();
36677 return BB;
36678 }
36679 case X86::MWAITX: {
36680 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36681 Register BasePtr = TRI->getBaseRegister();
36682 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36683     // If there is no need to save the base pointer, we generate MWAITXrrr;
36684     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
36685 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36686 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36687 .addReg(MI.getOperand(0).getReg());
36688 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36689 .addReg(MI.getOperand(1).getReg());
36690 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36691 .addReg(MI.getOperand(2).getReg());
36692 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36693 MI.eraseFromParent();
36694 } else {
36695 if (!BB->isLiveIn(BasePtr)) {
36696 BB->addLiveIn(BasePtr);
36697 }
36698 // Parameters can be copied into ECX and EAX but not EBX yet.
36699 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36700 .addReg(MI.getOperand(0).getReg());
36701 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36702 .addReg(MI.getOperand(1).getReg());
36703 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36704 // Save RBX into a virtual register.
36705 Register SaveRBX =
36706 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36707 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36708 .addReg(X86::RBX);
36709 // Generate mwaitx pseudo.
36710 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36711 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36712 .addDef(Dst) // Destination tied in with SaveRBX.
36713 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36714 .addUse(SaveRBX); // Save of base pointer.
36715 MI.eraseFromParent();
36716 }
36717 return BB;
36718 }
36719 case TargetOpcode::PREALLOCATED_SETUP: {
36720 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36721 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36722 MFI->setHasPreallocatedCall(true);
36723 int64_t PreallocatedId = MI.getOperand(0).getImm();
36724 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36725 assert(StackAdjustment != 0 && "0 stack adjustment");
36726 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36727 << StackAdjustment << "\n");
36728 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36729 .addReg(X86::ESP)
36730 .addImm(StackAdjustment);
36731 MI.eraseFromParent();
36732 return BB;
36733 }
36734 case TargetOpcode::PREALLOCATED_ARG: {
36735 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36736 int64_t PreallocatedId = MI.getOperand(1).getImm();
36737 int64_t ArgIdx = MI.getOperand(2).getImm();
36738 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36739 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36740 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36741 << ", arg offset " << ArgOffset << "\n");
36742 // stack pointer + offset
36743 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36744 MI.getOperand(0).getReg()),
36745 X86::ESP, false, ArgOffset);
36746 MI.eraseFromParent();
36747 return BB;
36748 }
36749 case X86::PTDPBSSD:
36750 case X86::PTDPBSUD:
36751 case X86::PTDPBUSD:
36752 case X86::PTDPBUUD:
36753 case X86::PTDPBF16PS:
36754 case X86::PTDPFP16PS: {
36755 unsigned Opc;
36756 switch (MI.getOpcode()) {
36757 // clang-format off
36758 default: llvm_unreachable("illegal opcode!");
36759 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36760 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36761 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36762 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36763 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36764 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36765 // clang-format on
36766 }
36767
36768 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36769 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36770 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36771 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36772 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36773
36774 MI.eraseFromParent(); // The pseudo is gone now.
36775 return BB;
36776 }
36777 case X86::PTILEZERO: {
36778 unsigned Imm = MI.getOperand(0).getImm();
36779 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36780 MI.eraseFromParent(); // The pseudo is gone now.
36781 return BB;
36782 }
36783 case X86::PTILELOADD:
36784 case X86::PTILELOADDT1:
36785 case X86::PTILESTORED: {
36786 unsigned Opc;
36787 switch (MI.getOpcode()) {
36788 default: llvm_unreachable("illegal opcode!");
36789#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36790 case X86::PTILELOADD:
36791 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36792 break;
36793 case X86::PTILELOADDT1:
36794 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36795 break;
36796 case X86::PTILESTORED:
36797 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36798 break;
36799#undef GET_EGPR_IF_ENABLED
36800 }
36801
36802 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36803 unsigned CurOp = 0;
36804 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36805 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36806                  RegState::Define);
36807
36808 MIB.add(MI.getOperand(CurOp++)); // base
36809 MIB.add(MI.getOperand(CurOp++)); // scale
36810 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36811 MIB.add(MI.getOperand(CurOp++)); // displacement
36812 MIB.add(MI.getOperand(CurOp++)); // segment
36813
36814 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36815 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36816                  RegState::Undef);
36817
36818 MI.eraseFromParent(); // The pseudo is gone now.
36819 return BB;
36820 }
36821 case X86::PTCMMIMFP16PS:
36822 case X86::PTCMMRLFP16PS: {
36823 const MIMetadata MIMD(MI);
36824 unsigned Opc;
36825 switch (MI.getOpcode()) {
36826 // clang-format off
36827 default: llvm_unreachable("Unexpected instruction!");
36828 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36829 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36830 // clang-format on
36831 }
36832 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36833 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36834 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36835 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36836 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36837 MI.eraseFromParent(); // The pseudo is gone now.
36838 return BB;
36839 }
36840 }
36841}
36842
36843//===----------------------------------------------------------------------===//
36844// X86 Optimization Hooks
36845//===----------------------------------------------------------------------===//
36846
36847bool
36848 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36849 const APInt &DemandedBits,
36850 const APInt &DemandedElts,
36851 TargetLoweringOpt &TLO) const {
36852 EVT VT = Op.getValueType();
36853 unsigned Opcode = Op.getOpcode();
36854 unsigned EltSize = VT.getScalarSizeInBits();
36855
36856 if (VT.isVector()) {
36857     // If the constant is only all signbits in the active bits, then we should
36858     // extend it to the entire constant to allow it to act as a boolean constant
36859     // vector.
36860 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36861 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36862 return false;
36863 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36864 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36865 continue;
36866 const APInt &Val = V.getConstantOperandAPInt(i);
36867 if (Val.getBitWidth() > Val.getNumSignBits() &&
36868 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36869 return true;
36870 }
36871 return false;
36872 };
36873 // For vectors - if we have a constant, then try to sign extend.
36874 // TODO: Handle AND cases.
36875 unsigned ActiveBits = DemandedBits.getActiveBits();
36876 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36877 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36878 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36879 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36880 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36881                                    VT.getVectorNumElements());
36882 SDValue NewC =
36883           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36884 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36885 SDValue NewOp =
36886 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36887 return TLO.CombineTo(Op, NewOp);
36888 }
36889 return false;
36890 }
36891
36892 // Only optimize Ands to prevent shrinking a constant that could be
36893 // matched by movzx.
36894 if (Opcode != ISD::AND)
36895 return false;
36896
36897 // Make sure the RHS really is a constant.
36898 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36899 if (!C)
36900 return false;
36901
36902 const APInt &Mask = C->getAPIntValue();
36903
36904 // Clear all non-demanded bits initially.
36905 APInt ShrunkMask = Mask & DemandedBits;
36906
36907 // Find the width of the shrunk mask.
36908 unsigned Width = ShrunkMask.getActiveBits();
36909
36910 // If the mask is all 0s there's nothing to do here.
36911 if (Width == 0)
36912 return false;
36913
36914 // Find the next power of 2 width, rounding up to a byte.
36915 Width = llvm::bit_ceil(std::max(Width, 8U));
36916 // Truncate the width to size to handle illegal types.
36917 Width = std::min(Width, EltSize);
36918
36919 // Calculate a possible zero extend mask for this constant.
36920 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
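       // Illustrative example: with EltSize = 32, Mask = 0x1FF and
       // DemandedBits = 0xFF, the shrunk mask has active width 8, so
       // ZeroExtendMask = 0xFF; that is a subset of Mask | ~DemandedBits, so
       // the constant becomes 0xFF and the AND can be matched as a movzx.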
36921
36922 // If we aren't changing the mask, just return true to keep it and prevent
36923 // the caller from optimizing.
36924 if (ZeroExtendMask == Mask)
36925 return true;
36926
36927 // Make sure the new mask can be represented by a combination of mask bits
36928 // and non-demanded bits.
36929 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36930 return false;
36931
36932 // Replace the constant with the zero extend mask.
36933 SDLoc DL(Op);
36934 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36935 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36936 return TLO.CombineTo(Op, NewOp);
36937}
36938
36939 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
36940 KnownBits &Known,
36941 const APInt &DemandedElts,
36942 const SelectionDAG &DAG, unsigned Depth) {
36943 KnownBits Known2;
36944 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
36945 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
36946 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
36947 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
36948 Known = KnownBits::abdu(Known, Known2).zext(16);
36949 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
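  // Each 64-bit element is the sum of eight byte absolute differences, so it
  // is at most 8 * 255 = 2040 and bits 11 and above are always known zero;
  // the three add steps model the pairwise reduction tree above.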
36950 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36951 Known, Known);
36952 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36953 Known, Known);
36954 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36955 Known, Known);
36956 Known = Known.zext(64);
36957}
36958
36959 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36960 KnownBits &Known,
36961 const APInt &DemandedElts,
36962 const SelectionDAG &DAG,
36963 unsigned Depth) const {
36964 unsigned BitWidth = Known.getBitWidth();
36965 unsigned NumElts = DemandedElts.getBitWidth();
36966 unsigned Opc = Op.getOpcode();
36967 EVT VT = Op.getValueType();
36968 assert((Opc >= ISD::BUILTIN_OP_END ||
36969 Opc == ISD::INTRINSIC_WO_CHAIN ||
36970 Opc == ISD::INTRINSIC_W_CHAIN ||
36971 Opc == ISD::INTRINSIC_VOID) &&
36972 "Should use MaskedValueIsZero if you don't know whether Op"
36973 " is a target node!");
36974
36975 Known.resetAll();
36976 switch (Opc) {
36977 default: break;
36978 case X86ISD::MUL_IMM: {
36979 KnownBits Known2;
36980 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36981 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36982 Known = KnownBits::mul(Known, Known2);
36983 break;
36984 }
36985 case X86ISD::SETCC:
36986 Known.Zero.setBitsFrom(1);
36987 break;
36988 case X86ISD::MOVMSK: {
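    // MOVMSK packs one bit per source element, so e.g. a v4f32 source can only
    // set the low 4 bits of the i32 result; everything above is known zero.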
36989 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36990 Known.Zero.setBitsFrom(NumLoBits);
36991 break;
36992 }
36993 case X86ISD::PEXTRB:
36994 case X86ISD::PEXTRW: {
36995 SDValue Src = Op.getOperand(0);
36996 EVT SrcVT = Src.getValueType();
36997 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36998 Op.getConstantOperandVal(1));
36999 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37000 Known = Known.anyextOrTrunc(BitWidth);
37001 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37002 break;
37003 }
37004 case X86ISD::VSRAI:
37005 case X86ISD::VSHLI:
37006 case X86ISD::VSRLI: {
37007 unsigned ShAmt = Op.getConstantOperandVal(1);
37008 if (ShAmt >= VT.getScalarSizeInBits()) {
37009 // Out of range logical bit shifts are guaranteed to be zero.
37010 // Out of range arithmetic bit shifts splat the sign bit.
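      // For example, a v8i16 VSRLI by 20 is known to produce zero, while a
      // v8i16 VSRAI by 20 is clamped to a shift of 15, i.e. a sign splat.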
37011 if (Opc != X86ISD::VSRAI) {
37012 Known.setAllZero();
37013 break;
37014 }
37015
37016 ShAmt = VT.getScalarSizeInBits() - 1;
37017 }
37018
37019 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37020 if (Opc == X86ISD::VSHLI) {
37021 Known.Zero <<= ShAmt;
37022 Known.One <<= ShAmt;
37023 // Low bits are known zero.
37024 Known.Zero.setLowBits(ShAmt);
37025 } else if (Opc == X86ISD::VSRLI) {
37026 Known.Zero.lshrInPlace(ShAmt);
37027 Known.One.lshrInPlace(ShAmt);
37028 // High bits are known zero.
37029 Known.Zero.setHighBits(ShAmt);
37030 } else {
37031 Known.Zero.ashrInPlace(ShAmt);
37032 Known.One.ashrInPlace(ShAmt);
37033 }
37034 break;
37035 }
37036 case X86ISD::PACKUS: {
37037 // PACKUS is just a truncation if the upper half is zero.
37038 APInt DemandedLHS, DemandedRHS;
37039 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37040
37041 Known.One = APInt::getAllOnes(BitWidth * 2);
37042 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37043
37044 KnownBits Known2;
37045 if (!!DemandedLHS) {
37046 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37047 Known = Known.intersectWith(Known2);
37048 }
37049 if (!!DemandedRHS) {
37050 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37051 Known = Known.intersectWith(Known2);
37052 }
37053
37054 if (Known.countMinLeadingZeros() < BitWidth)
37055 Known.resetAll();
37056 Known = Known.trunc(BitWidth);
37057 break;
37058 }
37059 case X86ISD::PSHUFB: {
37060 SDValue Src = Op.getOperand(0);
37061 SDValue Idx = Op.getOperand(1);
37062
37063 // If the index vector is never negative (MSB is zero), then all elements
37064 // come from the source vector. This is useful for cases where
37065 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
37066 // below will handle the more common constant shuffle mask case.
37067 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
37068 if (KnownIdx.isNonNegative())
37069 Known = DAG.computeKnownBits(Src, Depth + 1);
37070 break;
37071 }
37072 case X86ISD::VBROADCAST: {
37073 SDValue Src = Op.getOperand(0);
37074 if (!Src.getSimpleValueType().isVector()) {
37075 Known = DAG.computeKnownBits(Src, Depth + 1);
37076 return;
37077 }
37078 break;
37079 }
37080 case X86ISD::AND: {
37081 if (Op.getResNo() == 0) {
37082 KnownBits Known2;
37083 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37084 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37085 Known &= Known2;
37086 }
37087 break;
37088 }
37089 case X86ISD::ANDNP: {
37090 KnownBits Known2;
37091 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37092 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37093
37094 // ANDNP = (~X & Y);
37095 Known.One &= Known2.Zero;
37096 Known.Zero |= Known2.One;
37097 break;
37098 }
37099 case X86ISD::FOR: {
37100 KnownBits Known2;
37101 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37102 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37103
37104 Known |= Known2;
37105 break;
37106 }
37107 case X86ISD::PSADBW: {
37108 SDValue LHS = Op.getOperand(0);
37109 SDValue RHS = Op.getOperand(1);
37110 assert(VT.getScalarType() == MVT::i64 &&
37111 LHS.getValueType() == RHS.getValueType() &&
37112 LHS.getValueType().getScalarType() == MVT::i8 &&
37113 "Unexpected PSADBW types");
37114 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37115 break;
37116 }
37117 case X86ISD::PCMPGT:
37118 case X86ISD::PCMPEQ: {
37119 KnownBits KnownLhs =
37120 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37121 KnownBits KnownRhs =
37122 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37123 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37124 ? KnownBits::eq(KnownLhs, KnownRhs)
37125 : KnownBits::sgt(KnownLhs, KnownRhs);
37126 if (Res) {
37127 if (*Res)
37128 Known.setAllOnes();
37129 else
37130 Known.setAllZero();
37131 }
37132 break;
37133 }
37134 case X86ISD::PMULUDQ: {
37135 KnownBits Known2;
37136 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37137 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37138
37139 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37140 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37141 Known = KnownBits::mul(Known, Known2);
37142 break;
37143 }
37144 case X86ISD::CMOV: {
37145 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37146 // If we don't know any bits, early out.
37147 if (Known.isUnknown())
37148 break;
37149 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37150
37151 // Only known if known in both the LHS and RHS.
37152 Known = Known.intersectWith(Known2);
37153 break;
37154 }
37155 case X86ISD::BEXTR:
37156 case X86ISD::BEXTRI: {
37157 SDValue Op0 = Op.getOperand(0);
37158 SDValue Op1 = Op.getOperand(1);
37159
37160 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37161 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37162 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
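      // The control operand encodes the start bit in bits [7:0] and the length
      // in bits [15:8]; e.g. a control of 0x0804 extracts 8 bits starting at
      // bit 4 and zero-extends them into the result.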
37163
37164 // If the length is 0, the result is 0.
37165 if (Length == 0) {
37166 Known.setAllZero();
37167 break;
37168 }
37169
37170 if ((Shift + Length) <= BitWidth) {
37171 Known = DAG.computeKnownBits(Op0, Depth + 1);
37172 Known = Known.extractBits(Length, Shift);
37173 Known = Known.zextOrTrunc(BitWidth);
37174 }
37175 }
37176 break;
37177 }
37178 case X86ISD::PDEP: {
37179 KnownBits Known2;
37180 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37181 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37182 // Zeros are retained from the mask operand. But not ones.
37183 Known.One.clearAllBits();
37184 // The result will have at least as many trailing zeros as the non-mask
37185 // operand since bits can only map to the same or higher bit position.
37186 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37187 break;
37188 }
37189 case X86ISD::PEXT: {
37190 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37191 // The result has as many leading zeros as the number of zeroes in the mask.
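    // For example, a 64-bit PEXT with a mask known to have 48 zero bits packs
    // at most 16 bits, so the top 48 bits of the result are known zero.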
37192 unsigned Count = Known.Zero.popcount();
37193 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37194 Known.One.clearAllBits();
37195 break;
37196 }
37197 case X86ISD::VTRUNC:
37198 case X86ISD::VTRUNCS:
37199 case X86ISD::VTRUNCUS:
37200 case X86ISD::CVTSI2P:
37201 case X86ISD::CVTUI2P:
37202 case X86ISD::CVTP2SI:
37203 case X86ISD::CVTP2UI:
37204 case X86ISD::MCVTP2SI:
37205 case X86ISD::MCVTP2UI:
37206 case X86ISD::CVTTP2SI:
37207 case X86ISD::CVTTP2UI:
37208 case X86ISD::MCVTTP2SI:
37209 case X86ISD::MCVTTP2UI:
37210 case X86ISD::MCVTSI2P:
37211 case X86ISD::MCVTUI2P:
37212 case X86ISD::VFPROUND:
37213 case X86ISD::VMFPROUND:
37214 case X86ISD::CVTPS2PH:
37215 case X86ISD::MCVTPS2PH: {
37216 // Truncations/Conversions - upper elements are known zero.
37217 EVT SrcVT = Op.getOperand(0).getValueType();
37218 if (SrcVT.isVector()) {
37219 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37220 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37221 Known.setAllZero();
37222 }
37223 break;
37224 }
37225   case X86ISD::STRICT_CVTTP2SI:
37226   case X86ISD::STRICT_CVTTP2UI:
37227   case X86ISD::STRICT_CVTSI2P:
37228   case X86ISD::STRICT_CVTUI2P:
37229   case X86ISD::STRICT_VFPROUND:
37230   case X86ISD::STRICT_CVTPS2PH: {
37231 // Strict Conversions - upper elements are known zero.
37232 EVT SrcVT = Op.getOperand(1).getValueType();
37233 if (SrcVT.isVector()) {
37234 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37235 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37236 Known.setAllZero();
37237 }
37238 break;
37239 }
37240 case X86ISD::MOVQ2DQ: {
37241 // Move from MMX to XMM. Upper half of XMM should be 0.
37242 if (DemandedElts.countr_zero() >= (NumElts / 2))
37243 Known.setAllZero();
37244 break;
37245 }
37246   case X86ISD::VBROADCAST_LOAD: {
37247 APInt UndefElts;
37248 SmallVector<APInt, 16> EltBits;
37249 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37250 /*AllowWholeUndefs*/ false,
37251 /*AllowPartialUndefs*/ false)) {
37252 Known.Zero.setAllBits();
37253 Known.One.setAllBits();
37254 for (unsigned I = 0; I != NumElts; ++I) {
37255 if (!DemandedElts[I])
37256 continue;
37257 if (UndefElts[I]) {
37258 Known.resetAll();
37259 break;
37260 }
37261 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37262 Known = Known.intersectWith(Known2);
37263 }
37264 return;
37265 }
37266 break;
37267 }
37268   case ISD::INTRINSIC_WO_CHAIN: {
37269 switch (Op->getConstantOperandVal(0)) {
37270 case Intrinsic::x86_sse2_psad_bw:
37271 case Intrinsic::x86_avx2_psad_bw:
37272 case Intrinsic::x86_avx512_psad_bw_512: {
37273 SDValue LHS = Op.getOperand(1);
37274 SDValue RHS = Op.getOperand(2);
37275 assert(VT.getScalarType() == MVT::i64 &&
37276 LHS.getValueType() == RHS.getValueType() &&
37277 LHS.getValueType().getScalarType() == MVT::i8 &&
37278 "Unexpected PSADBW types");
37279 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37280 break;
37281 }
37282 }
37283 break;
37284 }
37285 }
37286
37287 // Handle target shuffles.
37288 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37289 if (isTargetShuffle(Opc)) {
37290     SmallVector<int, 64> Mask;
37291     SmallVector<SDValue, 2> Ops;
37292 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37293 unsigned NumOps = Ops.size();
37294 unsigned NumElts = VT.getVectorNumElements();
37295 if (Mask.size() == NumElts) {
37296 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37297 Known.Zero.setAllBits(); Known.One.setAllBits();
37298 for (unsigned i = 0; i != NumElts; ++i) {
37299 if (!DemandedElts[i])
37300 continue;
37301 int M = Mask[i];
37302 if (M == SM_SentinelUndef) {
37303 // For UNDEF elements, we don't know anything about the common state
37304 // of the shuffle result.
37305 Known.resetAll();
37306 break;
37307 }
37308 if (M == SM_SentinelZero) {
37309 Known.One.clearAllBits();
37310 continue;
37311 }
37312 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37313 "Shuffle index out of range");
37314
37315 unsigned OpIdx = (unsigned)M / NumElts;
37316 unsigned EltIdx = (unsigned)M % NumElts;
37317 if (Ops[OpIdx].getValueType() != VT) {
37318 // TODO - handle target shuffle ops with different value types.
37319 Known.resetAll();
37320 break;
37321 }
37322 DemandedOps[OpIdx].setBit(EltIdx);
37323 }
37324 // Known bits are the values that are shared by every demanded element.
37325 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37326 if (!DemandedOps[i])
37327 continue;
37328 KnownBits Known2 =
37329 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37330 Known = Known.intersectWith(Known2);
37331 }
37332 }
37333 }
37334 }
37335}
37336
37337 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37338 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37339 unsigned Depth) const {
37340 EVT VT = Op.getValueType();
37341 unsigned VTBits = VT.getScalarSizeInBits();
37342 unsigned Opcode = Op.getOpcode();
37343 switch (Opcode) {
37344   case X86ISD::SETCC_CARRY:
37345 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37346 return VTBits;
37347
37348 case X86ISD::VTRUNC: {
37349 SDValue Src = Op.getOperand(0);
37350 MVT SrcVT = Src.getSimpleValueType();
37351 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37352 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37353 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37354 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37355 if (Tmp > (NumSrcBits - VTBits))
37356 return Tmp - (NumSrcBits - VTBits);
37357 return 1;
37358 }
37359
37360 case X86ISD::PACKSS: {
37361 // PACKSS is just a truncation if the sign bits extend to the packed size.
37362 APInt DemandedLHS, DemandedRHS;
37363 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37364 DemandedRHS);
37365
37366 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37367 // patterns often used to compact vXi64 allsignbit patterns.
37368 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37369       SDValue BC = peekThroughBitcasts(V);
37370 if (BC.getOpcode() == X86ISD::PACKSS &&
37371 BC.getScalarValueSizeInBits() == 16 &&
37372 V.getScalarValueSizeInBits() == 32) {
37373         SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37374         SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37375 if (BC0.getScalarValueSizeInBits() == 64 &&
37376 BC1.getScalarValueSizeInBits() == 64 &&
37377 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37378 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37379 return 32;
37380 }
37381 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37382 };
37383
37384 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37385 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37386 if (!!DemandedLHS)
37387 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37388 if (!!DemandedRHS)
37389 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37390 unsigned Tmp = std::min(Tmp0, Tmp1);
37391 if (Tmp > (SrcBits - VTBits))
37392 return Tmp - (SrcBits - VTBits);
37393 return 1;
37394 }
37395
37396 case X86ISD::VBROADCAST: {
37397 SDValue Src = Op.getOperand(0);
37398 if (!Src.getSimpleValueType().isVector())
37399 return DAG.ComputeNumSignBits(Src, Depth + 1);
37400 break;
37401 }
37402
37403 case X86ISD::VSHLI: {
37404 SDValue Src = Op.getOperand(0);
37405 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37406 if (ShiftVal.uge(VTBits))
37407 return VTBits; // Shifted all bits out --> zero.
37408 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37409 if (ShiftVal.uge(Tmp))
37410 return 1; // Shifted all sign bits out --> unknown.
37411 return Tmp - ShiftVal.getZExtValue();
37412 }
37413
37414 case X86ISD::VSRAI: {
37415 SDValue Src = Op.getOperand(0);
37416 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37417 if (ShiftVal.uge(VTBits - 1))
37418 return VTBits; // Sign splat.
37419 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37420 ShiftVal += Tmp;
37421 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37422 }
37423
37424 case X86ISD::FSETCC:
37425 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37426 if (VT == MVT::f32 || VT == MVT::f64 ||
37427 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37428 return VTBits;
37429 break;
37430
37431 case X86ISD::PCMPGT:
37432 case X86ISD::PCMPEQ:
37433 case X86ISD::CMPP:
37434 case X86ISD::VPCOM:
37435 case X86ISD::VPCOMU:
37436 // Vector compares return zero/all-bits result values.
37437 return VTBits;
37438
37439 case X86ISD::ANDNP: {
37440 unsigned Tmp0 =
37441 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37442 if (Tmp0 == 1) return 1; // Early out.
37443 unsigned Tmp1 =
37444 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37445 return std::min(Tmp0, Tmp1);
37446 }
37447
37448 case X86ISD::CMOV: {
37449 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37450 if (Tmp0 == 1) return 1; // Early out.
37451 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37452 return std::min(Tmp0, Tmp1);
37453 }
37454 }
37455
37456 // Handle target shuffles.
37457 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37458 if (isTargetShuffle(Opcode)) {
37459     SmallVector<int, 64> Mask;
37460     SmallVector<SDValue, 2> Ops;
37461 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37462 unsigned NumOps = Ops.size();
37463 unsigned NumElts = VT.getVectorNumElements();
37464 if (Mask.size() == NumElts) {
37465 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37466 for (unsigned i = 0; i != NumElts; ++i) {
37467 if (!DemandedElts[i])
37468 continue;
37469 int M = Mask[i];
37470 if (M == SM_SentinelUndef) {
37471 // For UNDEF elements, we don't know anything about the common state
37472 // of the shuffle result.
37473 return 1;
37474 } else if (M == SM_SentinelZero) {
37475 // Zero = all sign bits.
37476 continue;
37477 }
37478 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37479 "Shuffle index out of range");
37480
37481 unsigned OpIdx = (unsigned)M / NumElts;
37482 unsigned EltIdx = (unsigned)M % NumElts;
37483 if (Ops[OpIdx].getValueType() != VT) {
37484 // TODO - handle target shuffle ops with different value types.
37485 return 1;
37486 }
37487 DemandedOps[OpIdx].setBit(EltIdx);
37488 }
37489 unsigned Tmp0 = VTBits;
37490 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37491 if (!DemandedOps[i])
37492 continue;
37493 unsigned Tmp1 =
37494 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37495 Tmp0 = std::min(Tmp0, Tmp1);
37496 }
37497 return Tmp0;
37498 }
37499 }
37500 }
37501
37502 // Fallback case.
37503 return 1;
37504}
37505
37506 static SDValue unwrapAddress(SDValue N) {
37507 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37508 return N->getOperand(0);
37509 return N;
37510}
37511
37512// Helper to look for a normal load that can be narrowed into a vzload with the
37513// specified VT and memory VT. Returns SDValue() on failure.
37514 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37515 SelectionDAG &DAG) {
37516 // Can't if the load is volatile or atomic.
37517 if (!LN->isSimple())
37518 return SDValue();
37519
37520 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37521 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37522 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37523 LN->getPointerInfo(), LN->getOriginalAlign(),
37524 LN->getMemOperand()->getFlags());
37525}
37526
37527// Attempt to match a combined shuffle mask against supported unary shuffle
37528// instructions.
37529// TODO: Investigate sharing more of this with shuffle lowering.
37530static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37531 bool AllowFloatDomain, bool AllowIntDomain,
37532 SDValue V1, const SelectionDAG &DAG,
37533 const X86Subtarget &Subtarget, unsigned &Shuffle,
37534 MVT &SrcVT, MVT &DstVT) {
37535 unsigned NumMaskElts = Mask.size();
37536 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37537
37538 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37539 if (Mask[0] == 0 &&
37540 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37541 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37542         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37543 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37544 Shuffle = X86ISD::VZEXT_MOVL;
37545 if (MaskEltSize == 16)
37546 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37547 else
37548 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37549 return true;
37550 }
37551 }
37552
37553 // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37554 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
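  // Illustrative example: the v8i16 mask {0,Z,1,Z,2,Z,3,Z} (Z = zeroable
  // element) matches a Scale == 2 zero-extension with SrcVT = v8i16 and
  // DstVT = v4i32, lowered as ZERO_EXTEND_VECTOR_INREG.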
37555 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37556 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37557 unsigned MaxScale = 64 / MaskEltSize;
37558 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37559 DAG.ComputeNumSignBits(V1) == MaskEltSize;
37560 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37561 bool MatchAny = true;
37562 bool MatchZero = true;
37563 bool MatchSign = UseSign;
37564 unsigned NumDstElts = NumMaskElts / Scale;
37565 for (unsigned i = 0;
37566 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37567 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37568 MatchAny = MatchSign = MatchZero = false;
37569 break;
37570 }
37571 unsigned Pos = (i * Scale) + 1;
37572 unsigned Len = Scale - 1;
37573 MatchAny &= isUndefInRange(Mask, Pos, Len);
37574 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37575 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37576 }
37577 if (MatchAny || MatchSign || MatchZero) {
37578 assert((MatchSign || MatchZero) &&
37579 "Failed to match sext/zext but matched aext?");
37580 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37581 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37582 : MVT::getIntegerVT(MaskEltSize);
37583 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37584
37585 Shuffle = unsigned(
37586 MatchAny ? ISD::ANY_EXTEND
37587 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37588 if (SrcVT.getVectorNumElements() != NumDstElts)
37589 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37590
37591 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37592 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37593 return true;
37594 }
37595 }
37596 }
37597
37598 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
37599 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37600 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37601 isUndefOrEqual(Mask[0], 0) &&
37602 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37603 Shuffle = X86ISD::VZEXT_MOVL;
37604 if (MaskEltSize == 16)
37605 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37606 else
37607 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37608 return true;
37609 }
37610
37611   // Check if we have SSE3, which will let us use MOVDDUP, etc. These
37612   // instructions are no slower than UNPCKLPD but have the option to
37613   // fold the input operand into even an unaligned memory load.
37614 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37615 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37616 Shuffle = X86ISD::MOVDDUP;
37617 SrcVT = DstVT = MVT::v2f64;
37618 return true;
37619 }
37620 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37621 Shuffle = X86ISD::MOVSLDUP;
37622 SrcVT = DstVT = MVT::v4f32;
37623 return true;
37624 }
37625 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37626 Shuffle = X86ISD::MOVSHDUP;
37627 SrcVT = DstVT = MVT::v4f32;
37628 return true;
37629 }
37630 }
37631
37632 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37633 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37634 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37635 Shuffle = X86ISD::MOVDDUP;
37636 SrcVT = DstVT = MVT::v4f64;
37637 return true;
37638 }
37639 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37640 V1)) {
37641 Shuffle = X86ISD::MOVSLDUP;
37642 SrcVT = DstVT = MVT::v8f32;
37643 return true;
37644 }
37645 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37646 V1)) {
37647 Shuffle = X86ISD::MOVSHDUP;
37648 SrcVT = DstVT = MVT::v8f32;
37649 return true;
37650 }
37651 }
37652
37653 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37654 assert(Subtarget.hasAVX512() &&
37655 "AVX512 required for 512-bit vector shuffles");
37656 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37657 V1)) {
37658 Shuffle = X86ISD::MOVDDUP;
37659 SrcVT = DstVT = MVT::v8f64;
37660 return true;
37661 }
37662     if (isTargetShuffleEquivalent(
37663 MaskVT, Mask,
37664 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37665 Shuffle = X86ISD::MOVSLDUP;
37666 SrcVT = DstVT = MVT::v16f32;
37667 return true;
37668 }
37669     if (isTargetShuffleEquivalent(
37670 MaskVT, Mask,
37671 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37672 Shuffle = X86ISD::MOVSHDUP;
37673 SrcVT = DstVT = MVT::v16f32;
37674 return true;
37675 }
37676 }
37677
37678 return false;
37679}
37680
37681// Attempt to match a combined shuffle mask against supported unary immediate
37682// permute instructions.
37683// TODO: Investigate sharing more of this with shuffle lowering.
37684 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37685 const APInt &Zeroable,
37686 bool AllowFloatDomain, bool AllowIntDomain,
37687 const SelectionDAG &DAG,
37688 const X86Subtarget &Subtarget,
37689 unsigned &Shuffle, MVT &ShuffleVT,
37690 unsigned &PermuteImm) {
37691 unsigned NumMaskElts = Mask.size();
37692 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37693 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37694 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37695 bool ContainsZeros = isAnyZero(Mask);
37696
37697   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37698 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37699 // Check for lane crossing permutes.
37700 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37701 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37702 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37703 Shuffle = X86ISD::VPERMI;
37704 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37705 PermuteImm = getV4X86ShuffleImm(Mask);
37706 return true;
37707 }
37708 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37709 SmallVector<int, 4> RepeatedMask;
37710 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37711 Shuffle = X86ISD::VPERMI;
37712 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37713 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37714 return true;
37715 }
37716 }
37717 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37718 // VPERMILPD can permute with a non-repeating shuffle.
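      // One bit of the immediate selects the element within each 64-bit slot;
      // e.g. the v4f64 mask {1,0,3,2} yields PermuteImm = 0b0101, swapping the
      // two elements within each 128-bit lane.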
37719 Shuffle = X86ISD::VPERMILPI;
37720 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37721 PermuteImm = 0;
37722 for (int i = 0, e = Mask.size(); i != e; ++i) {
37723 int M = Mask[i];
37724 if (M == SM_SentinelUndef)
37725 continue;
37726 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37727 PermuteImm |= (M & 1) << i;
37728 }
37729 return true;
37730 }
37731 }
37732
37733   // We check for both a shuffle match and a shift match. Loop twice so that
37734   // the order in which we try them depends on target preference.
37735 for (unsigned Order = 0; Order < 2; ++Order) {
37736 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37737 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37738 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37739 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37740 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37741 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37742 SmallVector<int, 4> RepeatedMask;
37743 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37744 // Narrow the repeated mask to create 32-bit element permutes.
37745 SmallVector<int, 4> WordMask = RepeatedMask;
37746 if (MaskScalarSizeInBits == 64)
37747 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37748
37749 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37750 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37751 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37752 PermuteImm = getV4X86ShuffleImm(WordMask);
37753 return true;
37754 }
37755 }
37756
37757 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37758 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37759 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37760 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37761 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37762 SmallVector<int, 4> RepeatedMask;
37763 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37764 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37765 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37766
37767 // PSHUFLW: permute lower 4 elements only.
37768 if (isUndefOrInRange(LoMask, 0, 4) &&
37769 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37770 Shuffle = X86ISD::PSHUFLW;
37771 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37772 PermuteImm = getV4X86ShuffleImm(LoMask);
37773 return true;
37774 }
37775
37776 // PSHUFHW: permute upper 4 elements only.
37777 if (isUndefOrInRange(HiMask, 4, 8) &&
37778 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37779 // Offset the HiMask so that we can create the shuffle immediate.
37780 int OffsetHiMask[4];
37781 for (int i = 0; i != 4; ++i)
37782 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
37783
37784 Shuffle = X86ISD::PSHUFHW;
37785 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37786 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37787 return true;
37788 }
37789 }
37790 }
37791 } else {
37792 // Attempt to match against bit rotates.
37793 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37794 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37795 Subtarget.hasAVX512())) {
37796 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37797 Subtarget, Mask);
37798 if (0 < RotateAmt) {
37799 Shuffle = X86ISD::VROTLI;
37800 PermuteImm = (unsigned)RotateAmt;
37801 return true;
37802 }
37803 }
37804 }
37805 // Attempt to match against byte/bit shifts.
37806 if (AllowIntDomain &&
37807 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37808 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37809 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37810 int ShiftAmt =
37811 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37812 Zeroable, Subtarget);
37813 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37814 32 <= ShuffleVT.getScalarSizeInBits())) {
37815 // Byte shifts can be slower so only match them on second attempt.
37816 if (Order == 0 &&
37817 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37818 continue;
37819
37820 PermuteImm = (unsigned)ShiftAmt;
37821 return true;
37822 }
37823
37824 }
37825 }
37826
37827 return false;
37828}
37829
37830// Attempt to match a combined unary shuffle mask against supported binary
37831// shuffle instructions.
37832// TODO: Investigate sharing more of this with shuffle lowering.
37833static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37834 bool AllowFloatDomain, bool AllowIntDomain,
37835 SDValue &V1, SDValue &V2, const SDLoc &DL,
37836 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37837 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37838 bool IsUnary) {
37839 unsigned NumMaskElts = Mask.size();
37840 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37841 unsigned SizeInBits = MaskVT.getSizeInBits();
37842
37843 if (MaskVT.is128BitVector()) {
37844 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37845 AllowFloatDomain) {
37846 V2 = V1;
37847 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37848 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37849 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37850 return true;
37851 }
37852 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37853 AllowFloatDomain) {
37854 V2 = V1;
37855 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37856 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37857 return true;
37858 }
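// Mask {0,3} takes the low element from V1 and the high element from V2;
// MOVSD takes its low element from its *second* operand, hence the swap below.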
37859 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37860 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37861 std::swap(V1, V2);
37862 Shuffle = X86ISD::MOVSD;
37863 SrcVT = DstVT = MVT::v2f64;
37864 return true;
37865 }
37866 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37867 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37868 Shuffle = X86ISD::MOVSS;
37869 SrcVT = DstVT = MVT::v4f32;
37870 return true;
37871 }
37872 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37873 DAG) &&
37874 Subtarget.hasFP16()) {
37875 Shuffle = X86ISD::MOVSH;
37876 SrcVT = DstVT = MVT::v8f16;
37877 return true;
37878 }
37879 }
37880
37881 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
37882 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37883 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37884 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37885 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37886 Subtarget)) {
37887 DstVT = MaskVT;
37888 return true;
37889 }
37890 }
37891 // TODO: Can we handle this inside matchShuffleWithPACK?
37892 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37893 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37894 V1.getScalarValueSizeInBits() == 64 &&
37895 V2.getScalarValueSizeInBits() == 64) {
37896 // Use (SSE41) PACKUSDW if the leading zero bits extend down to the lowest 16 bits.
37897 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37898 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37899 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37900 SrcVT = MVT::v4i32;
37901 DstVT = MVT::v8i16;
37902 Shuffle = X86ISD::PACKUS;
37903 return true;
37904 }
37905 // Use PACKUSWB if the leading zero bits extend down to the lowest 8 bits.
37906 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37907 SrcVT = MVT::v8i16;
37908 DstVT = MVT::v16i8;
37909 Shuffle = X86ISD::PACKUS;
37910 return true;
37911 }
37912 // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
37913 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37914 SrcVT = MVT::v4i32;
37915 DstVT = MVT::v8i16;
37916 Shuffle = X86ISD::PACKSS;
37917 return true;
37918 }
37919 }
37920
37921 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37922 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37923 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37924 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37925 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37926 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37927 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37928 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37929 Subtarget)) {
37930 SrcVT = DstVT = MaskVT;
37931 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37932 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37933 return true;
37934 }
37935 }
37936
37937 // Attempt to match against an OR if we're performing a blend shuffle and the
37938 // non-blended source element is zero in each case.
37939 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
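// e.g. shuffle(V1,V2,{0,5,2,7}) -> or(V1,V2) when the odd elements of V1 and
// the even elements of V2 are known to be zero.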
37940 if (SizeInBits == V1.getValueSizeInBits() &&
37941 SizeInBits == V2.getValueSizeInBits() &&
37942 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37943 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37944 bool IsBlend = true;
37945 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37946 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37947 unsigned Scale1 = NumV1Elts / NumMaskElts;
37948 unsigned Scale2 = NumV2Elts / NumMaskElts;
37949 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37950 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37951 for (unsigned i = 0; i != NumMaskElts; ++i) {
37952 int M = Mask[i];
37953 if (M == SM_SentinelUndef)
37954 continue;
37955 if (M == SM_SentinelZero) {
37956 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37957 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37958 continue;
37959 }
37960 if (M == (int)i) {
37961 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37962 continue;
37963 }
37964 if (M == (int)(i + NumMaskElts)) {
37965 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37966 continue;
37967 }
37968 IsBlend = false;
37969 break;
37970 }
37971 if (IsBlend) {
37972 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37973 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37974 Shuffle = ISD::OR;
37975 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37976 return true;
37977 }
37978 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37979 // FIXME: handle mismatched sizes?
37980 // TODO: investigate if `ISD::OR` handling in
37981 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37982 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37983 unsigned NumElts = V.getValueType().getVectorNumElements();
37984 KnownBits Known(NumElts);
37985 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37986 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37987 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37988 if (PeepholeKnown.isZero())
37989 Known.Zero.setBit(EltIdx);
37990 if (PeepholeKnown.isAllOnes())
37991 Known.One.setBit(EltIdx);
37992 }
37993 return Known;
37994 };
37995
37996 KnownBits V1Known = computeKnownBitsElementWise(V1);
37997 KnownBits V2Known = computeKnownBitsElementWise(V2);
37998
37999 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38000 int M = Mask[i];
38001 if (M == SM_SentinelUndef)
38002 continue;
38003 if (M == SM_SentinelZero) {
38004 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38005 continue;
38006 }
38007 if (M == (int)i) {
38008 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38009 continue;
38010 }
38011 if (M == (int)(i + NumMaskElts)) {
38012 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38013 continue;
38014 }
38015 llvm_unreachable("will not get here.");
38016 }
38017 if (IsBlend) {
38018 Shuffle = ISD::OR;
38019 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38020 return true;
38021 }
38022 }
38023 }
38024 }
38025
38026 return false;
38027}
38028
38029 static bool matchBinaryPermuteShuffle(
38030 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38031 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38032 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38033 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38034 unsigned NumMaskElts = Mask.size();
38035 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38036
38037 // Attempt to match against VALIGND/VALIGNQ rotate.
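// VALIGND/VALIGNQ treat the two sources as one double-width vector and extract
// a contiguous, element-aligned window from it, so zeroing mask elements can't
// be handled here.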
38038 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38039 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38040 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38041 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38042 if (!isAnyZero(Mask)) {
38043 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38044 if (0 < Rotation) {
38045 Shuffle = X86ISD::VALIGN;
38046 if (EltSizeInBits == 64)
38047 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38048 else
38049 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38050 PermuteImm = Rotation;
38051 return true;
38052 }
38053 }
38054 }
38055
38056 // Attempt to match against PALIGNR byte rotate.
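// Note that for 256/512-bit types PALIGNR rotates bytes within each 128-bit
// lane, so a match requires the rotation pattern to repeat in every lane.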
38057 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38058 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38059 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38060 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38061 if (0 < ByteRotation) {
38062 Shuffle = X86ISD::PALIGNR;
38063 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38064 PermuteImm = ByteRotation;
38065 return true;
38066 }
38067 }
38068
38069 // Attempt to combine to X86ISD::BLENDI.
38070 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38071 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38072 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38073 uint64_t BlendMask = 0;
38074 bool ForceV1Zero = false, ForceV2Zero = false;
38075 SmallVector<int, 8> TargetMask(Mask);
38076 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38077 ForceV2Zero, BlendMask)) {
38078 if (MaskVT == MVT::v16i16) {
38079 // We can only use v16i16 PBLENDW if the lanes are repeated.
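// (the 8-bit blend immediate is applied to both 128-bit lanes of VPBLENDW)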
38080 SmallVector<int, 8> RepeatedMask;
38081 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38082 RepeatedMask)) {
38083 assert(RepeatedMask.size() == 8 &&
38084 "Repeated mask size doesn't match!");
38085 PermuteImm = 0;
38086 for (int i = 0; i < 8; ++i)
38087 if (RepeatedMask[i] >= 8)
38088 PermuteImm |= 1 << i;
38089 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38090 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38091 Shuffle = X86ISD::BLENDI;
38092 ShuffleVT = MaskVT;
38093 return true;
38094 }
38095 } else {
38096 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38097 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38098 PermuteImm = (unsigned)BlendMask;
38099 Shuffle = X86ISD::BLENDI;
38100 ShuffleVT = MaskVT;
38101 return true;
38102 }
38103 }
38104 }
38105
38106 // Attempt to combine to INSERTPS, but only if it has elements that need to
38107 // be set to zero.
38108 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38109 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38110 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38111 Shuffle = X86ISD::INSERTPS;
38112 ShuffleVT = MVT::v4f32;
38113 return true;
38114 }
38115
38116 // Attempt to combine to SHUFPD.
38117 if (AllowFloatDomain && EltSizeInBits == 64 &&
38118 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38119 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38120 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38121 bool ForceV1Zero = false, ForceV2Zero = false;
38122 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38123 PermuteImm, Mask, Zeroable)) {
38124 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38125 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38126 Shuffle = X86ISD::SHUFP;
38127 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38128 return true;
38129 }
38130 }
38131
38132 // Attempt to combine to SHUFPS.
38133 if (AllowFloatDomain && EltSizeInBits == 32 &&
38134 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38135 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38136 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38137 SmallVector<int, 4> RepeatedMask;
38138 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38139 // Match each half of the repeated mask to determine whether it just
38140 // references one of the vectors, is zeroable, or is entirely undef.
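// SHUFPS selects its low two result elements from the first source and its
// high two from the second, so each half must resolve to a single source
// (or be zero/undef).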
38141 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38142 int M0 = RepeatedMask[Offset];
38143 int M1 = RepeatedMask[Offset + 1];
38144
38145 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38146 return DAG.getUNDEF(MaskVT);
38147 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38148 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38149 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38150 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38151 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38152 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38153 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38154 return V1;
38155 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38156 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38157 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38158 return V2;
38159 }
38160
38161 return SDValue();
38162 };
38163
38164 int ShufMask[4] = {-1, -1, -1, -1};
38165 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38166 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38167
38168 if (Lo && Hi) {
38169 V1 = Lo;
38170 V2 = Hi;
38171 Shuffle = X86ISD::SHUFP;
38172 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38173 PermuteImm = getV4X86ShuffleImm(ShufMask);
38174 return true;
38175 }
38176 }
38177 }
38178
38179 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38180 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38181 MaskVT.is128BitVector() &&
38182 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38183 Shuffle = X86ISD::INSERTPS;
38184 ShuffleVT = MVT::v4f32;
38185 return true;
38186 }
38187
38188 return false;
38189}
38190
38191 static SDValue combineX86ShuffleChainWithExtract(
38192 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38193 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38194 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38195 const X86Subtarget &Subtarget);
38196
38197/// Combine an arbitrary chain of shuffles into a single instruction if
38198/// possible.
38199///
38200/// This is the leaf of the recursive combine below. When we have found some
38201/// chain of single-use x86 shuffle instructions and accumulated the combined
38202/// shuffle mask represented by them, this will try to pattern match that mask
38203/// into either a single instruction if there is a special purpose instruction
38204/// for this operation, or into a PSHUFB instruction which is a fully general
38205/// instruction but should only be used to replace chains over a certain depth.
38206 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38207 ArrayRef<int> BaseMask, int Depth,
38208 bool HasVariableMask,
38209 bool AllowVariableCrossLaneMask,
38210 bool AllowVariablePerLaneMask,
38211 SelectionDAG &DAG,
38212 const X86Subtarget &Subtarget) {
38213 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38214 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38215 "Unexpected number of shuffle inputs!");
38216
38217 SDLoc DL(Root);
38218 MVT RootVT = Root.getSimpleValueType();
38219 unsigned RootSizeInBits = RootVT.getSizeInBits();
38220 unsigned NumRootElts = RootVT.getVectorNumElements();
38221
38222 // Canonicalize shuffle input op to the requested type.
38223 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38224 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38225 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38226 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38227 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38228 return DAG.getBitcast(VT, Op);
38229 };
38230
38231 // Find the inputs that enter the chain. Note that multiple uses are OK
38232 // here, we're not going to remove the operands we find.
38233 bool UnaryShuffle = (Inputs.size() == 1);
38234 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38235 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38236 : peekThroughBitcasts(Inputs[1]));
38237
38238 MVT VT1 = V1.getSimpleValueType();
38239 MVT VT2 = V2.getSimpleValueType();
38240 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38241 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38242
38243 SDValue Res;
38244
38245 unsigned NumBaseMaskElts = BaseMask.size();
38246 if (NumBaseMaskElts == 1) {
38247 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38248 return CanonicalizeShuffleInput(RootVT, V1);
38249 }
38250
38251 bool OptForSize = DAG.shouldOptForSize();
38252 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38253 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38254 (RootVT.isFloatingPoint() && Depth >= 1) ||
38255 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38256
38257 // Don't combine if we are an AVX512/EVEX target and the mask element size
38258 // is different from the root element size - this would prevent writemasks
38259 // from being reused.
38260 bool IsMaskedShuffle = false;
38261 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38262 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38263 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38264 IsMaskedShuffle = true;
38265 }
38266 }
38267
38268 // If we are shuffling a splat (and not introducing zeros) then we can just
38269 // use it directly. This works for smaller elements as well, since they
38270 // already repeat across each mask element.
38271 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38272 V1.getValueSizeInBits() >= RootSizeInBits &&
38273 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38274 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38275 return CanonicalizeShuffleInput(RootVT, V1);
38276 }
38277
38278 SmallVector<int, 64> Mask(BaseMask);
38279
38280 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38281 // etc. can be simplified.
38282 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38283 SmallVector<int> ScaledMask, IdentityMask;
38284 unsigned NumElts = VT1.getVectorNumElements();
38285 if (Mask.size() <= NumElts &&
38286 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38287 for (unsigned i = 0; i != NumElts; ++i)
38288 IdentityMask.push_back(i);
38289 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38290 V2))
38291 return CanonicalizeShuffleInput(RootVT, V1);
38292 }
38293 }
38294
38295 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38296 if (RootVT.is512BitVector() &&
38297 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38298 // If the upper subvectors are zeroable, then an extract+insert is cheaper
38299 // than using X86ISD::SHUF128. The insertion is free, even if it has
38300 // to zero the upper subvectors.
38301 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38302 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38303 return SDValue(); // Nothing to do!
38304 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38305 "Unexpected lane shuffle");
38306 Res = CanonicalizeShuffleInput(RootVT, V1);
38307 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38308 bool UseZero = isAnyZero(Mask);
38309 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38310 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38311 }
38312
38313 // Narrow shuffle mask to v4x128.
38314 SmallVector<int, 4> ScaledMask;
38315 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38316 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38317
38318 // Try to lower to vshuf64x2/vshuf32x4.
38319 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38320 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38321 SelectionDAG &DAG) {
38322 int PermMask[4] = {-1, -1, -1, -1};
38323 // Ensure elements came from the same Op.
38324 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38325 for (int i = 0; i < 4; ++i) {
38326 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38327 if (ScaledMask[i] < 0)
38328 continue;
38329
38330 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38331 unsigned OpIndex = i / 2;
38332 if (Ops[OpIndex].isUndef())
38333 Ops[OpIndex] = Op;
38334 else if (Ops[OpIndex] != Op)
38335 return SDValue();
38336
38337 PermMask[i] = ScaledMask[i] % 4;
38338 }
38339
38340 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38341 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38342 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38343 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38344 };
38345
38346 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38347 // doesn't work because our mask is for 128 bits and we don't have an MVT
38348 // to match that.
38349 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38350 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38351 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38352 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38353 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38354 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38355 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38356 ScaledMask[1] == (ScaledMask[3] % 2));
38357
38358 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38359 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38360 return SDValue(); // Nothing to do!
38361 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38362 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38363 return DAG.getBitcast(RootVT, V);
38364 }
38365 }
38366
38367 // Handle 128-bit lane shuffles of 256-bit vectors.
38368 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38369 // If the upper half is zeroable, then an extract+insert is cheaper
38370 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
38371 // zero the upper half.
38372 if (isUndefOrZero(Mask[1])) {
38373 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38374 return SDValue(); // Nothing to do!
38375 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38376 Res = CanonicalizeShuffleInput(RootVT, V1);
38377 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38378 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38379 256);
38380 }
38381
38382 // If we're inserting the low subvector, an insert-subvector 'concat'
38383 // pattern is quicker than VPERM2X128.
38384 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38385 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38386 !Subtarget.hasAVX2()) {
38387 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38388 return SDValue(); // Nothing to do!
38389 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38390 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38391 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38392 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38393 }
38394
38395 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38396 return SDValue(); // Nothing to do!
38397
38398 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38399 // we need to use the zeroing feature.
38400 // Prefer blends for sequential shuffles unless we are optimizing for size.
38401 if (UnaryShuffle &&
38402 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38403 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
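// VPERM2X128 imm8: bits[1:0] pick the 128-bit source lane for the low half and
// bits[5:4] for the high half; setting bit 3 / bit 7 zeroes that half instead.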
38404 unsigned PermMask = 0;
38405 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38406 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38407 return DAG.getNode(
38408 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38409 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38410 }
38411
38412 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38413 return SDValue(); // Nothing to do!
38414
38415 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38416 if (!UnaryShuffle && !IsMaskedShuffle) {
38417 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38418 "Unexpected shuffle sentinel value");
38419 // Prefer blends to X86ISD::VPERM2X128.
38420 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38421 unsigned PermMask = 0;
38422 PermMask |= ((Mask[0] & 3) << 0);
38423 PermMask |= ((Mask[1] & 3) << 4);
38424 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38425 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38426 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38427 CanonicalizeShuffleInput(RootVT, LHS),
38428 CanonicalizeShuffleInput(RootVT, RHS),
38429 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38430 }
38431 }
38432 }
38433
38434 // For masks that have been widened to 128-bit elements or more,
38435 // narrow back down to 64-bit elements.
38436 if (BaseMaskEltSizeInBits > 64) {
38437 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38438 int MaskScale = BaseMaskEltSizeInBits / 64;
38439 SmallVector<int, 64> ScaledMask;
38440 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38441 Mask = std::move(ScaledMask);
38442 }
38443
38444 // For masked shuffles, we're trying to match the root width for better
38445 // writemask folding; attempt to scale the mask.
38446 // TODO - variable shuffles might need this to be widened again.
38447 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38448 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38449 int MaskScale = NumRootElts / Mask.size();
38450 SmallVector<int, 64> ScaledMask;
38451 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38452 Mask = std::move(ScaledMask);
38453 }
38454
38455 unsigned NumMaskElts = Mask.size();
38456 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38457 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38458
38459 // Determine the effective mask value type.
38460 FloatDomain &= (32 <= MaskEltSizeInBits);
38461 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38462 : MVT::getIntegerVT(MaskEltSizeInBits);
38463 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38464
38465 // Only allow legal mask types.
38466 if (!TLI.isTypeLegal(MaskVT))
38467 return SDValue();
38468
38469 // Attempt to match the mask against known shuffle patterns.
38470 MVT ShuffleSrcVT, ShuffleVT;
38471 unsigned Shuffle, PermuteImm;
38472
38473 // Which shuffle domains are permitted?
38474 // Permit domain crossing at higher combine depths.
38475 // TODO: Should we indicate which domain is preferred if both are allowed?
38476 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38477 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38478 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38479
38480 // Determine zeroable mask elements.
38481 APInt KnownUndef, KnownZero;
38482 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38483 APInt Zeroable = KnownUndef | KnownZero;
38484
38485 if (UnaryShuffle) {
38486 // Attempt to match against broadcast-from-vector.
38487 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38488 if ((Subtarget.hasAVX2() ||
38489 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38490 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38491 if (isUndefOrEqual(Mask, 0)) {
38492 if (V1.getValueType() == MaskVT &&
38493 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38494 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38495 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38496 return SDValue(); // Nothing to do!
38497 Res = V1.getOperand(0);
38498 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38499 return DAG.getBitcast(RootVT, Res);
38500 }
38501 if (Subtarget.hasAVX2()) {
38502 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38503 return SDValue(); // Nothing to do!
38504 Res = CanonicalizeShuffleInput(MaskVT, V1);
38505 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38506 return DAG.getBitcast(RootVT, Res);
38507 }
38508 }
38509 }
38510
38511 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38512 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38513 (!IsMaskedShuffle ||
38514 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38515 if (Depth == 0 && Root.getOpcode() == Shuffle)
38516 return SDValue(); // Nothing to do!
38517 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38518 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38519 return DAG.getBitcast(RootVT, Res);
38520 }
38521
38522 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38523 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38524 PermuteImm) &&
38525 (!IsMaskedShuffle ||
38526 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38527 if (Depth == 0 && Root.getOpcode() == Shuffle)
38528 return SDValue(); // Nothing to do!
38529 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38530 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38531 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38532 return DAG.getBitcast(RootVT, Res);
38533 }
38534 }
38535
38536 // Attempt to combine to INSERTPS, but only if the inserted element has come
38537 // from a scalar.
38538 // TODO: Handle other insertions here as well?
38539 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38540 Subtarget.hasSSE41() &&
38541 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38542 if (MaskEltSizeInBits == 32) {
38543 SDValue SrcV1 = V1, SrcV2 = V2;
38544 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38545 DAG) &&
38546 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38547 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38548 return SDValue(); // Nothing to do!
38549 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38550 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38551 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38552 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38553 return DAG.getBitcast(RootVT, Res);
38554 }
38555 }
38556 if (MaskEltSizeInBits == 64 &&
38557 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38558 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38559 V2.getScalarValueSizeInBits() <= 32) {
38560 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38561 return SDValue(); // Nothing to do!
38562 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38563 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38564 CanonicalizeShuffleInput(MVT::v4f32, V1),
38565 CanonicalizeShuffleInput(MVT::v4f32, V2),
38566 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38567 return DAG.getBitcast(RootVT, Res);
38568 }
38569 }
38570
38571 SDValue NewV1 = V1; // Save operands in case early exit happens.
38572 SDValue NewV2 = V2;
38573 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38574 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38575 ShuffleVT, UnaryShuffle) &&
38576 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38577 if (Depth == 0 && Root.getOpcode() == Shuffle)
38578 return SDValue(); // Nothing to do!
38579 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38580 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38581 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38582 return DAG.getBitcast(RootVT, Res);
38583 }
38584
38585 NewV1 = V1; // Save operands in case early exit happens.
38586 NewV2 = V2;
38587 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38588 AllowIntDomain, NewV1, NewV2, DL, DAG,
38589 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38590 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38591 if (Depth == 0 && Root.getOpcode() == Shuffle)
38592 return SDValue(); // Nothing to do!
38593 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38594 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38595 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38596 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38597 return DAG.getBitcast(RootVT, Res);
38598 }
38599
38600 // Typically from here on, we need an integer version of MaskVT.
38601 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38602 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38603
38604 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38605 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38606 uint64_t BitLen, BitIdx;
38607 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38608 Zeroable)) {
38609 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38610 return SDValue(); // Nothing to do!
38611 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38612 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38613 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38614 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38615 return DAG.getBitcast(RootVT, Res);
38616 }
38617
38618 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38619 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38620 return SDValue(); // Nothing to do!
38621 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38622 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38623 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38624 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38625 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38626 return DAG.getBitcast(RootVT, Res);
38627 }
38628 }
38629
38630 // Match shuffle against TRUNCATE patterns.
38631 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38632 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38633 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38634 Subtarget)) {
38635 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38636 ShuffleSrcVT.getVectorNumElements();
38637 unsigned Opc =
38638 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38639 if (Depth == 0 && Root.getOpcode() == Opc)
38640 return SDValue(); // Nothing to do!
38641 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38642 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38643 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38644 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38645 return DAG.getBitcast(RootVT, Res);
38646 }
38647
38648 // Do we need a more general binary truncation pattern?
38649 if (RootSizeInBits < 512 &&
38650 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38651 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38652 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38653 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38654 // Bail if this was already a truncation or PACK node.
38655 // We sometimes fail to match PACK if we demand known undef elements.
38656 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38657 Root.getOpcode() == X86ISD::PACKSS ||
38658 Root.getOpcode() == X86ISD::PACKUS))
38659 return SDValue(); // Nothing to do!
38660 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38661 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38662 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38663 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38664 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38665 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38666 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38667 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38668 return DAG.getBitcast(RootVT, Res);
38669 }
38670 }
38671
38672 // Don't try to re-form single instruction chains under any circumstances now
38673 // that we've done encoding canonicalization for them.
38674 if (Depth < 1)
38675 return SDValue();
38676
38677 // Depth threshold above which we can efficiently use variable mask shuffles.
38678 int VariableCrossLaneShuffleDepth =
38679 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38680 int VariablePerLaneShuffleDepth =
38681 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38682 AllowVariableCrossLaneMask &=
38683 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38684 AllowVariablePerLaneMask &=
38685 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38686 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38687 // higher depth before combining them.
38688 bool AllowBWIVPERMV3 =
38689 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38690
38691 bool MaskContainsZeros = isAnyZero(Mask);
38692
38693 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38694 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38695 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38696 if (Subtarget.hasAVX2() &&
38697 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38698 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38699 Res = CanonicalizeShuffleInput(MaskVT, V1);
38700 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38701 return DAG.getBitcast(RootVT, Res);
38702 }
38703 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38704 if ((Subtarget.hasAVX512() &&
38705 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38706 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38707 (Subtarget.hasBWI() &&
38708 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38709 (Subtarget.hasVBMI() &&
38710 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38711 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38712 V2 = DAG.getUNDEF(MaskVT);
38713 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38714 return DAG.getBitcast(RootVT, Res);
38715 }
38716 }
38717
38718 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38719 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38720 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38721 ((Subtarget.hasAVX512() &&
38722 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38723 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38724 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38725 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38726 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38727 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38728 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38729 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38730 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38731 for (unsigned i = 0; i != NumMaskElts; ++i)
38732 if (Mask[i] == SM_SentinelZero)
38733 Mask[i] = NumMaskElts + i;
38734 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38735 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38736 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38737 return DAG.getBitcast(RootVT, Res);
38738 }
38739
38740 // If that failed and either input is extracted then try to combine as a
38741 // shuffle with the larger type.
38742 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38743 Inputs, Root, BaseMask, Depth, HasVariableMask,
38744 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38745 Subtarget))
38746 return WideShuffle;
38747
38748 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38749 // (non-VLX will pad to 512-bit shuffles).
38750 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38751 ((Subtarget.hasAVX512() &&
38752 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38753 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38754 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38755 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38756 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38757 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38758 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38759 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38760 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38761 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38762 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38763 return DAG.getBitcast(RootVT, Res);
38764 }
38765 return SDValue();
38766 }
38767
38768 // See if we can combine a single input shuffle with zeros to a bit-mask,
38769 // which is much simpler than any shuffle.
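// e.g. shuffle(V1,{0,Z,2,Z}) -> and(V1,{-1,0,-1,0})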
38770 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38771 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38772 TLI.isTypeLegal(MaskVT)) {
38773 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38774 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38775 APInt UndefElts(NumMaskElts, 0);
38776 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38777 for (unsigned i = 0; i != NumMaskElts; ++i) {
38778 int M = Mask[i];
38779 if (M == SM_SentinelUndef) {
38780 UndefElts.setBit(i);
38781 continue;
38782 }
38783 if (M == SM_SentinelZero)
38784 continue;
38785 EltBits[i] = AllOnes;
38786 }
38787 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38788 Res = CanonicalizeShuffleInput(MaskVT, V1);
38789 unsigned AndOpcode =
38790 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38791 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38792 return DAG.getBitcast(RootVT, Res);
38793 }
38794
38795 // If we have a single input shuffle with different shuffle patterns in the
38796 // 128-bit lanes, use the variable mask form of VPERMILPS.
38797 // TODO: Combine other mask types at higher depths.
38798 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38799 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38800 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38801 SmallVector<SDValue, 16> VPermIdx;
38802 for (int M : Mask) {
38803 SDValue Idx =
38804 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38805 VPermIdx.push_back(Idx);
38806 }
38807 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38808 Res = CanonicalizeShuffleInput(MaskVT, V1);
38809 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38810 return DAG.getBitcast(RootVT, Res);
38811 }
38812
38813 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38814 // to VPERMIL2PD/VPERMIL2PS.
38815 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38816 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38817 MaskVT == MVT::v8f32)) {
38818 // VPERMIL2 Operation.
38819 // Bits[3] - Match Bit.
38820 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38821 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
38822 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38823 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38824 SmallVector<int, 8> VPerm2Idx;
38825 unsigned M2ZImm = 0;
38826 for (int M : Mask) {
38827 if (M == SM_SentinelUndef) {
38828 VPerm2Idx.push_back(-1);
38829 continue;
38830 }
38831 if (M == SM_SentinelZero) {
38832 M2ZImm = 2;
38833 VPerm2Idx.push_back(8);
38834 continue;
38835 }
38836 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38837 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38838 VPerm2Idx.push_back(Index);
38839 }
38840 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38841 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38842 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38843 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38844 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38845 return DAG.getBitcast(RootVT, Res);
38846 }
38847
38848 // If we have 3 or more shuffle instructions or a chain involving a variable
38849 // mask, we can replace them with a single PSHUFB instruction profitably.
38850 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
38851 // instructions, but in practice PSHUFB tends to be *very* fast so we're
38852 // more aggressive.
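// In a PSHUFB control byte, bit 7 (0x80) zeroes the destination byte;
// otherwise the low bits index a source byte within the same 128-bit lane.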
38853 if (UnaryShuffle && AllowVariablePerLaneMask &&
38854 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38855 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38856 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38857 SmallVector<SDValue, 16> PSHUFBMask;
38858 int NumBytes = RootVT.getSizeInBits() / 8;
38859 int Ratio = NumBytes / NumMaskElts;
38860 for (int i = 0; i < NumBytes; ++i) {
38861 int M = Mask[i / Ratio];
38862 if (M == SM_SentinelUndef) {
38863 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38864 continue;
38865 }
38866 if (M == SM_SentinelZero) {
38867 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38868 continue;
38869 }
38870 M = Ratio * M + i % Ratio;
38871 assert((M / 16) == (i / 16) && "Lane crossing detected");
38872 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38873 }
38874 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38875 Res = CanonicalizeShuffleInput(ByteVT, V1);
38876 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38877 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38878 return DAG.getBitcast(RootVT, Res);
38879 }
38880
38881 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38882 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38883 // slower than PSHUFB on targets that support both.
38884 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38885 Subtarget.hasXOP()) {
38886 // VPPERM Mask Operation
38887 // Bits[4:0] - Byte Index (0 - 31)
38888 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
38889 SmallVector<SDValue, 16> VPPERMMask;
38890 int NumBytes = 16;
38891 int Ratio = NumBytes / NumMaskElts;
38892 for (int i = 0; i < NumBytes; ++i) {
38893 int M = Mask[i / Ratio];
38894 if (M == SM_SentinelUndef) {
38895 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38896 continue;
38897 }
38898 if (M == SM_SentinelZero) {
38899 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38900 continue;
38901 }
38902 M = Ratio * M + i % Ratio;
38903 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38904 }
38905 MVT ByteVT = MVT::v16i8;
38906 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38907 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38908 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38909 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38910 return DAG.getBitcast(RootVT, Res);
38911 }
38912
38913 // If that failed and either input is extracted then try to combine as a
38914 // shuffle with the larger type.
38915 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38916 Inputs, Root, BaseMask, Depth, HasVariableMask,
38917 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38918 return WideShuffle;
38919
38920 // If we have a dual input shuffle then lower to VPERMV3,
38921 // (non-VLX will pad to 512-bit shuffles)
38922 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38923 ((Subtarget.hasAVX512() &&
38924 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38925 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38926 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38927 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38928 MaskVT == MVT::v16i32)) ||
38929 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38930 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38931 MaskVT == MVT::v32i16)) ||
38932 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38933 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38934 MaskVT == MVT::v64i8)))) {
38935 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38936 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38937 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38938 return DAG.getBitcast(RootVT, Res);
38939 }
38940
38941 // Failed to find any combines.
38942 return SDValue();
38943}
38944
38945// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38946// instruction if possible.
38947//
38948// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38949// type size to attempt to combine:
38950// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38951// -->
38952// extract_subvector(shuffle(x,y,m2),0)
38953 static SDValue combineX86ShuffleChainWithExtract(
38954 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38955 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38956 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38957 const X86Subtarget &Subtarget) {
38958 unsigned NumMaskElts = BaseMask.size();
38959 unsigned NumInputs = Inputs.size();
38960 if (NumInputs == 0)
38961 return SDValue();
38962
38963 EVT RootVT = Root.getValueType();
38964 unsigned RootSizeInBits = RootVT.getSizeInBits();
38965 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38966 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38967
38968 // Peek through extract_subvector to find widest legal vector.
38969 // TODO: Handle ISD::TRUNCATE
38970 unsigned WideSizeInBits = RootSizeInBits;
38971 for (unsigned I = 0; I != NumInputs; ++I) {
38972 SDValue Input = peekThroughBitcasts(Inputs[I]);
38973 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38974 Input = peekThroughBitcasts(Input.getOperand(0));
38975 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38976 WideSizeInBits < Input.getValueSizeInBits())
38977 WideSizeInBits = Input.getValueSizeInBits();
38978 }
38979
38980 // Bail if we fail to find a source larger than the existing root.
38981 unsigned Scale = WideSizeInBits / RootSizeInBits;
38982 if (WideSizeInBits <= RootSizeInBits ||
38983 (WideSizeInBits % RootSizeInBits) != 0)
38984 return SDValue();
38985
38986 // Create new mask for larger type.
38987 SmallVector<int, 64> WideMask(BaseMask);
38988 for (int &M : WideMask) {
38989 if (M < 0)
38990 continue;
38991 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38992 }
38993 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38994
38995 // Attempt to peek through inputs and adjust mask when we extract from an
38996 // upper subvector.
38997 int AdjustedMasks = 0;
38998 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38999 for (unsigned I = 0; I != NumInputs; ++I) {
39000 SDValue &Input = WideInputs[I];
39001 Input = peekThroughBitcasts(Input);
39002 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39003 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
39004 uint64_t Idx = Input.getConstantOperandVal(1);
39005 if (Idx != 0) {
39006 ++AdjustedMasks;
39007 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
39008 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
39009
39010 int lo = I * WideMask.size();
39011 int hi = (I + 1) * WideMask.size();
39012 for (int &M : WideMask)
39013 if (lo <= M && M < hi)
39014 M += Idx;
39015 }
39016 Input = peekThroughBitcasts(Input.getOperand(0));
39017 }
39018 }
39019
39020 // Remove unused/repeated shuffle source ops.
39021 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39022 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39023
39024 // Bail if we're always extracting from the lowest subvectors
39025 // (combineX86ShuffleChain should match this for the current width), or if
39026 // the shuffle still references too many inputs.
39027 if (AdjustedMasks == 0 || WideInputs.size() > 2)
39028 return SDValue();
39029
39030 // Minor canonicalization of the accumulated shuffle mask to make it easier
39031 // to match below. All this does is detect masks with sequential pairs of
39032 // elements, and shrink them to the half-width mask. It does this in a loop
39033 // so it will reduce the size of the mask to the minimal width mask which
39034 // performs an equivalent shuffle.
39035 while (WideMask.size() > 1) {
39036 SmallVector<int, 64> WidenedMask;
39037 if (!canWidenShuffleElements(WideMask, WidenedMask))
39038 break;
39039 WideMask = std::move(WidenedMask);
39040 }
39041
39042 // Canonicalization of binary shuffle masks to improve pattern matching by
39043 // commuting the inputs.
39044 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
39045 ShuffleVectorSDNode::commuteMask(WideMask);
39046 std::swap(WideInputs[0], WideInputs[1]);
39047 }
39048
39049 // Increase depth for every upper subvector we've peeked through.
39050 Depth += AdjustedMasks;
39051
39052 // Attempt to combine wider chain.
39053 // TODO: Can we use a better Root?
39054 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39055 WideInputs.back().getValueSizeInBits()
39056 ? WideInputs.front()
39057 : WideInputs.back();
39058 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
39059 "WideRootSize mismatch");
39060
39061 if (SDValue WideShuffle =
39062 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39063 HasVariableMask, AllowVariableCrossLaneMask,
39064 AllowVariablePerLaneMask, DAG, Subtarget)) {
39065 WideShuffle =
39066 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39067 return DAG.getBitcast(RootVT, WideShuffle);
39068 }
39069
39070 return SDValue();
39071}
39072
39073// Canonicalize the combined shuffle mask chain with horizontal ops.
39074// NOTE: This may update the Ops and Mask.
39075 static SDValue canonicalizeShuffleMaskWithHorizOp(
39076 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39077 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39078 const X86Subtarget &Subtarget) {
39079 if (Mask.empty() || Ops.empty())
39080 return SDValue();
39081
39082 SmallVector<SDValue> BC;
39083 for (SDValue Op : Ops)
39084 BC.push_back(peekThroughBitcasts(Op));
39085
39086 // All ops must be the same horizop + type.
39087 SDValue BC0 = BC[0];
39088 EVT VT0 = BC0.getValueType();
39089 unsigned Opcode0 = BC0.getOpcode();
39090 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39091 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39092 }))
39093 return SDValue();
39094
39095 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39096 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39097 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39098 if (!isHoriz && !isPack)
39099 return SDValue();
39100
39101 // Do all ops have a single use?
39102 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39103 return Op.hasOneUse() &&
39104 peekThroughBitcasts(Op).hasOneUse();
39105 });
39106
39107 int NumElts = VT0.getVectorNumElements();
39108 int NumLanes = VT0.getSizeInBits() / 128;
39109 int NumEltsPerLane = NumElts / NumLanes;
39110 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39111 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39112 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39113
39114 if (NumEltsPerLane >= 4 &&
39115 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39116 SmallVector<int> LaneMask, ScaledMask;
39117 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39118 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39119 // See if we can remove the shuffle by resorting the HOP chain so that
39120 // the HOP args are pre-shuffled.
39121 // TODO: Generalize to any sized/depth chain.
39122 // TODO: Add support for PACKSS/PACKUS.
39123 if (isHoriz) {
39124 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39125 auto GetHOpSrc = [&](int M) {
39126 if (M == SM_SentinelUndef)
39127 return DAG.getUNDEF(VT0);
39128 if (M == SM_SentinelZero)
39129 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39130 SDValue Src0 = BC[M / 4];
39131 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39132 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39133 return Src1.getOperand(M % 2);
39134 return SDValue();
39135 };
39136 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39137 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39138 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39139 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39140 if (M0 && M1 && M2 && M3) {
39141 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39142 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39143 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39144 }
39145 }
39146 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39147 if (Ops.size() >= 2) {
39148 SDValue LHS, RHS;
39149 auto GetHOpSrc = [&](int M, int &OutM) {
39150 // TODO: Support SM_SentinelZero
39151 if (M < 0)
39152 return M == SM_SentinelUndef;
39153 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39154 if (!LHS || LHS == Src) {
39155 LHS = Src;
39156 OutM = (M % 2);
39157 return true;
39158 }
39159 if (!RHS || RHS == Src) {
39160 RHS = Src;
39161 OutM = (M % 2) + 2;
39162 return true;
39163 }
39164 return false;
39165 };
39166 int PostMask[4] = {-1, -1, -1, -1};
39167 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39168 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39169 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39170 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39171 LHS = DAG.getBitcast(SrcVT, LHS);
39172 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39173 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39174 // Use SHUFPS for the permute so this will work on SSE2 targets,
39175 // shuffle combining and domain handling will simplify this later on.
39176 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39177 Res = DAG.getBitcast(ShuffleVT, Res);
39178 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39179 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39180 }
39181 }
39182 }
39183 }
39184
39185 if (2 < Ops.size())
39186 return SDValue();
39187
39188 SDValue BC1 = BC[BC.size() - 1];
39189 if (Mask.size() == VT0.getVectorNumElements()) {
39190 // Canonicalize binary shuffles of horizontal ops that use the
39191 // same sources to a unary shuffle.
39192 // TODO: Try to perform this fold even if the shuffle remains.
39193 if (Ops.size() == 2) {
39194 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39195 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39196 };
39197 // Commute if all BC0's ops are contained in BC1.
39198 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39199 ContainsOps(BC1, BC0.getOperand(1))) {
39200 ShuffleVectorSDNode::commuteMask(Mask);
39201 std::swap(Ops[0], Ops[1]);
39202 std::swap(BC0, BC1);
39203 }
39204
39205 // If BC1 can be represented by BC0, then convert to unary shuffle.
39206 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39207 ContainsOps(BC0, BC1.getOperand(1))) {
39208 for (int &M : Mask) {
39209 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39210 continue;
39211 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39212 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39213 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39214 M += NumHalfEltsPerLane;
39215 }
39216 }
39217 }
39218
39219 // Canonicalize unary horizontal ops to only refer to lower halves.
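// e.g. if BC0 == HADD(X,X), the upper half of each 128-bit lane duplicates
// the lower half, so mask elements picking from an upper half can be shifted
// down by NumHalfEltsPerLane without changing the result.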
39220 for (int i = 0; i != NumElts; ++i) {
39221 int &M = Mask[i];
39222 if (isUndefOrZero(M))
39223 continue;
39224 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39225 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39226 M -= NumHalfEltsPerLane;
39227 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39228 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39229 M -= NumHalfEltsPerLane;
39230 }
39231 }
39232
39233 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39234 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39235 // represents the LHS/RHS inputs for the lower/upper halves.
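// e.g. for v4f64 hops, a per-128-bit-lane repeated mask of <1,3> selects the
// RHS input of each hop, so shuffle(HADD(X,Y),HADD(Z,W)) becomes HADD(Y,W).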
39236 SmallVector<int, 16> TargetMask128, WideMask128;
39237 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39238 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39239 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39240 bool SingleOp = (Ops.size() == 1);
39241 if (isPack || OneUseOps ||
39242 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39243 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39244 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39245 Lo = Lo.getOperand(WideMask128[0] & 1);
39246 Hi = Hi.getOperand(WideMask128[1] & 1);
39247 if (SingleOp) {
39248 SDValue Undef = DAG.getUNDEF(SrcVT);
39249 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39250 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39251 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39252 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39253 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39254 }
39255 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39256 }
39257 }
39258
39259 // If we are post-shuffling a 256-bit hop and not requiring the upper
39260 // elements, then try to narrow to a 128-bit hop directly.
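// e.g. a 256-bit hop whose upper two 64-bit result chunks are undef only
// needs the 128-bit halves of its inputs that feed the low chunks, so it can
// be rebuilt as a 128-bit hop and widened back with undef upper elements.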
39261 SmallVector<int, 16> WideMask64;
39262 if (Ops.size() == 1 && NumLanes == 2 &&
39263 scaleShuffleElements(Mask, 4, WideMask64) &&
39264 isUndefInRange(WideMask64, 2, 2)) {
39265 int M0 = WideMask64[0];
39266 int M1 = WideMask64[1];
39267 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39268 MVT HalfVT = VT0.getHalfNumVectorElementsVT();
39269 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39270 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39271 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39272 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39273 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39274 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39275 }
39276 }
39277
39278 return SDValue();
39279}
39280
39281// Attempt to constant fold all of the constant source ops.
39282// Returns true if the entire shuffle is folded to a constant.
39283// TODO: Extend this to merge multiple constant Ops and update the mask.
39284 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39285 ArrayRef<int> Mask, SDValue Root,
39286 bool HasVariableMask,
39287 SelectionDAG &DAG,
39288 const X86Subtarget &Subtarget) {
39289 MVT VT = Root.getSimpleValueType();
39290
39291 unsigned SizeInBits = VT.getSizeInBits();
39292 unsigned NumMaskElts = Mask.size();
39293 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39294 unsigned NumOps = Ops.size();
39295
39296 // Extract constant bits from each source op.
39297 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39298 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39299 for (unsigned I = 0; I != NumOps; ++I)
39300 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39301 RawBitsOps[I],
39302 /*AllowWholeUndefs*/ true,
39303 /*AllowPartialUndefs*/ true))
39304 return SDValue();
39305
39306 // If we're optimizing for size, only fold if at least one of the constants
39307 // has a single use or the combined shuffle already includes a variable mask
39308 // shuffle; this avoids constant pool bloat.
39309 bool IsOptimizingSize = DAG.shouldOptForSize();
39310 if (IsOptimizingSize && !HasVariableMask &&
39311 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39312 return SDValue();
39313
39314 // Shuffle the constant bits according to the mask.
39315 SDLoc DL(Root);
39316 APInt UndefElts(NumMaskElts, 0);
39317 APInt ZeroElts(NumMaskElts, 0);
39318 APInt ConstantElts(NumMaskElts, 0);
39319 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39320 APInt::getZero(MaskSizeInBits));
39321 for (unsigned i = 0; i != NumMaskElts; ++i) {
39322 int M = Mask[i];
39323 if (M == SM_SentinelUndef) {
39324 UndefElts.setBit(i);
39325 continue;
39326 } else if (M == SM_SentinelZero) {
39327 ZeroElts.setBit(i);
39328 continue;
39329 }
39330 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39331
39332 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39333 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39334
39335 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39336 if (SrcUndefElts[SrcMaskIdx]) {
39337 UndefElts.setBit(i);
39338 continue;
39339 }
39340
39341 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39342 APInt &Bits = SrcEltBits[SrcMaskIdx];
39343 if (!Bits) {
39344 ZeroElts.setBit(i);
39345 continue;
39346 }
39347
39348 ConstantElts.setBit(i);
39349 ConstantBitData[i] = Bits;
39350 }
39351 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39352
39353 // Attempt to create a zero vector.
39354 if ((UndefElts | ZeroElts).isAllOnes())
39355 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39356
39357 // Create the constant data.
39358 MVT MaskSVT;
39359 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39360 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39361 else
39362 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39363
39364 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39365 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39366 return SDValue();
39367
39368 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39369 return DAG.getBitcast(VT, CstOp);
39370}
39371
39372namespace llvm {
39373 namespace X86 {
39374 enum {
39375 MaxShuffleCombineDepth = 8
39376 };
39377 } // namespace X86
39378} // namespace llvm
39379
39380/// Fully generic combining of x86 shuffle instructions.
39381///
39382/// This should be the last combine run over the x86 shuffle instructions. Once
39383/// they have been fully optimized, this will recursively consider all chains
39384/// of single-use shuffle instructions, build a generic model of the cumulative
39385/// shuffle operation, and check for simpler instructions which implement this
39386/// operation. We use this primarily for two purposes:
39387///
39388/// 1) Collapse generic shuffles to specialized single instructions when
39389/// equivalent. In most cases, this is just an encoding size win, but
39390/// sometimes we will collapse multiple generic shuffles into a single
39391/// special-purpose shuffle.
39392/// 2) Look for sequences of shuffle instructions with 3 or more total
39393/// instructions, and replace them with the slightly more expensive SSSE3
39394/// PSHUFB instruction if available. We do this as the last combining step
39395/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39396/// a suitable short sequence of other instructions. The PSHUFB will either
39397/// use a register or have to read from memory and so is slightly (but only
39398/// slightly) more expensive than the other shuffle instructions.
39399///
39400/// Because this is inherently a quadratic operation (for each shuffle in
39401/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39402/// This should never be an issue in practice as the shuffle lowering doesn't
39403/// produce sequences of more than 8 instructions.
39404///
39405/// FIXME: We will currently miss some cases where the redundant shuffling
39406/// would simplify under the threshold for PSHUFB formation because of
39407/// combine-ordering. To fix this, we should do the redundant instruction
39408/// combining in this recursive walk.
39409 static SDValue combineX86ShufflesRecursively(
39410 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39411 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39412 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39413 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39414 const X86Subtarget &Subtarget) {
39415 assert(!RootMask.empty() &&
39416 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39417 "Illegal shuffle root mask");
39418 MVT RootVT = Root.getSimpleValueType();
39419 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39420 unsigned RootSizeInBits = RootVT.getSizeInBits();
39421
39422 // Bound the depth of our recursive combine because this is ultimately
39423 // quadratic in nature.
39424 if (Depth >= MaxDepth)
39425 return SDValue();
39426
39427 // Directly rip through bitcasts to find the underlying operand.
39428 SDValue Op = SrcOps[SrcOpIndex];
39429 Op = peekThroughBitcasts(Op);
39430
39431 EVT VT = Op.getValueType();
39432 if (!VT.isVector() || !VT.isSimple())
39433 return SDValue(); // Bail if we hit a non-simple non-vector.
39434
39435 // FIXME: Just bail on f16 for now.
39436 if (VT.getVectorElementType() == MVT::f16)
39437 return SDValue();
39438
39439 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39440 "Can only combine shuffles upto size of the root op.");
39441
39442 // Create a demanded elts mask from the referenced elements of Op.
39443 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39444 for (int M : RootMask) {
39445 int BaseIdx = RootMask.size() * SrcOpIndex;
39446 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39447 OpDemandedElts.setBit(M - BaseIdx);
39448 }
39449 if (RootSizeInBits != VT.getSizeInBits()) {
39450 // Op is smaller than Root - extract the demanded elts for the subvector.
39451 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39452 unsigned NumOpMaskElts = RootMask.size() / Scale;
39453 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39454 assert(OpDemandedElts
39455 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39456 .isZero() &&
39457 "Out of range elements referenced in root mask");
39458 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39459 }
39460 OpDemandedElts =
39461 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
39462
39463 // Extract target shuffle mask and resolve sentinels and inputs.
39464 SmallVector<int, 64> OpMask;
39465 SmallVector<SDValue, 2> OpInputs;
39466 APInt OpUndef, OpZero;
39467 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39468 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39469 OpZero, DAG, Depth, false)) {
39470 // Shuffle inputs must not be larger than the shuffle result.
39471 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39472 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39473 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39474 }))
39475 return SDValue();
39476 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39477 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39478 !isNullConstant(Op.getOperand(1))) {
39479 SDValue SrcVec = Op.getOperand(0);
39480 int ExtractIdx = Op.getConstantOperandVal(1);
39481 unsigned NumElts = VT.getVectorNumElements();
39482 OpInputs.assign({SrcVec});
39483 OpMask.assign(NumElts, SM_SentinelUndef);
39484 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39485 OpZero = OpUndef = APInt::getZero(NumElts);
39486 } else {
39487 return SDValue();
39488 }
39489
39490 // If the shuffle result was smaller than the root, we need to adjust the
39491 // mask indices and pad the mask with undefs.
39492 if (RootSizeInBits > VT.getSizeInBits()) {
39493 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39494 unsigned OpMaskSize = OpMask.size();
39495 if (OpInputs.size() > 1) {
39496 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39497 for (int &M : OpMask) {
39498 if (M < 0)
39499 continue;
39500 int EltIdx = M % OpMaskSize;
39501 int OpIdx = M / OpMaskSize;
39502 M = (PaddedMaskSize * OpIdx) + EltIdx;
39503 }
39504 }
39505 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39506 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39507 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39508 }
39509
39510 SmallVector<int, 64> Mask;
39511 SmallVector<SDValue, 16> Ops;
39512
39513 // We don't need to merge masks if the root is empty.
39514 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39515 if (EmptyRoot) {
39516 // Only resolve zeros if it will remove an input, otherwise we might end
39517 // up in an infinite loop.
39518 bool ResolveKnownZeros = true;
39519 if (!OpZero.isZero()) {
39520 APInt UsedInputs = APInt::getZero(OpInputs.size());
39521 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39522 int M = OpMask[i];
39523 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39524 continue;
39525 UsedInputs.setBit(M / OpMask.size());
39526 if (UsedInputs.isAllOnes()) {
39527 ResolveKnownZeros = false;
39528 break;
39529 }
39530 }
39531 }
39532 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39533 ResolveKnownZeros);
39534
39535 Mask = OpMask;
39536 Ops.append(OpInputs.begin(), OpInputs.end());
39537 } else {
39538 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39539
39540 // Add the inputs to the Ops list, avoiding duplicates.
39541 Ops.append(SrcOps.begin(), SrcOps.end());
39542
39543 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39544 // Attempt to find an existing match.
39545 SDValue InputBC = peekThroughBitcasts(Input);
39546 for (int i = 0, e = Ops.size(); i < e; ++i)
39547 if (InputBC == peekThroughBitcasts(Ops[i]))
39548 return i;
39549 // Match failed - should we replace an existing Op?
39550 if (InsertionPoint >= 0) {
39551 Ops[InsertionPoint] = Input;
39552 return InsertionPoint;
39553 }
39554 // Add to the end of the Ops list.
39555 Ops.push_back(Input);
39556 return Ops.size() - 1;
39557 };
39558
39559 SmallVector<int, 2> OpInputIdx;
39560 for (SDValue OpInput : OpInputs)
39561 OpInputIdx.push_back(
39562 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39563
39564 assert(((RootMask.size() > OpMask.size() &&
39565 RootMask.size() % OpMask.size() == 0) ||
39566 (OpMask.size() > RootMask.size() &&
39567 OpMask.size() % RootMask.size() == 0) ||
39568 OpMask.size() == RootMask.size()) &&
39569 "The smaller number of elements must divide the larger.");
39570
39571 // This function can be performance-critical, so we rely on the power-of-2
39572 // knowledge that we have about the mask sizes to replace div/rem ops with
39573 // bit-masks and shifts.
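// e.g. with a mask width of 4, a "/ 4" becomes ">> 2" and a "% 4" becomes
// "& 3" in the merge loop below.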
39574 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39575 "Non-power-of-2 shuffle mask sizes");
39576 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39577 "Non-power-of-2 shuffle mask sizes");
39578 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39579 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39580
39581 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39582 unsigned RootRatio =
39583 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39584 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39585 assert((RootRatio == 1 || OpRatio == 1) &&
39586 "Must not have a ratio for both incoming and op masks!");
39587
39588 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39589 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39590 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39591 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39592 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39593
39594 Mask.resize(MaskWidth, SM_SentinelUndef);
39595
39596 // Merge this shuffle operation's mask into our accumulated mask. Note that
39597 // this shuffle's mask will be the first applied to the input, followed by
39598 // the root mask to get us all the way to the root value arrangement. The
39599 // reason for this order is that we are recursing up the operation chain.
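// e.g. for a single-input op whose mask has the same width as the root, this
// computes Mask[i] = OpMask[RootMask[i]]; with RootMask <0,2,1,3> and OpMask
// <3,2,1,0> the accumulated mask becomes <3,1,2,0>.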
39600 for (unsigned i = 0; i < MaskWidth; ++i) {
39601 unsigned RootIdx = i >> RootRatioLog2;
39602 if (RootMask[RootIdx] < 0) {
39603 // This is a zero or undef lane, we're done.
39604 Mask[i] = RootMask[RootIdx];
39605 continue;
39606 }
39607
39608 unsigned RootMaskedIdx =
39609 RootRatio == 1
39610 ? RootMask[RootIdx]
39611 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39612
39613 // Just insert the scaled root mask value if it references an input other
39614 // than the SrcOp we're currently inserting.
39615 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39616 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39617 Mask[i] = RootMaskedIdx;
39618 continue;
39619 }
39620
39621 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39622 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39623 if (OpMask[OpIdx] < 0) {
39624 // The incoming lanes are zero or undef, it doesn't matter which ones we
39625 // are using.
39626 Mask[i] = OpMask[OpIdx];
39627 continue;
39628 }
39629
39630 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39631 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39632 : (OpMask[OpIdx] << OpRatioLog2) +
39633 (RootMaskedIdx & (OpRatio - 1));
39634
39635 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39636 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39637 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39638 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39639
39640 Mask[i] = OpMaskedIdx;
39641 }
39642 }
39643
39644 // Peek through vector widenings and set out of bounds mask indices to undef.
39645 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39646 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39647 SDValue &Op = Ops[I];
39648 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39649 isNullConstant(Op.getOperand(2))) {
39650 Op = Op.getOperand(1);
39651 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39652 int Lo = I * Mask.size();
39653 int Hi = (I + 1) * Mask.size();
39654 int NewHi = Lo + (Mask.size() / Scale);
39655 for (int &M : Mask) {
39656 if (Lo <= M && NewHi <= M && M < Hi)
39657 M = SM_SentinelUndef;
39658 }
39659 }
39660 }
39661
39662 // Peek through any free extract_subvector nodes back to root size.
39663 for (SDValue &Op : Ops)
39664 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39665 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39666 isNullConstant(Op.getOperand(1)))
39667 Op = Op.getOperand(0);
39668
39669 // Remove unused/repeated shuffle source ops.
39670 resolveTargetShuffleInputsAndMask(Ops, Mask);
39671
39672 // Handle the all undef/zero/ones cases early.
39673 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39674 return DAG.getUNDEF(RootVT);
39675 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39676 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39677 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39678 !llvm::is_contained(Mask, SM_SentinelZero))
39679 return getOnesVector(RootVT, DAG, SDLoc(Root));
39680
39681 assert(!Ops.empty() && "Shuffle with no inputs detected");
39682 HasVariableMask |= IsOpVariableMask;
39683
39684 // Update the list of shuffle nodes that have been combined so far.
39685 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39686 SrcNodes.end());
39687 CombinedNodes.push_back(Op.getNode());
39688
39689 // See if we can recurse into each shuffle source op (if it's a target
39690 // shuffle). The source op should only be combined if it either has a
39691 // single use (i.e. the current Op) or all its users have already been
39692 // combined; if not, we can still combine but should prevent generation of
39693 // variable shuffles to avoid constant pool bloat.
39694 // Don't recurse if we already have more source ops than we can combine in
39695 // the remaining recursion depth.
39696 if (Ops.size() < (MaxDepth - Depth)) {
39697 for (int i = 0, e = Ops.size(); i < e; ++i) {
39698 // For empty roots, we need to resolve zeroable elements before combining
39699 // them with other shuffles.
39700 SmallVector<int, 64> ResolvedMask = Mask;
39701 if (EmptyRoot)
39702 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39703 bool AllowCrossLaneVar = false;
39704 bool AllowPerLaneVar = false;
39705 if (Ops[i].getNode()->hasOneUse() ||
39706 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39707 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39708 AllowPerLaneVar = AllowVariablePerLaneMask;
39709 }
39710 if (SDValue Res = combineX86ShufflesRecursively(
39711 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39712 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39713 Subtarget))
39714 return Res;
39715 }
39716 }
39717
39718 // Attempt to constant fold all of the constant source ops.
39719 if (SDValue Cst = combineX86ShufflesConstants(
39720 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39721 return Cst;
39722
39723 // If constant fold failed and we only have constants - then we have
39724 // multiple uses by a single non-variable shuffle - just bail.
39725 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39726 APInt UndefElts;
39727 SmallVector<APInt> RawBits;
39728 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39729 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39730 RawBits,
39731 /*AllowWholeUndefs*/ true,
39732 /*AllowPartialUndefs*/ true);
39733 })) {
39734 return SDValue();
39735 }
39736
39737 // Canonicalize the combined shuffle mask chain with horizontal ops.
39738 // NOTE: This will update the Ops and Mask.
39739 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39740 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39741 return DAG.getBitcast(RootVT, HOp);
39742
39743 // Try to refine our inputs given our knowledge of target shuffle mask.
39744 for (auto I : enumerate(Ops)) {
39745 int OpIdx = I.index();
39746 SDValue &Op = I.value();
39747
39748 // What range of shuffle mask element values results in picking from Op?
39749 int Lo = OpIdx * Mask.size();
39750 int Hi = Lo + Mask.size();
39751
39752 // Which elements of Op do we demand, given the mask's granularity?
39753 APInt OpDemandedElts(Mask.size(), 0);
39754 for (int MaskElt : Mask) {
39755 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39756 int OpEltIdx = MaskElt - Lo;
39757 OpDemandedElts.setBit(OpEltIdx);
39758 }
39759 }
39760
39761 // Is the shuffle result smaller than the root?
39762 if (Op.getValueSizeInBits() < RootSizeInBits) {
39763 // We padded the mask with undefs. But we now need to undo that.
39764 unsigned NumExpectedVectorElts = Mask.size();
39765 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39766 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39767 assert(!OpDemandedElts.extractBits(
39768 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39769 "Demanding the virtual undef widening padding?");
39770 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39771 }
39772
39773 // The Op itself may be of different VT, so we need to scale the mask.
39774 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39775 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39776
39777 // Can this operand be simplified any further, given its demanded elements?
39778 if (SDValue NewOp =
39779 TLI.SimplifyMultipleUseDemandedVectorElts(
39780 Op, OpScaledDemandedElts, DAG))
39781 Op = NewOp;
39782 }
39783 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39784
39785 // Widen any subvector shuffle inputs we've collected.
39786 // TODO: Remove this to avoid generating temporary nodes, we should only
39787 // widen once combineX86ShuffleChain has found a match.
39788 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39789 return Op.getValueSizeInBits() < RootSizeInBits;
39790 })) {
39791 for (SDValue &Op : Ops)
39792 if (Op.getValueSizeInBits() < RootSizeInBits)
39793 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39794 RootSizeInBits);
39795 // Reresolve - we might have repeated subvector sources.
39796 resolveTargetShuffleInputsAndMask(Ops, Mask);
39797 }
39798
39799 // We can only combine unary and binary shuffle mask cases.
39800 if (Ops.size() <= 2) {
39801 // Minor canonicalization of the accumulated shuffle mask to make it easier
39802 // to match below. All this does is detect masks with sequential pairs of
39803 // elements, and shrink them to the half-width mask. It does this in a loop
39804 // so it will reduce the size of the mask to the minimal width mask which
39805 // performs an equivalent shuffle.
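// e.g. <0,1,2,3> widens to <0,1> and then to <0>, i.e. an identity shuffle
// of the single source.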
39806 while (Mask.size() > 1) {
39807 SmallVector<int, 64> WidenedMask;
39808 if (!canWidenShuffleElements(Mask, WidenedMask))
39809 break;
39810 Mask = std::move(WidenedMask);
39811 }
39812
39813 // Canonicalization of binary shuffle masks to improve pattern matching by
39814 // commuting the inputs.
39815 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39816 ShuffleVectorSDNode::commuteMask(Mask);
39817 std::swap(Ops[0], Ops[1]);
39818 }
39819
39820 // Try to combine into a single shuffle instruction.
39821 if (SDValue Shuffle = combineX86ShuffleChain(
39822 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39823 AllowVariablePerLaneMask, DAG, Subtarget))
39824 return Shuffle;
39825
39826 // If all the operands come from the same larger vector, fallthrough and try
39827 // to use combineX86ShuffleChainWithExtract.
39828 SDValue LHS = peekThroughBitcasts(Ops.front());
39829 SDValue RHS = peekThroughBitcasts(Ops.back());
39830 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39831 (RootSizeInBits / Mask.size()) != 64 ||
39832 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39833 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39834 LHS.getOperand(0) != RHS.getOperand(0))
39835 return SDValue();
39836 }
39837
39838 // If that failed and any input is extracted then try to combine as a
39839 // shuffle with the larger type.
39840 return combineX86ShuffleChainWithExtract(
39841 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39842 AllowVariablePerLaneMask, DAG, Subtarget);
39843}
39844
39845/// Helper entry wrapper to combineX86ShufflesRecursively.
39846 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39847 const X86Subtarget &Subtarget) {
39848 return combineX86ShufflesRecursively(
39849 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39850 /*HasVarMask*/ false,
39851 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39852 Subtarget);
39853}
39854
39855/// Get the PSHUF-style mask from PSHUF node.
39856///
39857 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39858/// PSHUF-style masks that can be reused with such instructions.
39859 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39860 MVT VT = N.getSimpleValueType();
39861 SmallVector<int, 4> Mask;
39862 SmallVector<SDValue, 2> Ops;
39863 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
39864 (void)HaveMask;
39865 assert(HaveMask);
39866
39867 // If we have more than 128-bits, only the low 128-bits of shuffle mask
39868 // matter. Check that the upper masks are repeats and remove them.
39869 if (VT.getSizeInBits() > 128) {
39870 int LaneElts = 128 / VT.getScalarSizeInBits();
39871#ifndef NDEBUG
39872 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39873 for (int j = 0; j < LaneElts; ++j)
39874 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39875 "Mask doesn't repeat in high 128-bit lanes!");
39876#endif
39877 Mask.resize(LaneElts);
39878 }
39879
39880 switch (N.getOpcode()) {
39881 case X86ISD::PSHUFD:
39882 return Mask;
39883 case X86ISD::PSHUFLW:
39884 Mask.resize(4);
39885 return Mask;
39886 case X86ISD::PSHUFHW:
39887 Mask.erase(Mask.begin(), Mask.begin() + 4);
39888 for (int &M : Mask)
39889 M -= 4;
39890 return Mask;
39891 default:
39892 llvm_unreachable("No valid shuffle instruction found!");
39893 }
39894}
39895
39896/// Search for a combinable shuffle across a chain ending in pshufd.
39897///
39898/// We walk up the chain and look for a combinable shuffle, skipping over
39899/// shuffles that we could hoist this shuffle's transformation past without
39900/// altering anything.
39901 static SDValue combineRedundantDwordShuffle(SDValue N,
39902 MutableArrayRef<int> Mask,
39903 const SDLoc &DL,
39904 SelectionDAG &DAG) {
39905 assert(N.getOpcode() == X86ISD::PSHUFD &&
39906 "Called with something other than an x86 128-bit half shuffle!");
39907
39908 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39909 // of the shuffles in the chain so that we can form a fresh chain to replace
39910 // this one.
39911 SmallVector<SDValue, 8> Chain;
39912 SDValue V = N.getOperand(0);
39913 for (; V.hasOneUse(); V = V.getOperand(0)) {
39914 switch (V.getOpcode()) {
39915 default:
39916 return SDValue(); // Nothing combined!
39917
39918 case ISD::BITCAST:
39919 // Skip bitcasts as we always know the type for the target specific
39920 // instructions.
39921 continue;
39922
39923 case X86ISD::PSHUFD:
39924 // Found another dword shuffle.
39925 break;
39926
39927 case X86ISD::PSHUFLW:
39928 // Check that the low words (being shuffled) are the identity in the
39929 // dword shuffle, and the high words are self-contained.
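// e.g. a PSHUFD mask of <0,1,3,2> leaves dwords 0 and 1 (which hold the
// words that PSHUFLW permutes) untouched, so the two shuffles can be
// reordered.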
39930 if (Mask[0] != 0 || Mask[1] != 1 ||
39931 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39932 return SDValue();
39933
39934 Chain.push_back(V);
39935 continue;
39936
39937 case X86ISD::PSHUFHW:
39938 // Check that the high words (being shuffled) are the identity in the
39939 // dword shuffle, and the low words are self-contained.
39940 if (Mask[2] != 2 || Mask[3] != 3 ||
39941 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39942 return SDValue();
39943
39944 Chain.push_back(V);
39945 continue;
39946
39947 case X86ISD::UNPCKL:
39948 case X86ISD::UNPCKH:
39949 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39950 // shuffle into a preceding word shuffle.
39951 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39952 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39953 return SDValue();
39954
39955 // Search for a half-shuffle which we can combine with.
39956 unsigned CombineOp =
39957 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39958 if (V.getOperand(0) != V.getOperand(1) ||
39959 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39960 return SDValue();
39961 Chain.push_back(V);
39962 V = V.getOperand(0);
39963 do {
39964 switch (V.getOpcode()) {
39965 default:
39966 return SDValue(); // Nothing to combine.
39967
39968 case X86ISD::PSHUFLW:
39969 case X86ISD::PSHUFHW:
39970 if (V.getOpcode() == CombineOp)
39971 break;
39972
39973 Chain.push_back(V);
39974
39975 [[fallthrough]];
39976 case ISD::BITCAST:
39977 V = V.getOperand(0);
39978 continue;
39979 }
39980 break;
39981 } while (V.hasOneUse());
39982 break;
39983 }
39984 // Break out of the loop if we break out of the switch.
39985 break;
39986 }
39987
39988 if (!V.hasOneUse())
39989 // We fell out of the loop without finding a viable combining instruction.
39990 return SDValue();
39991
39992 // Merge this node's mask and our incoming mask.
39993 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39994 for (int &M : Mask)
39995 M = VMask[M];
39996 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39997 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39998
39999 // Rebuild the chain around this new shuffle.
40000 while (!Chain.empty()) {
40001 SDValue W = Chain.pop_back_val();
40002
40003 if (V.getValueType() != W.getOperand(0).getValueType())
40004 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40005
40006 switch (W.getOpcode()) {
40007 default:
40008 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
40009
40010 case X86ISD::UNPCKL:
40011 case X86ISD::UNPCKH:
40012 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40013 break;
40014
40015 case X86ISD::PSHUFD:
40016 case X86ISD::PSHUFLW:
40017 case X86ISD::PSHUFHW:
40018 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40019 break;
40020 }
40021 }
40022 if (V.getValueType() != N.getValueType())
40023 V = DAG.getBitcast(N.getValueType(), V);
40024
40025 // Return the new chain to replace N.
40026 return V;
40027}
40028
40029// Attempt to commute shufps LHS loads:
40030// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40031 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40032 SelectionDAG &DAG) {
40033 // TODO: Add vXf64 support.
40034 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40035 return SDValue();
40036
40037 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40038 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40039 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40040 return SDValue();
40041 SDValue N0 = V.getOperand(0);
40042 SDValue N1 = V.getOperand(1);
40043 unsigned Imm = V.getConstantOperandVal(2);
40044 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40045 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40046 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40047 return SDValue();
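// Commuting the SHUFP inputs (together with swapping the immediate nibbles
// below) yields the original value with its 64-bit halves swapped per
// 128-bit lane; the callers compensate with the 0xAA/0x0A/0xA0 immediate
// adjustments.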
40048 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40049 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40050 DAG.getTargetConstant(Imm, DL, MVT::i8));
40051 };
40052
40053 switch (N.getOpcode()) {
40054 case X86ISD::VPERMILPI:
40055 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40056 unsigned Imm = N.getConstantOperandVal(1);
40057 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40058 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40059 }
40060 break;
40061 case X86ISD::SHUFP: {
40062 SDValue N0 = N.getOperand(0);
40063 SDValue N1 = N.getOperand(1);
40064 unsigned Imm = N.getConstantOperandVal(2);
40065 if (N0 == N1) {
40066 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40067 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40068 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40069 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40070 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40071 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40072 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40073 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40074 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40075 }
40076 break;
40077 }
40078 }
40079
40080 return SDValue();
40081}
40082
40083// Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
40084// iff we don't demand the same element index for both X and Y.
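// e.g. with both permutes using mask <1,0,3,2> and blend mask <0,5,2,7>, the
// result only demands X[1],X[3] and Y[0],Y[2]; it can be rebuilt as
// permute(blend(X,Y,<4,1,6,3>), <1,0,3,2>).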
40085static SDValue
40086 combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef<int> BlendMask,
40087 const APInt &DemandedElts, SelectionDAG &DAG,
40088 const X86Subtarget &Subtarget, const SDLoc &DL) {
40089 assert(isBlendOrUndef(BlendMask) && "Blend shuffle expected");
40090 if (!N0.hasOneUse() || !N1.hasOneUse())
40091 return SDValue();
40092
40093 unsigned NumElts = VT.getVectorNumElements();
40094 SDValue BC0 = peekThroughOneUseBitcasts(N0);
40095 SDValue BC1 = peekThroughOneUseBitcasts(N1);
40096
40097 // See if both operands are shuffles, and that we can scale the shuffle masks
40098 // to the same width as the blend mask.
40099 // TODO: Support SM_SentinelZero?
40100 SmallVector<SDValue, 2> Ops0, Ops1;
40101 SmallVector<int, 32> Mask0, Mask1, ScaledMask0, ScaledMask1;
40102 if (!getTargetShuffleMask(BC0, /*AllowSentinelZero=*/false, Ops0, Mask0) ||
40103 !getTargetShuffleMask(BC1, /*AllowSentinelZero=*/false, Ops1, Mask1) ||
40104 !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
40105 !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
40106 return SDValue();
40107
40108 // Determine the demanded elts from both permutes.
40109 APInt Demanded0, DemandedLHS0, DemandedRHS0;
40110 APInt Demanded1, DemandedLHS1, DemandedRHS1;
40111 if (!getShuffleDemandedElts(NumElts, BlendMask, DemandedElts, Demanded0,
40112 Demanded1,
40113 /*AllowUndefElts=*/true) ||
40114 !getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
40115 DemandedRHS0, /*AllowUndefElts=*/true) ||
40116 !getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
40117 DemandedRHS1, /*AllowUndefElts=*/true))
40118 return SDValue();
40119
40120 // Confirm that we only use a single operand from both permutes and that we
40121 // don't demand the same index from both.
40122 if (!DemandedRHS0.isZero() || !DemandedRHS1.isZero() ||
40123 DemandedLHS0.intersects(DemandedLHS1))
40124 return SDValue();
40125
40126 // Use the permute demanded elts masks as the new blend mask.
40127 // Create the new permute mask as a blend of the 2 original permute masks.
40128 SmallVector<int, 32> NewBlendMask(NumElts, SM_SentinelUndef);
40129 SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
40130 for (unsigned I = 0; I != NumElts; ++I) {
40131 if (Demanded0[I]) {
40132 int M = ScaledMask0[I];
40133 if (0 <= M) {
40134 assert(isUndefOrEqual(NewBlendMask[M], M) &&
40135 "BlendMask demands LHS AND RHS");
40136 NewBlendMask[M] = M;
40137 NewPermuteMask[I] = M;
40138 }
40139 } else if (Demanded1[I]) {
40140 int M = ScaledMask1[I];
40141 if (0 <= M) {
40142 assert(isUndefOrEqual(NewBlendMask[M], M + NumElts) &&
40143 "BlendMask demands LHS AND RHS");
40144 NewBlendMask[M] = M + NumElts;
40145 NewPermuteMask[I] = M;
40146 }
40147 }
40148 }
40149 assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
40150 assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
40151
40152 // v16i16 shuffles can explode in complexity very easily, only accept them if
40153 // the blend mask is the same in the 128-bit subvectors (or can widen to
40154 // v8i32) and the permute can be widened as well.
40155 if (VT == MVT::v16i16) {
40156 if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
40157 !canWidenShuffleElements(NewBlendMask))
40158 return SDValue();
40159 if (!canWidenShuffleElements(NewPermuteMask))
40160 return SDValue();
40161 }
40162
40163 // Don't introduce lane-crossing permutes without AVX2, unless it can be
40164 // widened to a lane permute (vperm2f128).
40165 if (VT.is256BitVector() && !Subtarget.hasAVX2() &&
40166 isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(),
40167 NewPermuteMask) &&
40168 !canScaleShuffleElements(NewPermuteMask, 2))
40169 return SDValue();
40170
40171 SDValue NewBlend =
40172 DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
40173 DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
40174 return DAG.getVectorShuffle(VT, DL, NewBlend, DAG.getUNDEF(VT),
40175 NewPermuteMask);
40176}
40177
40178// TODO - move this to TLI like isBinOp?
40179static bool isUnaryOp(unsigned Opcode) {
40180 switch (Opcode) {
40181 case ISD::CTLZ:
40182 case ISD::CTTZ:
40183 case ISD::CTPOP:
40184 return true;
40185 }
40186 return false;
40187}
40188
40189// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40190// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
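// Moving the shuffle below the op lets it merge with shuffles, constants or
// splats already feeding the op's sources; the IsMergeableWithShuffle checks
// below only allow the fold when that is likely to pay off.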
40191 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40192 const SDLoc &DL) {
40193 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40194 EVT ShuffleVT = N.getValueType();
40195 unsigned Opc = N.getOpcode();
40196
40197 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40198 bool FoldLoad = false) {
40199 // AllZeros/AllOnes constants are freely shuffled and will peek through
40200 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40201 // merge with target shuffles if it has one use so shuffle combining is
40202 // likely to kick in. Shuffles of splats are expected to be removed.
40203 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40204 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40205 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40206 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40207 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40208 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40209 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40210 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40211 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40212 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40213 };
40214 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40215 // Ensure we only shuffle whole vector src elements, unless it's a logical
40216 // binop where we can more aggressively move shuffles from dst to src.
40217 return isLogicOp(BinOp) ||
40218 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40219 };
40220
40221 switch (Opc) {
40222 // Unary and Unary+Permute Shuffles.
40223 case X86ISD::PSHUFB: {
40224 // Don't merge PSHUFB if it contains zero'd elements.
40225 SmallVector<int> Mask;
40227 if (!getTargetShuffleMask(N, false, Ops, Mask))
40228 break;
40229 [[fallthrough]];
40230 }
40231 case X86ISD::VBROADCAST:
40232 case X86ISD::MOVDDUP:
40233 case X86ISD::PSHUFD:
40234 case X86ISD::PSHUFHW:
40235 case X86ISD::PSHUFLW:
40236 case X86ISD::VPERMI:
40237 case X86ISD::VPERMILPI: {
40238 if (N.getOperand(0).getValueType() == ShuffleVT &&
40239 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40240 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40241 unsigned SrcOpcode = N0.getOpcode();
40242 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40243 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40244 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40245 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40246 Opc != X86ISD::PSHUFB) ||
40247 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40248 Opc != X86ISD::PSHUFB)) {
40249 SDValue LHS, RHS;
40250 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40251 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40252 if (N.getNumOperands() == 2) {
40253 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40254 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40255 } else {
40256 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40257 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40258 }
40259 EVT OpVT = N0.getValueType();
40260 return DAG.getBitcast(ShuffleVT,
40261 DAG.getNode(SrcOpcode, DL, OpVT,
40262 DAG.getBitcast(OpVT, LHS),
40263 DAG.getBitcast(OpVT, RHS)));
40264 }
40265 }
40266 }
40267 break;
40268 }
40269 // Binary and Binary+Permute Shuffles.
40270 case X86ISD::INSERTPS: {
40271 // Don't merge INSERTPS if it contains zero'd elements.
40272 unsigned InsertPSMask = N.getConstantOperandVal(2);
40273 unsigned ZeroMask = InsertPSMask & 0xF;
40274 if (ZeroMask != 0)
40275 break;
40276 [[fallthrough]];
40277 }
40278 case X86ISD::MOVSD:
40279 case X86ISD::MOVSS:
40280 case X86ISD::BLENDI:
40281 case X86ISD::SHUFP:
40282 case X86ISD::UNPCKH:
40283 case X86ISD::UNPCKL: {
40284 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40285 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40286 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40287 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40288 unsigned SrcOpcode = N0.getOpcode();
40289 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40290 N0.getValueType() == N1.getValueType() &&
40291 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40292 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40293 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40294 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40295 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40296 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40297 // Ensure the total number of shuffles doesn't increase by folding this
40298 // shuffle through to the source ops.
40299 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40300 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40301 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40302 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40303 SDValue LHS, RHS;
40304 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40305 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40306 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40307 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40308 if (N.getNumOperands() == 3) {
40309 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40310 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40311 } else {
40312 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40313 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40314 }
40315 EVT OpVT = N0.getValueType();
40316 return DAG.getBitcast(ShuffleVT,
40317 DAG.getNode(SrcOpcode, DL, OpVT,
40318 DAG.getBitcast(OpVT, LHS),
40319 DAG.getBitcast(OpVT, RHS)));
40320 }
40321 }
40322 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40323 N0.getValueType() == N1.getValueType() &&
40324 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40325 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40326 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40327 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40328 SDValue Res;
40329 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40330 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40331 if (N.getNumOperands() == 3) {
40332 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40333 } else {
40334 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40335 }
40336 EVT OpVT = N0.getValueType();
40337 return DAG.getBitcast(
40338 ShuffleVT,
40339 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40340 }
40341 }
40342 break;
40343 }
40344 }
40345 return SDValue();
40346}
40347
40348/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
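/// e.g. vperm2f128(movddup(x),movddup(y)) -> movddup(vperm2f128(x,y)), so the
/// per-lane op is applied once, after the lane shuffle.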
40349 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40350 SelectionDAG &DAG,
40351 const SDLoc &DL) {
40352 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40353
40354 MVT VT = V.getSimpleValueType();
40355 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40356 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40357 unsigned SrcOpc0 = Src0.getOpcode();
40358 unsigned SrcOpc1 = Src1.getOpcode();
40359 EVT SrcVT0 = Src0.getValueType();
40360 EVT SrcVT1 = Src1.getValueType();
40361
40362 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40363 return SDValue();
40364
40365 switch (SrcOpc0) {
40366 case X86ISD::MOVDDUP: {
40367 SDValue LHS = Src0.getOperand(0);
40368 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40369 SDValue Res =
40370 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40371 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40372 return DAG.getBitcast(VT, Res);
40373 }
40374 case X86ISD::VPERMILPI:
40375 // TODO: Handle v4f64 permutes with different low/high lane masks.
40376 if (SrcVT0 == MVT::v4f64) {
40377 uint64_t Mask = Src0.getConstantOperandVal(1);
40378 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40379 break;
40380 }
40381 [[fallthrough]];
40382 case X86ISD::VSHLI:
40383 case X86ISD::VSRLI:
40384 case X86ISD::VSRAI:
40385 case X86ISD::PSHUFD:
40386 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40387 SDValue LHS = Src0.getOperand(0);
40388 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40389 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40390 V.getOperand(2));
40391 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40392 return DAG.getBitcast(VT, Res);
40393 }
40394 break;
40395 }
40396
40397 return SDValue();
40398}
40399
40400/// Try to combine x86 target specific shuffles.
40401 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
40402 SelectionDAG &DAG,
40403 TargetLowering::DAGCombinerInfo &DCI,
40404 const X86Subtarget &Subtarget) {
40405 MVT VT = N.getSimpleValueType();
40406 SmallVector<int, 4> Mask;
40407 unsigned Opcode = N.getOpcode();
40408 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40409
40410 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40411 return R;
40412
40413 // Handle specific target shuffles.
40414 switch (Opcode) {
40415 case X86ISD::MOVDDUP: {
40416 SDValue Src = N.getOperand(0);
40417 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40418 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40419 ISD::isNormalLoad(Src.getNode())) {
40420 LoadSDNode *LN = cast<LoadSDNode>(Src);
40421 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40422 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40423 DCI.CombineTo(N.getNode(), Movddup);
40424 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40425 DCI.recursivelyDeleteUnusedNodes(LN);
40426 return N; // Return N so it doesn't get rechecked!
40427 }
40428 }
40429
40430 return SDValue();
40431 }
40432 case X86ISD::VBROADCAST: {
40433 SDValue Src = N.getOperand(0);
40434 SDValue BC = peekThroughBitcasts(Src);
40435 EVT SrcVT = Src.getValueType();
40436 EVT BCVT = BC.getValueType();
40437
40438 // If broadcasting from another shuffle, attempt to simplify it.
40439 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40440 if (isTargetShuffle(BC.getOpcode()) &&
40441 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40442 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40443 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40444 SM_SentinelUndef);
40445 for (unsigned i = 0; i != Scale; ++i)
40446 DemandedMask[i] = i;
40447 if (SDValue Res = combineX86ShufflesRecursively(
40448 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40449 X86::MaxShuffleCombineDepth,
40450 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40451 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40452 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40453 DAG.getBitcast(SrcVT, Res));
40454 }
40455
40456 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40457 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40458 if (Src.getOpcode() == ISD::BITCAST &&
40459 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40460 TLI.isTypeLegal(BCVT) &&
40461 FixedVectorType::isValidElementType(
40462 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40463 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40464 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40465 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40466 }
40467
40468 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40469 // If we're re-broadcasting a smaller type then broadcast with that type and
40470 // bitcast.
40471 // TODO: Do this for any splat?
40472 if (Src.getOpcode() == ISD::BITCAST &&
40473 (BC.getOpcode() == X86ISD::VBROADCAST ||
40474 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40475 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40476 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40477 MVT NewVT =
40478 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40479 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40480 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40481 }
40482
40483 // Reduce broadcast source vector to lowest 128-bits.
40484 if (SrcVT.getSizeInBits() > 128)
40485 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40486 extract128BitVector(Src, 0, DAG, DL));
40487
40488 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40489 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40490 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40491 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40492
40493 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40494 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40495 isNullConstant(Src.getOperand(1)) &&
40496 Src.getValueType() ==
40497 Src.getOperand(0).getValueType().getScalarType() &&
40498 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40499 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40500
40501 // Share broadcast with the longest vector and extract low subvector (free).
40502 // Ensure the same SDValue from the SDNode use is being used.
40503 for (SDNode *User : Src->uses())
40504 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40505 Src == User->getOperand(0) &&
40506 User->getValueSizeInBits(0).getFixedValue() >
40507 VT.getFixedSizeInBits()) {
40508 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40509 VT.getSizeInBits());
40510 }
40511
40512 // vbroadcast(scalarload X) -> vbroadcast_load X
40513 // For float loads, extract other uses of the scalar from the broadcast.
40514 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40515 ISD::isNormalLoad(Src.getNode())) {
40516 LoadSDNode *LN = cast<LoadSDNode>(Src);
40517 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40518 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40519 SDValue BcastLd =
40520 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40521 LN->getMemoryVT(), LN->getMemOperand());
40522 // If the load value is used only by N, replace it via CombineTo N.
40523 bool NoReplaceExtract = Src.hasOneUse();
40524 DCI.CombineTo(N.getNode(), BcastLd);
40525 if (NoReplaceExtract) {
40526 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40527 DCI.recursivelyDeleteUnusedNodes(LN);
40528 } else {
40529 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40530 DAG.getIntPtrConstant(0, DL));
40531 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40532 }
40533 return N; // Return N so it doesn't get rechecked!
40534 }
40535
40536 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40537 // i16. So shrink it ourselves if we can make a broadcast_load.
40538 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40539 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40540 assert(Subtarget.hasAVX2() && "Expected AVX2");
40541 SDValue TruncIn = Src.getOperand(0);
40542
40543 // If this is a truncate of a non-extending load we can just narrow it to
40544 // use a broadcast_load.
40545 if (ISD::isNormalLoad(TruncIn.getNode())) {
40546 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40547 // Unless it's volatile or atomic.
40548 if (LN->isSimple()) {
40549 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40550 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40551 SDValue BcastLd = DAG.getMemIntrinsicNode(
40552 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40553 LN->getPointerInfo(), LN->getOriginalAlign(),
40554 LN->getMemOperand()->getFlags());
40555 DCI.CombineTo(N.getNode(), BcastLd);
40556 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40557 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40558 return N; // Return N so it doesn't get rechecked!
40559 }
40560 }
40561
40562 // If this is a truncate of an i16 extload, we can directly replace it.
40563 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40564 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40565 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40566 if (LN->getMemoryVT().getSizeInBits() == 16) {
40567 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40568 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40569 SDValue BcastLd =
40570 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40571 LN->getMemoryVT(), LN->getMemOperand());
40572 DCI.CombineTo(N.getNode(), BcastLd);
40573 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40574 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40575 return N; // Return N so it doesn't get rechecked!
40576 }
40577 }
40578
40579 // If this is a truncate of load that has been shifted right, we can
40580 // offset the pointer and use a narrower load.
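// e.g. broadcast(trunc(srl(load(p), 16))) can instead broadcast_load the i16
// at address p+2, provided the shift amount is a multiple of 16 bits.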
40581 if (TruncIn.getOpcode() == ISD::SRL &&
40582 TruncIn.getOperand(0).hasOneUse() &&
40583 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40584 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40585 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40586 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40587 // Make sure the shift amount and the load size are divisible by 16.
40588 // Don't do this if the load is volatile or atomic.
40589 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40590 LN->isSimple()) {
40591 unsigned Offset = ShiftAmt / 8;
40592 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40593 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
40594 TypeSize::getFixed(Offset), DL);
40595 SDValue Ops[] = { LN->getChain(), Ptr };
40596 SDValue BcastLd = DAG.getMemIntrinsicNode(
40597 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40598 LN->getPointerInfo().getWithOffset(Offset),
40599 LN->getOriginalAlign(),
40600 LN->getMemOperand()->getFlags());
40601 DCI.CombineTo(N.getNode(), BcastLd);
40602 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40603 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40604 return N; // Return N so it doesn't get rechecked!
40605 }
40606 }
40607 }
40608
40609 // vbroadcast(vzload X) -> vbroadcast_load X
40610 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40611 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40612 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40613 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40614 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40615 SDValue BcastLd =
40616 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40617 LN->getMemoryVT(), LN->getMemOperand());
40618 DCI.CombineTo(N.getNode(), BcastLd);
40619 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40620 DCI.recursivelyDeleteUnusedNodes(LN);
40621 return N; // Return N so it doesn't get rechecked!
40622 }
40623 }
40624
40625 // vbroadcast(vector load X) -> vbroadcast_load
40626 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40627 SrcVT == MVT::v4i32) &&
40628 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40629 LoadSDNode *LN = cast<LoadSDNode>(Src);
40630 // Unless the load is volatile or atomic.
40631 if (LN->isSimple()) {
40632 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40633 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40634 SDValue BcastLd = DAG.getMemIntrinsicNode(
40635 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40636 LN->getPointerInfo(), LN->getOriginalAlign(),
40637 LN->getMemOperand()->getFlags());
40638 DCI.CombineTo(N.getNode(), BcastLd);
40639 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40640 DCI.recursivelyDeleteUnusedNodes(LN);
40641 return N; // Return N so it doesn't get rechecked!
40642 }
40643 }
40644
40645 return SDValue();
40646 }
40647 case X86ISD::VZEXT_MOVL: {
40648 SDValue N0 = N.getOperand(0);
40649
40650 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
40651 // the load is volatile.
40652 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40653 auto *LN = cast<LoadSDNode>(N0);
40654 if (SDValue VZLoad =
40655 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40656 DCI.CombineTo(N.getNode(), VZLoad);
40657 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40658 DCI.recursivelyDeleteUnusedNodes(LN);
40659 return N;
40660 }
40661 }
40662
40663 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40664 // and can just use a VZEXT_LOAD.
40665 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40666 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40667 auto *LN = cast<MemSDNode>(N0);
40668 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40669 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40670 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40671 SDValue VZLoad =
40672 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
40673 LN->getMemoryVT(), LN->getMemOperand());
40674 DCI.CombineTo(N.getNode(), VZLoad);
40675 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40676 DCI.recursivelyDeleteUnusedNodes(LN);
40677 return N;
40678 }
40679 }
40680
40681 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40682 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40683 // if the upper bits of the i64 are zero.
40684 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40685 N0.getOperand(0).hasOneUse() &&
40686 N0.getOperand(0).getValueType() == MVT::i64) {
40687 SDValue In = N0.getOperand(0);
40688 APInt Mask = APInt::getHighBitsSet(64, 32);
40689 if (DAG.MaskedValueIsZero(In, Mask)) {
40690 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40691 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40692 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40693 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40694 return DAG.getBitcast(VT, Movl);
40695 }
40696 }
40697
40698 // Load a scalar integer constant directly to XMM instead of transferring an
40699 // immediate value from GPR.
40700 // vzext_movl (scalar_to_vector C) --> load [C,0...]
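// e.g. (v4i32 (vzext_movl (scalar_to_vector (i32 42)))) becomes a constant
// pool load of <i32 42, i32 0, i32 0, i32 0>.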
40701 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40702 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40703 // Create a vector constant - scalar constant followed by zeros.
40704 EVT ScalarVT = N0.getOperand(0).getValueType();
40705 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40706 unsigned NumElts = VT.getVectorNumElements();
40707 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40708 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40709 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40710
40711 // Load the vector constant from constant pool.
40712 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40713 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40714 MachinePointerInfo MPI =
40715 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
40716 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
40717 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
40718 MachineMemOperand::MOLoad);
40719 }
40720 }
40721
40722 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40723 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40724 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
40725 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
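// i.e. vzext_movl (insert_subvector undef, X, 0)
// --> insert_subvector allzeros, (vzext_movl X), 0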
40726 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40727 SDValue V = peekThroughOneUseBitcasts(N0);
40728
40729 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40730 isNullConstant(V.getOperand(2))) {
40731 SDValue In = V.getOperand(1);
40732 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40733 In.getValueSizeInBits() /
40734 VT.getScalarSizeInBits());
40735 In = DAG.getBitcast(SubVT, In);
40736 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40737 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40738 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40739 V.getOperand(2));
40740 }
40741 }
40742
40743 return SDValue();
40744 }
40745 case X86ISD::BLENDI: {
40746 SDValue N0 = N.getOperand(0);
40747 SDValue N1 = N.getOperand(1);
40748
40749 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40750 // TODO: Handle MVT::v16i16 repeated blend mask.
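// e.g. a v4i64 blend of bitcast v8i32 operands with immediate 0b0101 becomes a
// v8i32 blend with immediate 0b00110011, bitcast back to v4i64.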
40751 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40752 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40753 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40754 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40755 SrcVT.getScalarSizeInBits() >= 32) {
40756 unsigned Size = VT.getVectorNumElements();
40757 unsigned NewSize = SrcVT.getVectorNumElements();
40758 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
40759 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
40760 return DAG.getBitcast(
40761 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40762 N1.getOperand(0),
40763 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
40764 DL, MVT::i8)));
40765 }
40766 }
40767 return SDValue();
40768 }
40769 case X86ISD::SHUFP: {
40770 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40771 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40772 // TODO: Support types other than v4f32.
40773 if (VT == MVT::v4f32) {
40774 bool Updated = false;
40775 SmallVector<int> Mask;
40776 SmallVector<SDValue> Ops;
40777 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
40778 for (int i = 0; i != 2; ++i) {
40779 SmallVector<SDValue> SubOps;
40780 SmallVector<int> SubMask, SubScaledMask;
40781 SDValue Sub = peekThroughBitcasts(Ops[i]);
40782 // TODO: Scaling might be easier if we specify the demanded elts.
40783 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40784 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40785 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40786 int Ofs = i * 2;
40787 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40788 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40789 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40790 Updated = true;
40791 }
40792 }
40793 }
40794 if (Updated) {
40795 for (int &M : Mask)
40796 M %= 4;
40797 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40798 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40799 }
40800 }
40801 return SDValue();
40802 }
40803 case X86ISD::VPERMI: {
40804 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40805 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40806 SDValue N0 = N.getOperand(0);
40807 SDValue N1 = N.getOperand(1);
40808 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40809 if (N0.getOpcode() == ISD::BITCAST &&
40810 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40811 SDValue Src = N0.getOperand(0);
40812 EVT SrcVT = Src.getValueType();
40813 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40814 return DAG.getBitcast(VT, Res);
40815 }
40816 return SDValue();
40817 }
40818 case X86ISD::SHUF128: {
40819 // If we're permuting the upper 256-bit subvectors of a concatenation, then
40820 // see if we can peek through and access the subvector directly.
40821 if (VT.is512BitVector()) {
40822 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
40823 // upper subvector is used.
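// The immediate holds 4 x 2-bit lane indices: bits [3:0] select the two result
// lanes taken from LHS and bits [7:4] the two taken from RHS. Testing 0x0A
// (resp. 0xA0) checks that both selector msbs are set, i.e. only lanes 2-3 (the
// upper 256 bits) of that operand are referenced; clearing those bits afterwards
// retargets the selectors at the widened upper half.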
40824 SDValue LHS = N->getOperand(0);
40825 SDValue RHS = N->getOperand(1);
40826 uint64_t Mask = N->getConstantOperandVal(2);
40827 SmallVector<SDValue> LHSOps, RHSOps;
40828 SDValue NewLHS, NewRHS;
40829 if ((Mask & 0x0A) == 0x0A &&
40830 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
40831 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
40832 Mask &= ~0x0A;
40833 }
40834 if ((Mask & 0xA0) == 0xA0 &&
40835 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
40836 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
40837 Mask &= ~0xA0;
40838 }
40839 if (NewLHS || NewRHS)
40840 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
40841 NewRHS ? NewRHS : RHS,
40842 DAG.getTargetConstant(Mask, DL, MVT::i8));
40843 }
40844 return SDValue();
40845 }
40846 case X86ISD::VPERM2X128: {
40847 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40848 SDValue LHS = N->getOperand(0);
40849 SDValue RHS = N->getOperand(1);
40850 if (LHS.getOpcode() == ISD::BITCAST &&
40851 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40852 EVT SrcVT = LHS.getOperand(0).getValueType();
40853 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40854 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40855 DAG.getBitcast(SrcVT, LHS),
40856 DAG.getBitcast(SrcVT, RHS),
40857 N->getOperand(2)));
40858 }
40859 }
40860
40861 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40862 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40863 return Res;
40864
40865 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40866 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
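// The immediate is two nibbles, one per 128-bit result half: values 0/1 select
// the low/high half of the first source and 2/3 of the second source, while bit
// 3 of a nibble zeroes that half (FindSubVector128 rejects such indices).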
40867 auto FindSubVector128 = [&](unsigned Idx) {
40868 if (Idx > 3)
40869 return SDValue();
40870 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40871 SmallVector<SDValue> SubOps;
40872 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40873 return SubOps[Idx & 1];
40874 unsigned NumElts = Src.getValueType().getVectorNumElements();
40875 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40876 Src.getOperand(1).getValueSizeInBits() == 128 &&
40877 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40878 return Src.getOperand(1);
40879 }
40880 return SDValue();
40881 };
40882 unsigned Imm = N.getConstantOperandVal(2);
40883 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40884 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40885 MVT SubVT = VT.getHalfNumVectorElementsVT();
40886 SubLo = DAG.getBitcast(SubVT, SubLo);
40887 SubHi = DAG.getBitcast(SubVT, SubHi);
40888 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40889 }
40890 }
40891 return SDValue();
40892 }
40893 case X86ISD::PSHUFD:
40894 case X86ISD::PSHUFLW:
40895 case X86ISD::PSHUFHW: {
40896 SDValue N0 = N.getOperand(0);
40897 SDValue N1 = N.getOperand(1);
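// If the (bitcast) source is a vector shift/rotate by a uniform amount, push
// the shuffle through it: shuffle(shift(x)) == shift(shuffle(x)) since every
// element is shifted identically and the shuffle elements are at least as wide
// as the shifted elements, so no shifted element is ever split.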
40898 if (N0->hasOneUse()) {
40899 SDValue V = peekThroughOneUseBitcasts(N0);
40900 switch (V.getOpcode()) {
40901 case X86ISD::VSHL:
40902 case X86ISD::VSRL:
40903 case X86ISD::VSRA:
40904 case X86ISD::VSHLI:
40905 case X86ISD::VSRLI:
40906 case X86ISD::VSRAI:
40907 case X86ISD::VROTLI:
40908 case X86ISD::VROTRI: {
40909 MVT InnerVT = V.getSimpleValueType();
40910 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40911 SDValue Res = DAG.getNode(Opcode, DL, VT,
40912 DAG.getBitcast(VT, V.getOperand(0)), N1);
40913 Res = DAG.getBitcast(InnerVT, Res);
40914 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40915 return DAG.getBitcast(VT, Res);
40916 }
40917 break;
40918 }
40919 }
40920 }
40921
40922 Mask = getPSHUFShuffleMask(N);
40923 assert(Mask.size() == 4);
40924 break;
40925 }
40926 case X86ISD::MOVSD:
40927 case X86ISD::MOVSH:
40928 case X86ISD::MOVSS: {
40929 SDValue N0 = N.getOperand(0);
40930 SDValue N1 = N.getOperand(1);
40931
40932 // Canonicalize scalar FPOps:
40933 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40934 // If commutable, allow OP(N1[0], N0[0]).
40935 unsigned Opcode1 = N1.getOpcode();
40936 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40937 Opcode1 == ISD::FDIV) {
40938 SDValue N10 = N1.getOperand(0);
40939 SDValue N11 = N1.getOperand(1);
40940 if (N10 == N0 ||
40941 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40942 if (N10 != N0)
40943 std::swap(N10, N11);
40944 MVT SVT = VT.getVectorElementType();
40945 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40946 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40947 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40948 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40949 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40950 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40951 }
40952 }
40953
40954 return SDValue();
40955 }
40956 case X86ISD::INSERTPS: {
40957 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40958 SDValue Op0 = N.getOperand(0);
40959 SDValue Op1 = N.getOperand(1);
40960 unsigned InsertPSMask = N.getConstantOperandVal(2);
40961 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40962 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40963 unsigned ZeroMask = InsertPSMask & 0xF;
40964
40965 // If we zero out all elements from Op0 then we don't need to reference it.
40966 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40967 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40968 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40969
40970 // If we zero out the element from Op1 then we don't need to reference it.
40971 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40972 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40973 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40974
40975 // Attempt to merge insertps Op1 with an inner target shuffle node.
40976 SmallVector<int, 8> TargetMask1;
40977 SmallVector<SDValue, 2> Ops1;
40978 APInt KnownUndef1, KnownZero1;
40979 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40980 KnownZero1)) {
40981 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40982 // Zero/UNDEF insertion - zero out element and remove dependency.
40983 InsertPSMask |= (1u << DstIdx);
40984 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40985 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40986 }
40987 // Update insertps mask srcidx and reference the source input directly.
40988 int M = TargetMask1[SrcIdx];
40989 assert(0 <= M && M < 8 && "Shuffle index out of range");
40990 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40991 Op1 = Ops1[M < 4 ? 0 : 1];
40992 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40993 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40994 }
40995
40996 // Attempt to merge insertps Op0 with an inner target shuffle node.
40997 SmallVector<int, 8> TargetMask0;
40998 SmallVector<SDValue, 2> Ops0;
40999 APInt KnownUndef0, KnownZero0;
41000 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41001 KnownZero0)) {
41002 bool Updated = false;
41003 bool UseInput00 = false;
41004 bool UseInput01 = false;
41005 for (int i = 0; i != 4; ++i) {
41006 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41007 // No change if element is already zero or the inserted element.
41008 continue;
41009 }
41010
41011 if (KnownUndef0[i] || KnownZero0[i]) {
41012 // If the target mask is undef/zero then we must zero the element.
41013 InsertPSMask |= (1u << i);
41014 Updated = true;
41015 continue;
41016 }
41017
41018 // The input vector element must be in-line (element i of either shuffle input).
41019 int M = TargetMask0[i];
41020 if (M != i && M != (i + 4))
41021 return SDValue();
41022
41023 // Determine which inputs of the target shuffle we're using.
41024 UseInput00 |= (0 <= M && M < 4);
41025 UseInput01 |= (4 <= M);
41026 }
41027
41028 // If we're not using both inputs of the target shuffle then use the
41029 // referenced input directly.
41030 if (UseInput00 && !UseInput01) {
41031 Updated = true;
41032 Op0 = Ops0[0];
41033 } else if (!UseInput00 && UseInput01) {
41034 Updated = true;
41035 Op0 = Ops0[1];
41036 }
41037
41038 if (Updated)
41039 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41040 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41041 }
41042
41043 // If we're inserting an element from a vbroadcast load, fold the
41044 // load into the X86insertps instruction. We need to convert the scalar
41045 // load to a vector and clear the source lane of the INSERTPS control.
41046 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41047 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41048 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41049 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41050 MemIntr->getBasePtr(),
41051 MemIntr->getMemOperand());
41052 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41053 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
41054 Load),
41055 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41056 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41057 return Insert;
41058 }
41059 }
41060
41061 return SDValue();
41062 }
41063 default:
41064 return SDValue();
41065 }
41066
41067 // Nuke no-op shuffles that show up after combining.
41068 if (isNoopShuffleMask(Mask))
41069 return N.getOperand(0);
41070
41071 // Look for simplifications involving one or two shuffle instructions.
41072 SDValue V = N.getOperand(0);
41073 switch (N.getOpcode()) {
41074 default:
41075 break;
41076 case X86ISD::PSHUFLW:
41077 case X86ISD::PSHUFHW:
41078 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41079
41080 // See if this reduces to a PSHUFD which is no more expensive and can
41081 // combine with more operations. Note that it has to at least flip the
41082 // dwords as otherwise it would have been removed as a no-op.
41083 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
41084 int DMask[] = {0, 1, 2, 3};
41085 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41086 DMask[DOffset + 0] = DOffset + 1;
41087 DMask[DOffset + 1] = DOffset + 0;
41088 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41089 V = DAG.getBitcast(DVT, V);
41090 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41091 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41092 return DAG.getBitcast(VT, V);
41093 }
41094
41095 // Look for shuffle patterns which can be implemented as a single unpack.
41096 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41097 // only works when we have a PSHUFD followed by two half-shuffles.
41098 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41099 (V.getOpcode() == X86ISD::PSHUFLW ||
41100 V.getOpcode() == X86ISD::PSHUFHW) &&
41101 V.getOpcode() != N.getOpcode() &&
41102 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41103 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41104 if (D.getOpcode() == X86ISD::PSHUFD) {
41105 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41106 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41107 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41108 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41109 int WordMask[8];
41110 for (int i = 0; i < 4; ++i) {
41111 WordMask[i + NOffset] = Mask[i] + NOffset;
41112 WordMask[i + VOffset] = VMask[i] + VOffset;
41113 }
41114 // Map the word mask through the DWord mask.
41115 int MappedMask[8];
41116 for (int i = 0; i < 8; ++i)
41117 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41118 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41119 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41120 // We can replace all three shuffles with an unpack.
41121 V = DAG.getBitcast(VT, D.getOperand(0));
41122 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41123 : X86ISD::UNPCKH,
41124 DL, VT, V, V);
41125 }
41126 }
41127 }
41128
41129 break;
41130
41131 case X86ISD::PSHUFD:
41132 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
41133 return NewN;
41134
41135 break;
41136 }
41137
41138 return SDValue();
41139}
41140
41141/// Checks if the shuffle mask takes subsequent elements
41142/// alternately from two vectors.
41143/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
41144static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41145
41146 int ParitySrc[2] = {-1, -1};
41147 unsigned Size = Mask.size();
41148 for (unsigned i = 0; i != Size; ++i) {
41149 int M = Mask[i];
41150 if (M < 0)
41151 continue;
41152
41153 // Make sure we are using the matching element from the input.
41154 if ((M % Size) != i)
41155 return false;
41156
41157 // Make sure we use the same input for all elements of the same parity.
41158 int Src = M / Size;
41159 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41160 return false;
41161 ParitySrc[i % 2] = Src;
41162 }
41163
41164 // Make sure each input is used.
41165 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41166 return false;
41167
41168 Op0Even = ParitySrc[0] == 0;
41169 return true;
41170}
41171
41172/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
41173/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
41174/// are written to the parameters \p Opnd0 and \p Opnd1.
41175///
41176/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
41177/// so it is easier to generically match. We also insert dummy vector shuffle
41178/// nodes for the operands which explicitly discard the lanes which are unused
41179/// by this operation to try to flow through the rest of the combiner
41180/// the fact that they're unused.
41181static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41182 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41183 bool &IsSubAdd) {
41184
41185 EVT VT = N->getValueType(0);
41186 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41187 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41188 !VT.getSimpleVT().isFloatingPoint())
41189 return false;
41190
41191 // We only handle target-independent shuffles.
41192 // FIXME: It would be easy and harmless to use the target shuffle mask
41193 // extraction tool to support more.
41194 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41195 return false;
41196
41197 SDValue V1 = N->getOperand(0);
41198 SDValue V2 = N->getOperand(1);
41199
41200 // Make sure we have an FADD and an FSUB.
41201 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41202 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41203 V1.getOpcode() == V2.getOpcode())
41204 return false;
41205
41206 // If there are other uses of these operations we can't fold them.
41207 if (!V1->hasOneUse() || !V2->hasOneUse())
41208 return false;
41209
41210 // Ensure that both operations have the same operands. Note that we can
41211 // commute the FADD operands.
41212 SDValue LHS, RHS;
41213 if (V1.getOpcode() == ISD::FSUB) {
41214 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41215 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41216 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41217 return false;
41218 } else {
41219 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41220 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41221 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41222 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41223 return false;
41224 }
41225
41226 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41227 bool Op0Even;
41228 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41229 return false;
41230
41231 // It's a subadd if the vector in the even parity is an FADD.
41232 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41233 : V2->getOpcode() == ISD::FADD;
41234
41235 Opnd0 = LHS;
41236 Opnd1 = RHS;
41237 return true;
41238}
41239
41240/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41241 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
41242 const X86Subtarget &Subtarget,
41243 SelectionDAG &DAG) {
41244 // We only handle target-independent shuffles.
41245 // FIXME: It would be easy and harmless to use the target shuffle mask
41246 // extraction tool to support more.
41247 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41248 return SDValue();
41249
41250 MVT VT = N->getSimpleValueType(0);
41251 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41252 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41253 return SDValue();
41254
41255 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41256 SDValue Op0 = N->getOperand(0);
41257 SDValue Op1 = N->getOperand(1);
41258 SDValue FMAdd = Op0, FMSub = Op1;
41259 if (FMSub.getOpcode() != X86ISD::FMSUB)
41260 std::swap(FMAdd, FMSub);
41261
41262 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41263 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41264 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41265 FMAdd.getOperand(2) != FMSub.getOperand(2))
41266 return SDValue();
41267
41268 // Check for correct shuffle mask.
41269 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41270 bool Op0Even;
41271 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41272 return SDValue();
41273
41274 // FMAddSub takes zeroth operand from FMSub node.
41275 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41276 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41277 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41278 FMAdd.getOperand(2));
41279}
41280
41281/// Try to combine a shuffle into a target-specific add-sub or
41282/// mul-add-sub node.
41283 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
41284 const X86Subtarget &Subtarget,
41285 SelectionDAG &DAG) {
41286 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41287 return V;
41288
41289 SDValue Opnd0, Opnd1;
41290 bool IsSubAdd;
41291 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41292 return SDValue();
41293
41294 MVT VT = N->getSimpleValueType(0);
41295
41296 // Try to generate X86ISD::FMADDSUB node here.
41297 SDValue Opnd2;
41298 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41299 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41300 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41301 }
41302
41303 if (IsSubAdd)
41304 return SDValue();
41305
41306 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41307 // the ADDSUB idiom has been successfully recognized. There are no known
41308 // X86 targets with 512-bit ADDSUB instructions!
41309 if (VT.is512BitVector())
41310 return SDValue();
41311
41312 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41313 // the ADDSUB idiom has been successfully recognized. There are no known
41314 // X86 targets with FP16 ADDSUB instructions!
41315 if (VT.getVectorElementType() == MVT::f16)
41316 return SDValue();
41317
41318 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41319}
41320
41321// We are looking for a shuffle where both sources are concatenated with undef
41322// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41323// if we can express this as a single-source shuffle, that's preferable.
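// e.g. with v8i32 operands:
// shuffle(concat(t1,undef), concat(t2,undef), <0,8,1,9,2,10,3,11>)
// --> shuffle(concat(t1,t2), undef, <0,4,1,5,2,6,3,7>)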
41324 static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
41325 SelectionDAG &DAG,
41326 const X86Subtarget &Subtarget) {
41327 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41328 return SDValue();
41329
41330 EVT VT = N->getValueType(0);
41331
41332 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41333 if (!VT.is128BitVector() && !VT.is256BitVector())
41334 return SDValue();
41335
41336 if (VT.getVectorElementType() != MVT::i32 &&
41337 VT.getVectorElementType() != MVT::i64 &&
41338 VT.getVectorElementType() != MVT::f32 &&
41339 VT.getVectorElementType() != MVT::f64)
41340 return SDValue();
41341
41342 SDValue N0 = N->getOperand(0);
41343 SDValue N1 = N->getOperand(1);
41344
41345 // Check that both sources are concats with undef.
41346 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41347 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41348 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41349 !N1.getOperand(1).isUndef())
41350 return SDValue();
41351
41352 // Construct the new shuffle mask. Elements from the first source retain their
41353 // index, but elements from the second source no longer need to skip an undef.
41354 SmallVector<int, 8> Mask;
41355 int NumElts = VT.getVectorNumElements();
41356
41357 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41358 for (int Elt : SVOp->getMask())
41359 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41360
41361 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41362 N1.getOperand(0));
41363 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41364}
41365
41366/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41367/// low half of each source vector and does not set any high half elements in
41368/// the destination vector, narrow the shuffle to half its original size.
41369 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41370 EVT VT = Shuf->getValueType(0);
41371 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41372 return SDValue();
41373 if (!VT.is256BitVector() && !VT.is512BitVector())
41374 return SDValue();
41375
41376 // See if we can ignore all of the high elements of the shuffle.
41377 ArrayRef<int> Mask = Shuf->getMask();
41378 if (!isUndefUpperHalf(Mask))
41379 return SDValue();
41380
41381 // Check if the shuffle mask accesses only the low half of each input vector
41382 // (half-index output is 0 or 2).
41383 int HalfIdx1, HalfIdx2;
41384 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41385 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41386 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41387 return SDValue();
41388
41389 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41390 // The trick is knowing that all of the insert/extract are actually free
41391 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41392 // of narrow inputs into a narrow output, and that is always cheaper than
41393 // the wide shuffle that we started with.
41394 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41395 Shuf->getOperand(1), HalfMask, HalfIdx1,
41396 HalfIdx2, false, DAG, /*UseConcat*/ true);
41397}
41398
41399 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41400 TargetLowering::DAGCombinerInfo &DCI,
41401 const X86Subtarget &Subtarget) {
41402 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41403 if (SDValue V = narrowShuffle(Shuf, DAG))
41404 return V;
41405
41406 // If we have legalized the vector types, look for blends of FADD and FSUB
41407 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41408 SDLoc dl(N);
41409 EVT VT = N->getValueType(0);
41410 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41411 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41412 if (SDValue AddSub =
41413 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41414 return AddSub;
41415
41416 // Attempt to combine into a vector load/broadcast.
41417 if (SDValue LD = combineToConsecutiveLoads(
41418 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41419 return LD;
41420
41421 // For AVX2, we sometimes want to combine
41422 // (vector_shuffle <mask> (concat_vectors t1, undef)
41423 // (concat_vectors t2, undef))
41424 // Into:
41425 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41426 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41427 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41428 return ShufConcat;
41429
41430 if (isTargetShuffle(N->getOpcode())) {
41431 SDValue Op(N, 0);
41432 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41433 return Shuffle;
41434
41435 // Try recursively combining arbitrary sequences of x86 shuffle
41436 // instructions into higher-order shuffles. We do this after combining
41437 // specific PSHUF instruction sequences into their minimal form so that we
41438 // can evaluate how many specialized shuffle instructions are involved in
41439 // a particular chain.
41440 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41441 return Res;
41442
41443 // Simplify source operands based on shuffle mask.
41444 // TODO - merge this into combineX86ShufflesRecursively.
41445 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41446 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41447 return SDValue(N, 0);
41448
41449 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41450 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41451 // Perform this after other shuffle combines to allow inner shuffles to be
41452 // combined away first.
41453 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41454 return BinOp;
41455 }
41456
41457 return SDValue();
41458}
41459
41460// Simplify variable target shuffle masks based on the demanded elements.
41461// TODO: Handle DemandedBits in mask indices as well?
41462 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41463 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41464 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41465 // If we're demanding all elements don't bother trying to simplify the mask.
41466 unsigned NumElts = DemandedElts.getBitWidth();
41467 if (DemandedElts.isAllOnes())
41468 return false;
41469
41470 SDValue Mask = Op.getOperand(MaskIndex);
41471 if (!Mask.hasOneUse())
41472 return false;
41473
41474 // Attempt to generically simplify the variable shuffle mask.
41475 APInt MaskUndef, MaskZero;
41476 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41477 Depth + 1))
41478 return true;
41479
41480 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41481 // TODO: Support other types from getTargetShuffleMaskIndices?
41482 SDValue BC = peekThroughOneUseBitcasts(Mask);
41483 EVT BCVT = BC.getValueType();
41484 auto *Load = dyn_cast<LoadSDNode>(BC);
41485 if (!Load || !Load->getBasePtr().hasOneUse())
41486 return false;
41487
41488 const Constant *C = getTargetConstantFromNode(Load);
41489 if (!C)
41490 return false;
41491
41492 Type *CTy = C->getType();
41493 if (!CTy->isVectorTy() ||
41494 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41495 return false;
41496
41497 // Handle scaling for i64 elements on 32-bit targets.
41498 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41499 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41500 return false;
41501 unsigned Scale = NumCstElts / NumElts;
41502
41503 // Simplify mask if we have an undemanded element that is not undef.
41504 bool Simplified = false;
41505 SmallVector<Constant *, 32> ConstVecOps;
41506 for (unsigned i = 0; i != NumCstElts; ++i) {
41507 Constant *Elt = C->getAggregateElement(i);
41508 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41509 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41510 Simplified = true;
41511 continue;
41512 }
41513 ConstVecOps.push_back(Elt);
41514 }
41515 if (!Simplified)
41516 return false;
41517
41518 // Generate new constant pool entry + legalize immediately for the load.
41519 SDLoc DL(Op);
41520 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41521 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41522 SDValue NewMask = TLO.DAG.getLoad(
41523 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41524 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41525 Load->getAlign());
41526 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41527}
41528
41529 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41530 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41531 TargetLoweringOpt &TLO, unsigned Depth) const {
41532 int NumElts = DemandedElts.getBitWidth();
41533 unsigned Opc = Op.getOpcode();
41534 EVT VT = Op.getValueType();
41535
41536 // Handle special case opcodes.
41537 switch (Opc) {
41538 case X86ISD::PMULDQ:
41539 case X86ISD::PMULUDQ: {
41540 APInt LHSUndef, LHSZero;
41541 APInt RHSUndef, RHSZero;
41542 SDValue LHS = Op.getOperand(0);
41543 SDValue RHS = Op.getOperand(1);
41544 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41545 Depth + 1))
41546 return true;
41547 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41548 Depth + 1))
41549 return true;
41550 // Multiply by zero.
41551 KnownZero = LHSZero | RHSZero;
41552 break;
41553 }
41554 case X86ISD::VPMADDWD: {
41555 APInt LHSUndef, LHSZero;
41556 APInt RHSUndef, RHSZero;
41557 SDValue LHS = Op.getOperand(0);
41558 SDValue RHS = Op.getOperand(1);
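// Each i32 result element of VPMADDWD is the sum of two adjacent i16 products,
// so every demanded result element demands the corresponding pair of source
// elements from both operands.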
41559 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41560
41561 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41562 Depth + 1))
41563 return true;
41564 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41565 Depth + 1))
41566 return true;
41567
41568 // TODO: Multiply by zero.
41569
41570 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41571 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41572 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41573 Depth + 1))
41574 return true;
41575 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41576 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41577 Depth + 1))
41578 return true;
41579 break;
41580 }
41581 case X86ISD::PSADBW: {
41582 SDValue LHS = Op.getOperand(0);
41583 SDValue RHS = Op.getOperand(1);
41584 assert(VT.getScalarType() == MVT::i64 &&
41585 LHS.getValueType() == RHS.getValueType() &&
41586 LHS.getValueType().getScalarType() == MVT::i8 &&
41587 "Unexpected PSADBW types");
41588
41589 // Aggressively peek through ops to get at the demanded elts.
41590 if (!DemandedElts.isAllOnes()) {
41591 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41592 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41593 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41594 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41595 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41596 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41597 if (NewLHS || NewRHS) {
41598 NewLHS = NewLHS ? NewLHS : LHS;
41599 NewRHS = NewRHS ? NewRHS : RHS;
41600 return TLO.CombineTo(
41601 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41602 }
41603 }
41604 break;
41605 }
41606 case X86ISD::VSHL:
41607 case X86ISD::VSRL:
41608 case X86ISD::VSRA: {
41609 // We only need the bottom 64-bits of the (128-bit) shift amount.
41610 SDValue Amt = Op.getOperand(1);
41611 MVT AmtVT = Amt.getSimpleValueType();
41612 assert(AmtVT.is128BitVector() && "Unexpected value type");
41613
41614 // If the shift amount is only ever used as an SSE vector shift amount then we
41615 // know that only the bottom 64-bits are ever used.
41616 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41617 unsigned UseOpc = Use->getOpcode();
41618 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41619 UseOpc == X86ISD::VSRA) &&
41620 Use->getOperand(0) != Amt;
41621 });
41622
41623 APInt AmtUndef, AmtZero;
41624 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41625 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41626 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41627 Depth + 1, AssumeSingleUse))
41628 return true;
41629 [[fallthrough]];
41630 }
41631 case X86ISD::VSHLI:
41632 case X86ISD::VSRLI:
41633 case X86ISD::VSRAI: {
41634 SDValue Src = Op.getOperand(0);
41635 APInt SrcUndef;
41636 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41637 Depth + 1))
41638 return true;
41639
41640 // Fold shift(0,x) -> 0
41641 if (DemandedElts.isSubsetOf(KnownZero))
41642 return TLO.CombineTo(
41643 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41644
41645 // Aggressively peek through ops to get at the demanded elts.
41646 if (!DemandedElts.isAllOnes())
41647 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41648 Src, DemandedElts, TLO.DAG, Depth + 1))
41649 return TLO.CombineTo(
41650 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41651 break;
41652 }
41653 case X86ISD::VPSHA:
41654 case X86ISD::VPSHL:
41655 case X86ISD::VSHLV:
41656 case X86ISD::VSRLV:
41657 case X86ISD::VSRAV: {
41658 APInt LHSUndef, LHSZero;
41659 APInt RHSUndef, RHSZero;
41660 SDValue LHS = Op.getOperand(0);
41661 SDValue RHS = Op.getOperand(1);
41662 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41663 Depth + 1))
41664 return true;
41665
41666 // Fold shift(0,x) -> 0
41667 if (DemandedElts.isSubsetOf(LHSZero))
41668 return TLO.CombineTo(
41669 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41670
41671 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41672 Depth + 1))
41673 return true;
41674
41675 KnownZero = LHSZero;
41676 break;
41677 }
41678 case X86ISD::PCMPEQ:
41679 case X86ISD::PCMPGT: {
41680 APInt LHSUndef, LHSZero;
41681 APInt RHSUndef, RHSZero;
41682 SDValue LHS = Op.getOperand(0);
41683 SDValue RHS = Op.getOperand(1);
41684 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41685 Depth + 1))
41686 return true;
41687 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41688 Depth + 1))
41689 return true;
41690 break;
41691 }
41692 case X86ISD::KSHIFTL: {
41693 SDValue Src = Op.getOperand(0);
41694 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41695 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41696 unsigned ShiftAmt = Amt->getZExtValue();
41697
41698 if (ShiftAmt == 0)
41699 return TLO.CombineTo(Op, Src);
41700
41701 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41702 // single shift. We can do this if the bottom bits (which are shifted
41703 // out) are never demanded.
41704 if (Src.getOpcode() == X86ISD::KSHIFTR) {
41705 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
41706 unsigned C1 = Src.getConstantOperandVal(1);
41707 unsigned NewOpc = X86ISD::KSHIFTL;
41708 int Diff = ShiftAmt - C1;
41709 if (Diff < 0) {
41710 Diff = -Diff;
41711 NewOpc = X86ISD::KSHIFTR;
41712 }
41713
41714 SDLoc dl(Op);
41715 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41716 return TLO.CombineTo(
41717 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41718 }
41719 }
41720
41721 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
41722 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41723 Depth + 1))
41724 return true;
41725
41726 KnownUndef <<= ShiftAmt;
41727 KnownZero <<= ShiftAmt;
41728 KnownZero.setLowBits(ShiftAmt);
41729 break;
41730 }
41731 case X86ISD::KSHIFTR: {
41732 SDValue Src = Op.getOperand(0);
41733 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41734 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41735 unsigned ShiftAmt = Amt->getZExtValue();
41736
41737 if (ShiftAmt == 0)
41738 return TLO.CombineTo(Op, Src);
41739
41740 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
41741 // single shift. We can do this if the top bits (which are shifted
41742 // out) are never demanded.
41743 if (Src.getOpcode() == X86ISD::KSHIFTL) {
41744 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
41745 unsigned C1 = Src.getConstantOperandVal(1);
41746 unsigned NewOpc = X86ISD::KSHIFTR;
41747 int Diff = ShiftAmt - C1;
41748 if (Diff < 0) {
41749 Diff = -Diff;
41750 NewOpc = X86ISD::KSHIFTL;
41751 }
41752
41753 SDLoc dl(Op);
41754 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41755 return TLO.CombineTo(
41756 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41757 }
41758 }
41759
41760 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41761 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41762 Depth + 1))
41763 return true;
41764
41765 KnownUndef.lshrInPlace(ShiftAmt);
41766 KnownZero.lshrInPlace(ShiftAmt);
41767 KnownZero.setHighBits(ShiftAmt);
41768 break;
41769 }
41770 case X86ISD::ANDNP: {
41771 // ANDNP = (~LHS & RHS);
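// If an operand is a constant build vector we can narrow what we demand from
// the other: LHS is only needed where RHS is known nonzero (a zero RHS element
// forces a zero result), and RHS is only needed where LHS is not known all-ones
// (an all-ones LHS element makes ~LHS, and thus the result, zero).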
41772 SDValue LHS = Op.getOperand(0);
41773 SDValue RHS = Op.getOperand(1);
41774
41775 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41776 APInt UndefElts;
41777 SmallVector<APInt> EltBits;
41778 int NumElts = VT.getVectorNumElements();
41779 int EltSizeInBits = VT.getScalarSizeInBits();
41780 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41781 APInt OpElts = DemandedElts;
41782 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41783 EltBits)) {
41784 OpBits.clearAllBits();
41785 OpElts.clearAllBits();
41786 for (int I = 0; I != NumElts; ++I) {
41787 if (!DemandedElts[I])
41788 continue;
41789 if (UndefElts[I]) {
41790 // We can't assume an undef src element gives an undef dst - the
41791 // other src might be zero.
41792 OpBits.setAllBits();
41793 OpElts.setBit(I);
41794 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41795 (!Invert && !EltBits[I].isZero())) {
41796 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41797 OpElts.setBit(I);
41798 }
41799 }
41800 }
41801 return std::make_pair(OpBits, OpElts);
41802 };
41803 APInt BitsLHS, EltsLHS;
41804 APInt BitsRHS, EltsRHS;
41805 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41806 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41807
41808 APInt LHSUndef, LHSZero;
41809 APInt RHSUndef, RHSZero;
41810 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41811 Depth + 1))
41812 return true;
41813 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41814 Depth + 1))
41815 return true;
41816
41817 if (!DemandedElts.isAllOnes()) {
41818 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41819 TLO.DAG, Depth + 1);
41820 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41821 TLO.DAG, Depth + 1);
41822 if (NewLHS || NewRHS) {
41823 NewLHS = NewLHS ? NewLHS : LHS;
41824 NewRHS = NewRHS ? NewRHS : RHS;
41825 return TLO.CombineTo(
41826 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41827 }
41828 }
41829 break;
41830 }
41831 case X86ISD::CVTSI2P:
41832 case X86ISD::CVTUI2P:
41833 case X86ISD::CVTPH2PS:
41834 case X86ISD::CVTPS2PH: {
41835 SDValue Src = Op.getOperand(0);
41836 MVT SrcVT = Src.getSimpleValueType();
41837 APInt SrcUndef, SrcZero;
41838 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41839 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41840 Depth + 1))
41841 return true;
41842 break;
41843 }
41844 case X86ISD::PACKSS:
41845 case X86ISD::PACKUS: {
41846 SDValue N0 = Op.getOperand(0);
41847 SDValue N1 = Op.getOperand(1);
41848
41849 APInt DemandedLHS, DemandedRHS;
41850 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41851
41852 APInt LHSUndef, LHSZero;
41853 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41854 Depth + 1))
41855 return true;
41856 APInt RHSUndef, RHSZero;
41857 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41858 Depth + 1))
41859 return true;
41860
41861 // TODO - pass on known zero/undef.
41862
41863 // Aggressively peek through ops to get at the demanded elts.
41864 // TODO - we should do this for all target/faux shuffles ops.
41865 if (!DemandedElts.isAllOnes()) {
41866 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41867 TLO.DAG, Depth + 1);
41868 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41869 TLO.DAG, Depth + 1);
41870 if (NewN0 || NewN1) {
41871 NewN0 = NewN0 ? NewN0 : N0;
41872 NewN1 = NewN1 ? NewN1 : N1;
41873 return TLO.CombineTo(Op,
41874 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41875 }
41876 }
41877 break;
41878 }
41879 case X86ISD::HADD:
41880 case X86ISD::HSUB:
41881 case X86ISD::FHADD:
41882 case X86ISD::FHSUB: {
41883 SDValue N0 = Op.getOperand(0);
41884 SDValue N1 = Op.getOperand(1);
41885
41886 APInt DemandedLHS, DemandedRHS;
41887 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41888
41889 APInt LHSUndef, LHSZero;
41890 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41891 Depth + 1))
41892 return true;
41893 APInt RHSUndef, RHSZero;
41894 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41895 Depth + 1))
41896 return true;
41897
41898 // TODO - pass on known zero/undef.
41899
41900 // Aggressively peek through ops to get at the demanded elts.
41901 // TODO: Handle repeated operands.
41902 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41903 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41904 TLO.DAG, Depth + 1);
41905 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41906 TLO.DAG, Depth + 1);
41907 if (NewN0 || NewN1) {
41908 NewN0 = NewN0 ? NewN0 : N0;
41909 NewN1 = NewN1 ? NewN1 : N1;
41910 return TLO.CombineTo(Op,
41911 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41912 }
41913 }
41914 break;
41915 }
41916 case X86ISD::VTRUNC:
41917 case X86ISD::VTRUNCS:
41918 case X86ISD::VTRUNCUS: {
41919 SDValue Src = Op.getOperand(0);
41920 MVT SrcVT = Src.getSimpleValueType();
41921 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41922 APInt SrcUndef, SrcZero;
41923 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41924 Depth + 1))
41925 return true;
41926 KnownZero = SrcZero.zextOrTrunc(NumElts);
41927 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41928 break;
41929 }
41930 case X86ISD::BLENDI: {
41931 SmallVector<int, 16> BlendMask;
41932 DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
41933 if (SDValue R = combineBlendOfPermutes(
41934 VT.getSimpleVT(), Op.getOperand(0), Op.getOperand(1), BlendMask,
41935 DemandedElts, TLO.DAG, Subtarget, SDLoc(Op)))
41936 return TLO.CombineTo(Op, R);
41937 break;
41938 }
41939 case X86ISD::BLENDV: {
41940 APInt SelUndef, SelZero;
41941 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41942 SelZero, TLO, Depth + 1))
41943 return true;
41944
41945 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41946 APInt LHSUndef, LHSZero;
41947 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41948 LHSZero, TLO, Depth + 1))
41949 return true;
41950
41951 APInt RHSUndef, RHSZero;
41952 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41953 RHSZero, TLO, Depth + 1))
41954 return true;
41955
41956 KnownZero = LHSZero & RHSZero;
41957 KnownUndef = LHSUndef & RHSUndef;
41958 break;
41959 }
41960 case X86ISD::VZEXT_MOVL: {
41961 // If upper demanded elements are already zero then we have nothing to do.
41962 SDValue Src = Op.getOperand(0);
41963 APInt DemandedUpperElts = DemandedElts;
41964 DemandedUpperElts.clearLowBits(1);
41965 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41966 return TLO.CombineTo(Op, Src);
41967 break;
41968 }
41969 case X86ISD::VZEXT_LOAD: {
41970 // If the upper elements are not demanded then simplify to a
41971 // scalar_to_vector(load()).
41972 MVT SVT = VT.getSimpleVT().getVectorElementType();
41973 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
41974 SDLoc DL(Op);
41975 auto *Mem = cast<MemSDNode>(Op);
41976 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
41977 Mem->getMemOperand());
41978 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
41979 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
41980 }
41981 break;
41982 }
41983 case X86ISD::VBROADCAST: {
41984 SDValue Src = Op.getOperand(0);
41985 MVT SrcVT = Src.getSimpleValueType();
41986 if (!SrcVT.isVector())
41987 break;
41988 // Don't bother broadcasting if we just need the 0'th element.
41989 if (DemandedElts == 1) {
41990 if (Src.getValueType() != VT)
41991 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41992 SDLoc(Op));
41993 return TLO.CombineTo(Op, Src);
41994 }
41995 APInt SrcUndef, SrcZero;
41996 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41997 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41998 Depth + 1))
41999 return true;
42000 // Aggressively peek through src to get at the demanded elt.
42001 // TODO - we should do this for all target/faux shuffles ops.
42002 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42003 Src, SrcElts, TLO.DAG, Depth + 1))
42004 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42005 break;
42006 }
42007 case X86ISD::VPERMV:
42008 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42009 Depth))
42010 return true;
42011 break;
42012 case X86ISD::PSHUFB:
42013 case X86ISD::VPERMV3:
42014 case X86ISD::VPERMILPV:
42015 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42016 Depth))
42017 return true;
42018 break;
42019 case X86ISD::VPPERM:
42020 case X86ISD::VPERMIL2:
42021 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42022 Depth))
42023 return true;
42024 break;
42025 }
42026
42027 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42028 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42029 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42030 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42031 DemandedElts.lshr(NumElts / 2) == 0) {
42032 unsigned SizeInBits = VT.getSizeInBits();
42033 unsigned ExtSizeInBits = SizeInBits / 2;
42034
42035 // See if 512-bit ops only use the bottom 128-bits.
42036 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42037 ExtSizeInBits = SizeInBits / 4;
42038
42039 switch (Opc) {
42040 // Scalar broadcast.
42041 case X86ISD::VBROADCAST: {
42042 SDLoc DL(Op);
42043 SDValue Src = Op.getOperand(0);
42044 if (Src.getValueSizeInBits() > ExtSizeInBits)
42045 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42046 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42047 ExtSizeInBits / VT.getScalarSizeInBits());
42048 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42049 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42050 TLO.DAG, DL, ExtSizeInBits));
42051 }
42052 case X86ISD::VBROADCAST_LOAD: {
42053 SDLoc DL(Op);
42054 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42055 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42056 ExtSizeInBits / VT.getScalarSizeInBits());
42057 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42058 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42059 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42060 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42061 MemIntr->getMemOperand());
42062 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42063 Bcst.getValue(1));
42064 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42065 TLO.DAG, DL, ExtSizeInBits));
42066 }
42067 // Subvector broadcast.
42068 case X86ISD::SUBV_BROADCAST_LOAD: {
42069 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42070 EVT MemVT = MemIntr->getMemoryVT();
42071 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42072 SDLoc DL(Op);
42073 SDValue Ld =
42074 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42075 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42076 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42077 Ld.getValue(1));
42078 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42079 TLO.DAG, DL, ExtSizeInBits));
42080 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42081 SDLoc DL(Op);
42082 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42083 ExtSizeInBits / VT.getScalarSizeInBits());
42084 if (SDValue BcstLd =
42085 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42086 return TLO.CombineTo(Op,
42087 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42088 TLO.DAG, DL, ExtSizeInBits));
42089 }
42090 break;
42091 }
42092 // Byte shifts by immediate.
42093 case X86ISD::VSHLDQ:
42094 case X86ISD::VSRLDQ:
42095 // Shift by uniform.
42096 case X86ISD::VSHL:
42097 case X86ISD::VSRL:
42098 case X86ISD::VSRA:
42099 // Shift by immediate.
42100 case X86ISD::VSHLI:
42101 case X86ISD::VSRLI:
42102 case X86ISD::VSRAI: {
42103 SDLoc DL(Op);
42104 SDValue Ext0 =
42105 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42106 SDValue ExtOp =
42107 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42108 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42109 SDValue Insert =
42110 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42111 return TLO.CombineTo(Op, Insert);
42112 }
42113 case X86ISD::VPERMI: {
42114 // Simplify PERMPD/PERMQ to extract_subvector.
42115 // TODO: This should be done in shuffle combining.
42116 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42117 SmallVector<int, 8> Mask;
42118 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42119 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42120 SDLoc DL(Op);
42121 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42122 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42123 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42124 return TLO.CombineTo(Op, Insert);
42125 }
42126 }
42127 break;
42128 }
42129 case X86ISD::VPERM2X128: {
42130 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
42131 SDLoc DL(Op);
42132 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42133 if (LoMask & 0x8)
42134 return TLO.CombineTo(
42135 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42136 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42137 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42138 SDValue ExtOp =
42139 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42140 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42141 SDValue Insert =
42142 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42143 return TLO.CombineTo(Op, Insert);
42144 }
42145 // Zero upper elements.
42146 case X86ISD::VZEXT_MOVL:
42147 // Target unary shuffles by immediate:
42148 case X86ISD::PSHUFD:
42149 case X86ISD::PSHUFLW:
42150 case X86ISD::PSHUFHW:
42151 case X86ISD::VPERMILPI:
42152 // (Non-Lane Crossing) Target Shuffles.
42153 case X86ISD::VPERMILPV:
42154 case X86ISD::VPERMIL2:
42155 case X86ISD::PSHUFB:
42156 case X86ISD::UNPCKL:
42157 case X86ISD::UNPCKH:
42158 case X86ISD::BLENDI:
42159 // Integer ops.
42160 case X86ISD::PACKSS:
42161 case X86ISD::PACKUS:
42162 case X86ISD::PCMPEQ:
42163 case X86ISD::PCMPGT:
42164 case X86ISD::PMULUDQ:
42165 case X86ISD::PMULDQ:
42166 case X86ISD::VSHLV:
42167 case X86ISD::VSRLV:
42168 case X86ISD::VSRAV:
42169 // Float ops.
42170 case X86ISD::FMAX:
42171 case X86ISD::FMIN:
42172 case X86ISD::FMAXC:
42173 case X86ISD::FMINC:
42174 // Horizontal Ops.
42175 case X86ISD::HADD:
42176 case X86ISD::HSUB:
42177 case X86ISD::FHADD:
42178 case X86ISD::FHSUB: {
42179 SDLoc DL(Op);
42180 SmallVector<SDValue, 4> Ops;
42181 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42182 SDValue SrcOp = Op.getOperand(i);
42183 EVT SrcVT = SrcOp.getValueType();
42184 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42185 "Unsupported vector size");
42186 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42187 ExtSizeInBits)
42188 : SrcOp);
42189 }
42190 MVT ExtVT = VT.getSimpleVT();
42191 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42192 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42193 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42194 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42195 SDValue Insert =
42196 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42197 return TLO.CombineTo(Op, Insert);
42198 }
42199 }
42200 }
42201
42202 // For splats, unless we *only* demand the 0'th element,
42203 // stop attempts at simplification here - we aren't going to improve things;
42204 // this is better than any potential shuffle.
42205 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42206 return false;
42207
42208 // Get target/faux shuffle mask.
42209 APInt OpUndef, OpZero;
42210 SmallVector<int, 64> OpMask;
42211 SmallVector<SDValue, 2> OpInputs;
42212 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42213 OpZero, TLO.DAG, Depth, false))
42214 return false;
42215
42216 // Shuffle inputs must be the same size as the result.
42217 if (OpMask.size() != (unsigned)NumElts ||
42218 llvm::any_of(OpInputs, [VT](SDValue V) {
42219 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42220 !V.getValueType().isVector();
42221 }))
42222 return false;
42223
42224 KnownZero = OpZero;
42225 KnownUndef = OpUndef;
42226
42227 // Check if shuffle mask can be simplified to undef/zero/identity.
42228 int NumSrcs = OpInputs.size();
42229 for (int i = 0; i != NumElts; ++i)
42230 if (!DemandedElts[i])
42231 OpMask[i] = SM_SentinelUndef;
42232
42233 if (isUndefInRange(OpMask, 0, NumElts)) {
42234 KnownUndef.setAllBits();
42235 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42236 }
42237 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42238 KnownZero.setAllBits();
42239 return TLO.CombineTo(
42240 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42241 }
42242 for (int Src = 0; Src != NumSrcs; ++Src)
42243 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42244 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42245
42246 // Attempt to simplify inputs.
42247 for (int Src = 0; Src != NumSrcs; ++Src) {
42248 // TODO: Support inputs of different types.
42249 if (OpInputs[Src].getValueType() != VT)
42250 continue;
42251
42252 int Lo = Src * NumElts;
42253 APInt SrcElts = APInt::getZero(NumElts);
42254 for (int i = 0; i != NumElts; ++i)
42255 if (DemandedElts[i]) {
42256 int M = OpMask[i] - Lo;
42257 if (0 <= M && M < NumElts)
42258 SrcElts.setBit(M);
42259 }
42260
42261 // TODO - Propagate input undef/zero elts.
42262 APInt SrcUndef, SrcZero;
42263 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42264 TLO, Depth + 1))
42265 return true;
42266 }
42267
42268 // If we don't demand all elements, then attempt to combine to a simpler
42269 // shuffle.
42270 // We need to convert the depth to something combineX86ShufflesRecursively
42271 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42272 // to match. This prevents combineX86ShuffleChain from returning a
42273 // combined shuffle that's the same as the original root, causing an
42274 // infinite loop.
42275 if (!DemandedElts.isAllOnes()) {
42276 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42277
42278 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42279 for (int i = 0; i != NumElts; ++i)
42280 if (DemandedElts[i])
42281 DemandedMask[i] = i;
42282
42283 SDValue NewShuffle = combineX86ShufflesRecursively(
42284 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42285 /*HasVarMask*/ false,
42286 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42287 Subtarget);
42288 if (NewShuffle)
42289 return TLO.CombineTo(Op, NewShuffle);
42290 }
42291
42292 return false;
42293}
42294
42295 bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42296 SDValue Op, const APInt &OriginalDemandedBits,
42297 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42298 unsigned Depth) const {
42299 EVT VT = Op.getValueType();
42300 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42301 unsigned Opc = Op.getOpcode();
42302 switch(Opc) {
42303 case X86ISD::VTRUNC: {
42304 KnownBits KnownOp;
42305 SDValue Src = Op.getOperand(0);
42306 MVT SrcVT = Src.getSimpleValueType();
42307
42308 // Simplify the input, using demanded bit information.
42309 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42310 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42311 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42312 return true;
42313 break;
42314 }
42315 case X86ISD::PMULDQ:
42316 case X86ISD::PMULUDQ: {
42317 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42318 KnownBits KnownLHS, KnownRHS;
42319 SDValue LHS = Op.getOperand(0);
42320 SDValue RHS = Op.getOperand(1);
42321
42322 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42323 // FIXME: Can we bound this better?
42324 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42325 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42326 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42327
42328 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42329 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42330 DemandedMaskLHS = DemandedMask;
42331 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42332 DemandedMaskRHS = DemandedMask;
42333
42334 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42335 KnownLHS, TLO, Depth + 1))
42336 return true;
42337 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42338 KnownRHS, TLO, Depth + 1))
42339 return true;
42340
42341 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
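// (Each 64-bit lane multiplies the zero-extended low 32 bits of both
// operands, so a multiplier whose low 32 bits are known to be 1 leaves the
// zero-extended low half of X as the lane result.)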
42342 KnownRHS = KnownRHS.trunc(32);
42343 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42344 KnownRHS.getConstant().isOne()) {
42345 SDLoc DL(Op);
42346 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42347 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42348 }
42349
42350 // Aggressively peek through ops to get at the demanded low bits.
42351 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42352 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42353 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42354 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42355 if (DemandedLHS || DemandedRHS) {
42356 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42357 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42358 return TLO.CombineTo(
42359 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42360 }
42361 break;
42362 }
42363 case X86ISD::ANDNP: {
42364 KnownBits Known2;
42365 SDValue Op0 = Op.getOperand(0);
42366 SDValue Op1 = Op.getOperand(1);
42367
42368 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42369 Known, TLO, Depth + 1))
42370 return true;
42371 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42372
42373 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42374 OriginalDemandedElts, Known2, TLO, Depth + 1))
42375 return true;
42376 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
42377
42378 // If the RHS is a constant, see if we can simplify it.
42379 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42380 OriginalDemandedElts, TLO))
42381 return true;
42382
42383 // ANDNP = (~Op0 & Op1);
42384 Known.One &= Known2.Zero;
42385 Known.Zero |= Known2.One;
42386 break;
42387 }
42388 case X86ISD::VSHLI: {
42389 SDValue Op0 = Op.getOperand(0);
42390
42391 unsigned ShAmt = Op.getConstantOperandVal(1);
42392 if (ShAmt >= BitWidth)
42393 break;
42394
42395 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42396
42397 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42398 // single shift. We can do this if the bottom bits (which are shifted
42399 // out) are never demanded.
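// Illustrative example (assumed amounts): for ((X >>u 3) << 5) with the low
// 5 result bits undemanded, Diff = 2 and the pair folds to (X << 2).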
42400 if (Op0.getOpcode() == X86ISD::VSRLI &&
42401 OriginalDemandedBits.countr_zero() >= ShAmt) {
42402 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42403 if (Shift2Amt < BitWidth) {
42404 int Diff = ShAmt - Shift2Amt;
42405 if (Diff == 0)
42406 return TLO.CombineTo(Op, Op0.getOperand(0));
42407
42408 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42409 SDValue NewShift = TLO.DAG.getNode(
42410 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42411 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42412 return TLO.CombineTo(Op, NewShift);
42413 }
42414 }
42415
42416 // If we are only demanding sign bits then we can use the shift source directly.
42417 unsigned NumSignBits =
42418 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42419 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42420 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42421 return TLO.CombineTo(Op, Op0);
42422
42423 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42424 TLO, Depth + 1))
42425 return true;
42426
42427 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42428 Known.Zero <<= ShAmt;
42429 Known.One <<= ShAmt;
42430
42431 // Low bits known zero.
42432 Known.Zero.setLowBits(ShAmt);
42433 return false;
42434 }
42435 case X86ISD::VSRLI: {
42436 unsigned ShAmt = Op.getConstantOperandVal(1);
42437 if (ShAmt >= BitWidth)
42438 break;
42439
42440 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42441
42442 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42443 OriginalDemandedElts, Known, TLO, Depth + 1))
42444 return true;
42445
42446 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42447 Known.Zero.lshrInPlace(ShAmt);
42448 Known.One.lshrInPlace(ShAmt);
42449
42450 // High bits known zero.
42451 Known.Zero.setHighBits(ShAmt);
42452 return false;
42453 }
42454 case X86ISD::VSRAI: {
42455 SDValue Op0 = Op.getOperand(0);
42456 SDValue Op1 = Op.getOperand(1);
42457
42458 unsigned ShAmt = Op1->getAsZExtVal();
42459 if (ShAmt >= BitWidth)
42460 break;
42461
42462 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42463
42464 // If we just want the sign bit then we don't need to shift it.
42465 if (OriginalDemandedBits.isSignMask())
42466 return TLO.CombineTo(Op, Op0);
42467
42468 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
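// Illustrative example (assumed types): for v4i32 X produced by sign
// extension from i8, NumSignBits(X) >= 25, so a shl/sra pair by 24
// round-trips X unchanged.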
42469 if (Op0.getOpcode() == X86ISD::VSHLI &&
42470 Op.getOperand(1) == Op0.getOperand(1)) {
42471 SDValue Op00 = Op0.getOperand(0);
42472 unsigned NumSignBits =
42473 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42474 if (ShAmt < NumSignBits)
42475 return TLO.CombineTo(Op, Op00);
42476 }
42477
42478 // If any of the demanded bits are produced by the sign extension, we also
42479 // demand the input sign bit.
42480 if (OriginalDemandedBits.countl_zero() < ShAmt)
42481 DemandedMask.setSignBit();
42482
42483 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42484 TLO, Depth + 1))
42485 return true;
42486
42487 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42488 Known.Zero.lshrInPlace(ShAmt);
42489 Known.One.lshrInPlace(ShAmt);
42490
42491 // If the input sign bit is known to be zero, or if none of the top bits
42492 // are demanded, turn this into an unsigned shift right.
42493 if (Known.Zero[BitWidth - ShAmt - 1] ||
42494 OriginalDemandedBits.countl_zero() >= ShAmt)
42495 return TLO.CombineTo(
42496 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42497
42498 // High bits are known one.
42499 if (Known.One[BitWidth - ShAmt - 1])
42500 Known.One.setHighBits(ShAmt);
42501 return false;
42502 }
42503 case X86ISD::BLENDV: {
42504 SDValue Sel = Op.getOperand(0);
42505 SDValue LHS = Op.getOperand(1);
42506 SDValue RHS = Op.getOperand(2);
42507
42508 APInt SignMask = APInt::getSignMask(BitWidth);
42509 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42510 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42511 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42512 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42513 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42514 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42515
42516 if (NewSel || NewLHS || NewRHS) {
42517 NewSel = NewSel ? NewSel : Sel;
42518 NewLHS = NewLHS ? NewLHS : LHS;
42519 NewRHS = NewRHS ? NewRHS : RHS;
42520 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42521 NewSel, NewLHS, NewRHS));
42522 }
42523 break;
42524 }
42525 case X86ISD::PEXTRB:
42526 case X86ISD::PEXTRW: {
42527 SDValue Vec = Op.getOperand(0);
42528 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42529 MVT VecVT = Vec.getSimpleValueType();
42530 unsigned NumVecElts = VecVT.getVectorNumElements();
42531
42532 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42533 unsigned Idx = CIdx->getZExtValue();
42534 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42535
42536 // If we demand no bits from the vector then we must have demanded
42537 // bits from the implicit zext - simplify to zero.
42538 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42539 if (DemandedVecBits == 0)
42540 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42541
42542 APInt KnownUndef, KnownZero;
42543 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42544 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42545 KnownZero, TLO, Depth + 1))
42546 return true;
42547
42548 KnownBits KnownVec;
42549 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42550 KnownVec, TLO, Depth + 1))
42551 return true;
42552
42553 if (SDValue V = SimplifyMultipleUseDemandedBits(
42554 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42555 return TLO.CombineTo(
42556 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42557
42558 Known = KnownVec.zext(BitWidth);
42559 return false;
42560 }
42561 break;
42562 }
42563 case X86ISD::PINSRB:
42564 case X86ISD::PINSRW: {
42565 SDValue Vec = Op.getOperand(0);
42566 SDValue Scl = Op.getOperand(1);
42567 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42568 MVT VecVT = Vec.getSimpleValueType();
42569
42570 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42571 unsigned Idx = CIdx->getZExtValue();
42572 if (!OriginalDemandedElts[Idx])
42573 return TLO.CombineTo(Op, Vec);
42574
42575 KnownBits KnownVec;
42576 APInt DemandedVecElts(OriginalDemandedElts);
42577 DemandedVecElts.clearBit(Idx);
42578 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42579 KnownVec, TLO, Depth + 1))
42580 return true;
42581
42582 KnownBits KnownScl;
42583 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42584 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42585 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42586 return true;
42587
42588 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42589 Known = KnownVec.intersectWith(KnownScl);
42590 return false;
42591 }
42592 break;
42593 }
42594 case X86ISD::PACKSS:
42595 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42596 // sign bit then we can just ask for the source operand's sign bit.
42597 // TODO - add known bits handling.
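// (Signed saturation clamps to the MIN/MAX of the narrower type, which
// preserves the sign of the wider source element, so only the source sign
// bits need to be demanded here.)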
42598 if (OriginalDemandedBits.isSignMask()) {
42599 APInt DemandedLHS, DemandedRHS;
42600 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42601
42602 KnownBits KnownLHS, KnownRHS;
42603 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42604 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42605 KnownLHS, TLO, Depth + 1))
42606 return true;
42607 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42608 KnownRHS, TLO, Depth + 1))
42609 return true;
42610
42611 // Attempt to avoid multi-use ops if we don't need anything from them.
42612 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42613 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42614 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42615 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42616 if (DemandedOp0 || DemandedOp1) {
42617 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42618 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42619 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42620 }
42621 }
42622 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42623 break;
42624 case X86ISD::VBROADCAST: {
42625 SDValue Src = Op.getOperand(0);
42626 MVT SrcVT = Src.getSimpleValueType();
42627 APInt DemandedElts = APInt::getOneBitSet(
42628 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42629 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42630 TLO, Depth + 1))
42631 return true;
42632 // If we don't need the upper bits, attempt to narrow the broadcast source.
42633 // Don't attempt this on AVX512 as it might affect broadcast folding.
42634 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
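// Illustrative example (assumed types): broadcasting an i64 X as v4i64 with
// only the low 32 bits of each lane demanded can instead broadcast
// (trunc X to i32) as v8i32 and bitcast back to v4i64.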
42635 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42636 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42637 Src->hasOneUse()) {
42638 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42639 SDValue NewSrc =
42640 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42641 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42642 SDValue NewBcst =
42643 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42644 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42645 }
42646 break;
42647 }
42648 case X86ISD::PCMPGT:
42649 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42650 // iff we only need the sign bit then we can use R directly.
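// (Lanes where R is negative become all-ones and the rest become zero, so
// the result's sign bit always equals R's sign bit.)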
42651 if (OriginalDemandedBits.isSignMask() &&
42652 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42653 return TLO.CombineTo(Op, Op.getOperand(1));
42654 break;
42655 case X86ISD::MOVMSK: {
42656 SDValue Src = Op.getOperand(0);
42657 MVT SrcVT = Src.getSimpleValueType();
42658 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42659 unsigned NumElts = SrcVT.getVectorNumElements();
42660
42661 // If we don't need the sign bits at all just return zero.
42662 if (OriginalDemandedBits.countr_zero() >= NumElts)
42663 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42664
42665 // See if we only demand bits from the lower 128-bit vector.
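// Illustrative example (assumed type): a v32i8 MOVMSK sets 32 result bits;
// if only bits 0..15 are demanded, the upper 128-bit half of the source is
// irrelevant and a 128-bit MOVMSK suffices.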
42666 if (SrcVT.is256BitVector() &&
42667 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42668 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42669 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42670 }
42671
42672 // Only demand the vector elements of the sign bits we need.
42673 APInt KnownUndef, KnownZero;
42674 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
42675 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
42676 TLO, Depth + 1))
42677 return true;
42678
42679 Known.Zero = KnownZero.zext(BitWidth);
42680 Known.Zero.setHighBits(BitWidth - NumElts);
42681
42682 // MOVMSK only uses the MSB from each vector element.
42683 KnownBits KnownSrc;
42684 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
42685 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
42686 Depth + 1))
42687 return true;
42688
42689 if (KnownSrc.One[SrcBits - 1])
42690 Known.One.setLowBits(NumElts);
42691 else if (KnownSrc.Zero[SrcBits - 1])
42692 Known.Zero.setLowBits(NumElts);
42693
42694 // Attempt to avoid multi-use ops if we don't need anything from them.
42695 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
42696 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
42697 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42698 return false;
42699 }
42700 case X86ISD::TESTP: {
42701 SDValue Op0 = Op.getOperand(0);
42702 SDValue Op1 = Op.getOperand(1);
42703 MVT OpVT = Op0.getSimpleValueType();
42704 assert((OpVT.getVectorElementType() == MVT::f32 ||
42705 OpVT.getVectorElementType() == MVT::f64) &&
42706 "Illegal vector type for X86ISD::TESTP");
42707
42708 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
42709 KnownBits KnownSrc;
42710 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
42711 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
42712 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
42713 AssumeSingleUse) ||
42714 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
42715 AssumeSingleUse);
42716 }
42717 case X86ISD::BEXTR:
42718 case X86ISD::BEXTRI: {
42719 SDValue Op0 = Op.getOperand(0);
42720 SDValue Op1 = Op.getOperand(1);
42721
42722 // Only bottom 16-bits of the control bits are required.
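// (Control layout: bits [7:0] = start position, bits [15:8] = length. For
// example, an assumed control of 0x0804 extracts 8 bits starting at bit 4.)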
42723 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
42724 // NOTE: SimplifyDemandedBits won't do this for constants.
42725 uint64_t Val1 = Cst1->getZExtValue();
42726 uint64_t MaskedVal1 = Val1 & 0xFFFF;
42727 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
42728 SDLoc DL(Op);
42729 return TLO.CombineTo(
42730 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
42731 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
42732 }
42733
42734 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
42735 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
42736
42737 // If the length is 0, the result is 0.
42738 if (Length == 0) {
42739 Known.setAllZero();
42740 return false;
42741 }
42742
42743 if ((Shift + Length) <= BitWidth) {
42744 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
42745 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
42746 return true;
42747
42748 Known = Known.extractBits(Length, Shift);
42749 Known = Known.zextOrTrunc(BitWidth);
42750 return false;
42751 }
42752 } else {
42753 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
42754 KnownBits Known1;
42755 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
42756 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
42757 return true;
42758
42759 // If the length is 0, replace with 0.
42760 KnownBits LengthBits = Known1.extractBits(8, 8);
42761 if (LengthBits.isZero())
42762 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42763 }
42764
42765 break;
42766 }
42767 case X86ISD::PDEP: {
42768 SDValue Op0 = Op.getOperand(0);
42769 SDValue Op1 = Op.getOperand(1);
42770
42771 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
42772 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
42773
42774 // If the demanded bits have leading zeroes, we don't demand those from the
42775 // mask.
42776 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
42777 return true;
42778
42779 // The number of possible 1s in the mask determines the number of LSBs of
42780 // operand 0 used. Undemanded bits from the mask don't matter so filter
42781 // them before counting.
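// Illustrative example (assumed mask): if the mask can have at most three
// set bits (e.g. 0b101100), only the three lowest bits of the source
// operand can reach the result.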
42782 KnownBits Known2;
42783 uint64_t Count = (~Known.Zero & LoMask).popcount();
42784 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
42785 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
42786 return true;
42787
42788 // Zeroes are retained from the mask, but not ones.
42789 Known.One.clearAllBits();
42790 // The result will have at least as many trailing zeros as the non-mask
42791 // operand since bits can only map to the same or higher bit position.
42792 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
42793 return false;
42794 }
42795 }
42796
42797 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42798 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42799}
42800
42801 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42802 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42803 SelectionDAG &DAG, unsigned Depth) const {
42804 int NumElts = DemandedElts.getBitWidth();
42805 unsigned Opc = Op.getOpcode();
42806 EVT VT = Op.getValueType();
42807
42808 switch (Opc) {
42809 case X86ISD::PINSRB:
42810 case X86ISD::PINSRW: {
42811 // If we don't demand the inserted element, return the base vector.
42812 SDValue Vec = Op.getOperand(0);
42813 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42814 MVT VecVT = Vec.getSimpleValueType();
42815 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42816 !DemandedElts[CIdx->getZExtValue()])
42817 return Vec;
42818 break;
42819 }
42820 case X86ISD::VSHLI: {
42821 // If we are only demanding sign bits then we can use the shift source
42822 // directly.
42823 SDValue Op0 = Op.getOperand(0);
42824 unsigned ShAmt = Op.getConstantOperandVal(1);
42825 unsigned BitWidth = DemandedBits.getBitWidth();
42826 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42827 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42828 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42829 return Op0;
42830 break;
42831 }
42832 case X86ISD::VSRAI:
42833 // iff we only need the sign bit then we can use the source directly.
42834 // TODO: generalize where we only demand extended signbits.
42835 if (DemandedBits.isSignMask())
42836 return Op.getOperand(0);
42837 break;
42838 case X86ISD::PCMPGT:
42839 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42840 // iff we only need the sign bit then we can use R directly.
42841 if (DemandedBits.isSignMask() &&
42842 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42843 return Op.getOperand(1);
42844 break;
42845 case X86ISD::BLENDV: {
42846 // BLENDV: Cond (MSB) ? LHS : RHS
42847 SDValue Cond = Op.getOperand(0);
42848 SDValue LHS = Op.getOperand(1);
42849 SDValue RHS = Op.getOperand(2);
42850
42851 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42852 if (CondKnown.isNegative())
42853 return LHS;
42854 if (CondKnown.isNonNegative())
42855 return RHS;
42856 break;
42857 }
42858 case X86ISD::ANDNP: {
42859 // ANDNP = (~LHS & RHS);
42860 SDValue LHS = Op.getOperand(0);
42861 SDValue RHS = Op.getOperand(1);
42862
42863 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42864 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42865
42866 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
42867 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
42868 // this context, so return RHS.
42869 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42870 return RHS;
42871 break;
42872 }
42873 }
42874
42875 APInt ShuffleUndef, ShuffleZero;
42876 SmallVector<int, 16> ShuffleMask;
42877 SmallVector<SDValue, 2> ShuffleOps;
42878 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42879 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42880 // If all the demanded elts are from one operand and are inline,
42881 // then we can use the operand directly.
42882 int NumOps = ShuffleOps.size();
42883 if (ShuffleMask.size() == (unsigned)NumElts &&
42884 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42885 return VT.getSizeInBits() == V.getValueSizeInBits();
42886 })) {
42887
42888 if (DemandedElts.isSubsetOf(ShuffleUndef))
42889 return DAG.getUNDEF(VT);
42890 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42891 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42892
42893 // Bitmask that indicates which ops have only been accessed 'inline'.
42894 APInt IdentityOp = APInt::getAllOnes(NumOps);
42895 for (int i = 0; i != NumElts; ++i) {
42896 int M = ShuffleMask[i];
42897 if (!DemandedElts[i] || ShuffleUndef[i])
42898 continue;
42899 int OpIdx = M / NumElts;
42900 int EltIdx = M % NumElts;
42901 if (M < 0 || EltIdx != i) {
42902 IdentityOp.clearAllBits();
42903 break;
42904 }
42905 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42906 if (IdentityOp == 0)
42907 break;
42908 }
42909 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42910 "Multiple identity shuffles detected");
42911
42912 if (IdentityOp != 0)
42913 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42914 }
42915 }
42916
42917 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42918 Op, DemandedBits, DemandedElts, DAG, Depth);
42919}
42920
42921 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42922 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42923 bool PoisonOnly, unsigned Depth) const {
42924 unsigned NumElts = DemandedElts.getBitWidth();
42925
42926 // TODO: Add more target shuffles.
42927 switch (Op.getOpcode()) {
42928 case X86ISD::PSHUFD:
42929 case X86ISD::VPERMILPI: {
42930 SmallVector<int, 8> Mask;
42931 SmallVector<SDValue, 2> Ops;
42932 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
42933 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
42934 APInt::getZero(NumElts));
42935 for (auto M : enumerate(Mask)) {
42936 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
42937 continue;
42938 if (M.value() == SM_SentinelUndef)
42939 return false;
42940 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
42941 "Shuffle mask index out of range");
42942 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
42943 }
42944 for (auto Op : enumerate(Ops))
42945 if (!DemandedSrcElts[Op.index()].isZero() &&
42946 !DAG.isGuaranteedNotToBeUndefOrPoison(
42947 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
42948 return false;
42949 return true;
42950 }
42951 break;
42952 }
42953 }
42954 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42955 Op, DemandedElts, DAG, PoisonOnly, Depth);
42956}
42957
42958 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42959 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42960 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42961
42962 // TODO: Add more target shuffles.
42963 switch (Op.getOpcode()) {
42964 case X86ISD::PSHUFD:
42965 case X86ISD::VPERMILPI:
42966 case X86ISD::UNPCKH:
42967 case X86ISD::UNPCKL:
42968 return false;
42969 }
42970 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42971 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42972}
42973
42974 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42975 const APInt &DemandedElts,
42976 APInt &UndefElts,
42977 const SelectionDAG &DAG,
42978 unsigned Depth) const {
42979 unsigned NumElts = DemandedElts.getBitWidth();
42980 unsigned Opc = Op.getOpcode();
42981
42982 switch (Opc) {
42983 case X86ISD::VBROADCAST:
42984 case X86ISD::VBROADCAST_LOAD:
42985 UndefElts = APInt::getZero(NumElts);
42986 return true;
42987 }
42988
42989 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42990 DAG, Depth);
42991}
42992
42993// Helper to peek through bitops/trunc/setcc to determine size of source vector.
42994// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42995static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42996 bool AllowTruncate) {
42997 switch (Src.getOpcode()) {
42998 case ISD::TRUNCATE:
42999 if (!AllowTruncate)
43000 return false;
43001 [[fallthrough]];
43002 case ISD::SETCC:
43003 return Src.getOperand(0).getValueSizeInBits() == Size;
43004 case ISD::FREEZE:
43005 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
43006 case ISD::AND:
43007 case ISD::XOR:
43008 case ISD::OR:
43009 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43010 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43011 case ISD::SELECT:
43012 case ISD::VSELECT:
43013 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43014 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43015 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43016 case ISD::BUILD_VECTOR:
43017 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
43018 ISD::isBuildVectorAllOnes(Src.getNode());
43019 }
43020 return false;
43021}
43022
43023// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43024static unsigned getAltBitOpcode(unsigned Opcode) {
43025 switch(Opcode) {
43026 // clang-format off
43027 case ISD::AND: return X86ISD::FAND;
43028 case ISD::OR: return X86ISD::FOR;
43029 case ISD::XOR: return X86ISD::FXOR;
43030 case X86ISD::ANDNP: return X86ISD::FANDN;
43031 // clang-format on
43032 }
43033 llvm_unreachable("Unknown bitwise opcode");
43034}
43035
43036// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43037 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43038 const SDLoc &DL) {
43039 EVT SrcVT = Src.getValueType();
43040 if (SrcVT != MVT::v4i1)
43041 return SDValue();
43042
43043 switch (Src.getOpcode()) {
43044 case ISD::SETCC:
43045 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43046 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43047 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43048 SDValue Op0 = Src.getOperand(0);
43049 if (ISD::isNormalLoad(Op0.getNode()))
43050 return DAG.getBitcast(MVT::v4f32, Op0);
43051 if (Op0.getOpcode() == ISD::BITCAST &&
43052 Op0.getOperand(0).getValueType() == MVT::v4f32)
43053 return Op0.getOperand(0);
43054 }
43055 break;
43056 case ISD::AND:
43057 case ISD::XOR:
43058 case ISD::OR: {
43059 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43060 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43061 if (Op0 && Op1)
43062 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43063 Op1);
43064 break;
43065 }
43066 }
43067 return SDValue();
43068}
43069
43070// Helper to push sign extension of vXi1 SETCC result through bitops.
43071 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43072 SDValue Src, const SDLoc &DL) {
43073 switch (Src.getOpcode()) {
43074 case ISD::SETCC:
43075 case ISD::FREEZE:
43076 case ISD::TRUNCATE:
43077 case ISD::BUILD_VECTOR:
43078 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43079 case ISD::AND:
43080 case ISD::XOR:
43081 case ISD::OR:
43082 return DAG.getNode(
43083 Src.getOpcode(), DL, SExtVT,
43084 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43085 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43086 case ISD::SELECT:
43087 case ISD::VSELECT:
43088 return DAG.getSelect(
43089 DL, SExtVT, Src.getOperand(0),
43090 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43091 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43092 }
43093 llvm_unreachable("Unexpected node type for vXi1 sign extension");
43094}
43095
43096// Try to match patterns such as
43097// (i16 bitcast (v16i1 x))
43098// ->
43099// (i16 movmsk (16i8 sext (v16i1 x)))
43100// before the illegal vector is scalarized on subtargets that don't have legal
43101// vxi1 types.
43102 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43103 const SDLoc &DL,
43104 const X86Subtarget &Subtarget) {
43105 EVT SrcVT = Src.getValueType();
43106 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43107 return SDValue();
43108
43109 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43110 // legalization destroys the v4i32 type.
43111 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43112 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43113 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43114 DAG.getBitcast(MVT::v4f32, V));
43115 return DAG.getZExtOrTrunc(V, DL, VT);
43116 }
43117 }
43118
43119 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
43120 // movmskb even with avx512. This will be better than truncating to vXi1 and
43121 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43122 // vpcmpeqb/vpcmpgtb.
43123 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43124 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43125 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43126 Src.getOperand(0).getValueType() == MVT::v64i8);
43127
43128 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43129 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43130 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43131 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43132 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43133 EVT CmpVT = Src.getOperand(0).getValueType();
43134 EVT EltVT = CmpVT.getVectorElementType();
43135 if (CmpVT.getSizeInBits() <= 256 &&
43136 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43137 PreferMovMsk = true;
43138 }
43139
43140 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43141 // MOVMSK is supported in SSE2 or later.
43142 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43143 return SDValue();
43144
43145 // If the upper ops of a concatenation are undef, then try to bitcast the
43146 // lower op and extend.
43147 SmallVector<SDValue, 4> SubSrcOps;
43148 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
43149 SubSrcOps.size() >= 2) {
43150 SDValue LowerOp = SubSrcOps[0];
43151 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
43152 if (LowerOp.getOpcode() == ISD::SETCC &&
43153 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
43154 EVT SubVT = VT.getIntegerVT(
43155 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
43156 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
43157 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
43158 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
43159 }
43160 }
43161 }
43162
43163 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
43164 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43165 // v8i16 and v16i16.
43166 // For these two cases, we can shuffle the upper element bytes to a
43167 // consecutive sequence at the start of the vector and treat the results as
43168 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43169 // for v16i16 this is not the case, because the shuffle is expensive, so we
43170 // avoid sign-extending to this type entirely.
43171 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43172 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43173 MVT SExtVT;
43174 bool PropagateSExt = false;
43175 switch (SrcVT.getSimpleVT().SimpleTy) {
43176 default:
43177 return SDValue();
43178 case MVT::v2i1:
43179 SExtVT = MVT::v2i64;
43180 break;
43181 case MVT::v4i1:
43182 SExtVT = MVT::v4i32;
43183 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43184 // sign-extend to a 256-bit operation to avoid truncation.
43185 if (Subtarget.hasAVX() &&
43186 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43187 SExtVT = MVT::v4i64;
43188 PropagateSExt = true;
43189 }
43190 break;
43191 case MVT::v8i1:
43192 SExtVT = MVT::v8i16;
43193 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43194 // sign-extend to a 256-bit operation to match the compare.
43195 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43196 // 256-bit because the shuffle is cheaper than sign extending the result of
43197 // the compare.
43198 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43199 checkBitcastSrcVectorSize(Src, 512, true))) {
43200 SExtVT = MVT::v8i32;
43201 PropagateSExt = true;
43202 }
43203 break;
43204 case MVT::v16i1:
43205 SExtVT = MVT::v16i8;
43206 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43207 // it is not profitable to sign-extend to 256-bit because this will
43208 // require an extra cross-lane shuffle which is more expensive than
43209 // truncating the result of the compare to 128-bits.
43210 break;
43211 case MVT::v32i1:
43212 SExtVT = MVT::v32i8;
43213 break;
43214 case MVT::v64i1:
43215 // If we have AVX512F, but not AVX512BW, and the input is truncated from
43216 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
43217 if (Subtarget.hasAVX512()) {
43218 if (Subtarget.hasBWI())
43219 return SDValue();
43220 SExtVT = MVT::v64i8;
43221 break;
43222 }
43223 // Split if this is a <64 x i8> comparison result.
43224 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43225 SExtVT = MVT::v64i8;
43226 break;
43227 }
43228 return SDValue();
43229 };
43230
43231 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43232 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43233
43234 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43235 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43236 } else {
43237 if (SExtVT == MVT::v8i16) {
43238 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43239 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43240 }
43241 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43242 }
43243
43244 EVT IntVT =
43245 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43246 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43247 return DAG.getBitcast(VT, V);
43248}
43249
43250// Convert a vXi1 constant build vector to the same width scalar integer.
43251 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43252 EVT SrcVT = Op.getValueType();
43253 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43254 "Expected a vXi1 vector");
43255 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43256 "Expected a constant build vector");
43257
43258 APInt Imm(SrcVT.getVectorNumElements(), 0);
43259 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43260 SDValue In = Op.getOperand(Idx);
43261 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43262 Imm.setBit(Idx);
43263 }
43264 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43265 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43266}
43267
43268 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43269 TargetLowering::DAGCombinerInfo &DCI,
43270 const X86Subtarget &Subtarget) {
43271 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43272
43273 if (!DCI.isBeforeLegalizeOps())
43274 return SDValue();
43275
43276 // Only do this if we have k-registers.
43277 if (!Subtarget.hasAVX512())
43278 return SDValue();
43279
43280 EVT DstVT = N->getValueType(0);
43281 SDValue Op = N->getOperand(0);
43282 EVT SrcVT = Op.getValueType();
43283
43284 if (!Op.hasOneUse())
43285 return SDValue();
43286
43287 // Look for logic ops.
43288 if (Op.getOpcode() != ISD::AND &&
43289 Op.getOpcode() != ISD::OR &&
43290 Op.getOpcode() != ISD::XOR)
43291 return SDValue();
43292
43293 // Make sure we have a bitcast between mask registers and a scalar type.
43294 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43295 DstVT.isScalarInteger()) &&
43296 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43297 SrcVT.isScalarInteger()))
43298 return SDValue();
43299
43300 SDValue LHS = Op.getOperand(0);
43301 SDValue RHS = Op.getOperand(1);
43302
43303 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43304 LHS.getOperand(0).getValueType() == DstVT)
43305 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43306 DAG.getBitcast(DstVT, RHS));
43307
43308 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43309 RHS.getOperand(0).getValueType() == DstVT)
43310 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43311 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43312
43313 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43314 // Most of these have to move a constant from the scalar domain anyway.
43315 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43316 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43317 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43318 DAG.getBitcast(DstVT, LHS), RHS);
43319 }
43320
43321 return SDValue();
43322}
43323
43324 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43325 const X86Subtarget &Subtarget) {
43326 SDLoc DL(BV);
43327 unsigned NumElts = BV->getNumOperands();
43328 SDValue Splat = BV->getSplatValue();
43329
43330 // Build MMX element from integer GPR or SSE float values.
43331 auto CreateMMXElement = [&](SDValue V) {
43332 if (V.isUndef())
43333 return DAG.getUNDEF(MVT::x86mmx);
43334 if (V.getValueType().isFloatingPoint()) {
43335 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43336 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43337 V = DAG.getBitcast(MVT::v2i64, V);
43338 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43339 }
43340 V = DAG.getBitcast(MVT::i32, V);
43341 } else {
43342 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43343 }
43344 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43345 };
43346
43347 // Convert build vector ops to MMX data in the bottom elements.
43348 SmallVector<SDValue, 8> Ops;
43349
43350 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43351
43352 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43353 if (Splat) {
43354 if (Splat.isUndef())
43355 return DAG.getUNDEF(MVT::x86mmx);
43356
43357 Splat = CreateMMXElement(Splat);
43358
43359 if (Subtarget.hasSSE1()) {
43360 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43361 if (NumElts == 8)
43362 Splat = DAG.getNode(
43363 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43364 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43365 TLI.getPointerTy(DAG.getDataLayout())),
43366 Splat, Splat);
43367
43368 // Use PSHUFW to repeat 16-bit elements.
43369 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
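// (ShufMask 0x44 = 0b01'00'01'00 selects words {0,1,0,1}, i.e. a 32-bit
// element splat; ShufMask 0 replicates word 0, which for v8i8 already holds
// the byte pair produced by the PUNPCKLBW above.)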
43370 return DAG.getNode(
43371 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43372 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43373 TLI.getPointerTy(DAG.getDataLayout())),
43374 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43375 }
43376 Ops.append(NumElts, Splat);
43377 } else {
43378 for (unsigned i = 0; i != NumElts; ++i)
43379 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43380 }
43381
43382 // Use tree of PUNPCKLs to build up general MMX vector.
43383 while (Ops.size() > 1) {
43384 unsigned NumOps = Ops.size();
43385 unsigned IntrinOp =
43386 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43387 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43388 : Intrinsic::x86_mmx_punpcklbw));
43389 SDValue Intrin = DAG.getTargetConstant(
43390 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43391 for (unsigned i = 0; i != NumOps; i += 2)
43392 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43393 Ops[i], Ops[i + 1]);
43394 Ops.resize(NumOps / 2);
43395 }
43396
43397 return Ops[0];
43398}
43399
43400// Recursive function that attempts to find if a bool vector node was originally
43401// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43402// integer. If so, replace the scalar ops with bool vector equivalents back down
43403// the chain.
43404 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43405 SelectionDAG &DAG,
43406 const X86Subtarget &Subtarget) {
43407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43408 unsigned Opc = V.getOpcode();
43409 switch (Opc) {
43410 case ISD::BITCAST: {
43411 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43412 SDValue Src = V.getOperand(0);
43413 EVT SrcVT = Src.getValueType();
43414 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43415 return DAG.getBitcast(VT, Src);
43416 break;
43417 }
43418 case ISD::TRUNCATE: {
43419 // If we find a suitable source, a truncated scalar becomes a subvector.
43420 SDValue Src = V.getOperand(0);
43421 EVT NewSrcVT =
43422 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43423 if (TLI.isTypeLegal(NewSrcVT))
43424 if (SDValue N0 =
43425 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43426 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43427 DAG.getIntPtrConstant(0, DL));
43428 break;
43429 }
43430 case ISD::ANY_EXTEND:
43431 case ISD::ZERO_EXTEND: {
43432 // If we find a suitable source, an extended scalar becomes a subvector.
43433 SDValue Src = V.getOperand(0);
43434 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43435 Src.getScalarValueSizeInBits());
43436 if (TLI.isTypeLegal(NewSrcVT))
43437 if (SDValue N0 =
43438 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43439 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43440 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43441 : DAG.getConstant(0, DL, VT),
43442 N0, DAG.getIntPtrConstant(0, DL));
43443 break;
43444 }
43445 case ISD::OR: {
43446 // If we find suitable sources, we can just move an OR to the vector domain.
43447 SDValue Src0 = V.getOperand(0);
43448 SDValue Src1 = V.getOperand(1);
43449 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43450 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43451 return DAG.getNode(Opc, DL, VT, N0, N1);
43452 break;
43453 }
43454 case ISD::SHL: {
43455 // If we find a suitable source, a SHL becomes a KSHIFTL.
43456 SDValue Src0 = V.getOperand(0);
43457 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43458 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43459 break;
43460
43461 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43462 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43463 return DAG.getNode(
43464 X86ISD::KSHIFTL, DL, VT, N0,
43465 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43466 break;
43467 }
43468 }
43469 return SDValue();
43470}
43471
43472 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43473 TargetLowering::DAGCombinerInfo &DCI,
43474 const X86Subtarget &Subtarget) {
43475 SDValue N0 = N->getOperand(0);
43476 EVT VT = N->getValueType(0);
43477 EVT SrcVT = N0.getValueType();
43478 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43479
43480 // Try to match patterns such as
43481 // (i16 bitcast (v16i1 x))
43482 // ->
43483 // (i16 movmsk (16i8 sext (v16i1 x)))
43484 // before the setcc result is scalarized on subtargets that don't have legal
43485 // vxi1 types.
43486 if (DCI.isBeforeLegalize()) {
43487 SDLoc dl(N);
43488 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43489 return V;
43490
43491 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43492 // type, widen both sides to avoid a trip through memory.
43493 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43494 Subtarget.hasAVX512()) {
43495 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43496 N0 = DAG.getBitcast(MVT::v8i1, N0);
43497 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43498 DAG.getIntPtrConstant(0, dl));
43499 }
43500
43501 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43502 // type, widen both sides to avoid a trip through memory.
43503 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43504 Subtarget.hasAVX512()) {
43505 // Use zeros for the widening if we already have some zeroes. This can
43506 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43507 // stream of this.
43508 // FIXME: It might make sense to detect a concat_vectors with a mix of
43509 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43510 // a separate combine. What we can't do is canonicalize the operands of
43511 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43512 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43513 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43514 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43515 SrcVT = LastOp.getValueType();
43516 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43517 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43518 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43519 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43520 N0 = DAG.getBitcast(MVT::i8, N0);
43521 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43522 }
43523 }
43524
43525 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43526 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43527 Ops[0] = N0;
43528 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43529 N0 = DAG.getBitcast(MVT::i8, N0);
43530 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43531 }
43532 } else {
43533 // If we're bitcasting from iX to vXi1, see if the integer originally
43534 // began as a vXi1 and whether we can remove the bitcast entirely.
43535 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43536 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43537 if (SDValue V =
43538 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43539 return V;
43540 }
43541 }
43542
43543 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43544 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43545 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43546 // we can help with known bits propagation from the vXi1 domain to the
43547 // scalar domain.
43548 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43549 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43550 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43551 isNullConstant(N0.getOperand(1)))
43552 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43553 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43554
43555 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43556 // and the vbroadcast_load are both integer or both fp. In some cases this
43557 // will remove the bitcast entirely.
43558 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43559 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43560 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43561 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43562 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43563 // Don't swap i8/i16 since we don't have fp types of that size.
43564 if (MemSize >= 32) {
43565 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43566 : MVT::getIntegerVT(MemSize);
43567 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43568 : MVT::getIntegerVT(SrcVTSize);
43569 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43570
43571 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43572 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43573 SDValue ResNode =
43574 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43575 MemVT, BCast->getMemOperand());
43576 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43577 return DAG.getBitcast(VT, ResNode);
43578 }
43579 }
43580
43581 // Since MMX types are special and don't usually play with other vector types,
43582 // it's better to handle them early to be sure we emit efficient code by
43583 // avoiding store-load conversions.
43584 if (VT == MVT::x86mmx) {
43585 // Detect MMX constant vectors.
43586 APInt UndefElts;
43587 SmallVector<APInt, 1> EltBits;
43588 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43589 /*AllowWholeUndefs*/ true,
43590 /*AllowPartialUndefs*/ true)) {
43591 SDLoc DL(N0);
43592 // Handle zero-extension of i32 with MOVD.
43593 if (EltBits[0].countl_zero() >= 32)
43594 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43595 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43596 // Else, bitcast to a double.
43597 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43598 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43599 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43600 }
43601
43602 // Detect bitcasts to x86mmx low word.
43603 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43604 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43605 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43606 bool LowUndef = true, AllUndefOrZero = true;
43607 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43608 SDValue Op = N0.getOperand(i);
43609 LowUndef &= Op.isUndef() || (i >= e/2);
43610 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43611 }
43612 if (AllUndefOrZero) {
43613 SDValue N00 = N0.getOperand(0);
43614 SDLoc dl(N00);
43615 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43616 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43617 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43618 }
43619 }
43620
43621 // Detect bitcasts of 64-bit build vectors and convert to a
43622 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43623 // lowest element.
43624 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43625 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43626 SrcVT == MVT::v8i8))
43627 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43628
43629 // Detect bitcasts between element or subvector extraction to x86mmx.
43630 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43631 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43632 isNullConstant(N0.getOperand(1))) {
43633 SDValue N00 = N0.getOperand(0);
43634 if (N00.getValueType().is128BitVector())
43635 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43636 DAG.getBitcast(MVT::v2i64, N00));
43637 }
43638
43639 // Detect bitcasts from FP_TO_SINT to x86mmx.
43640 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43641 SDLoc DL(N0);
43642 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43643 DAG.getUNDEF(MVT::v2i32));
43644 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43645 DAG.getBitcast(MVT::v2i64, Res));
43646 }
43647 }
43648
43649 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43650 // most of these to scalar anyway.
43651 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43652 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43653 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43654 return combinevXi1ConstantToInteger(N0, DAG);
43655 }
43656
43657 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43658 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43659 isa<ConstantSDNode>(N0)) {
43660 auto *C = cast<ConstantSDNode>(N0);
43661 if (C->isAllOnes())
43662 return DAG.getConstant(1, SDLoc(N0), VT);
43663 if (C->isZero())
43664 return DAG.getConstant(0, SDLoc(N0), VT);
43665 }
43666
43667 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43668 // Turn it into a sign bit compare that produces a k-register. This avoids
43669 // a trip through a GPR.
43670 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43671 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43672 isPowerOf2_32(VT.getVectorNumElements())) {
43673 unsigned NumElts = VT.getVectorNumElements();
43674 SDValue Src = N0;
43675
43676 // Peek through truncate.
43677 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43678 Src = N0.getOperand(0);
43679
43680 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43681 SDValue MovmskIn = Src.getOperand(0);
43682 MVT MovmskVT = MovmskIn.getSimpleValueType();
43683 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43684
43685 // We allow extra bits of the movmsk to be used since they are known zero.
43686 // We can't convert a VPMOVMSKB without avx512bw.
43687 if (MovMskElts <= NumElts &&
43688 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43689 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43690 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43691 SDLoc dl(N);
43692 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43693 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43694 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43695 if (EVT(CmpVT) == VT)
43696 return Cmp;
43697
43698 // Pad with zeroes up to original VT to replace the zeroes that were
43699 // being used from the MOVMSK.
43700 unsigned NumConcats = NumElts / MovMskElts;
43701 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43702 Ops[0] = Cmp;
43703 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43704 }
43705 }
43706 }
43707
43708 // Try to remove bitcasts from input and output of mask arithmetic to
43709 // remove GPR<->K-register crossings.
43710 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43711 return V;
43712
43713 // Convert a bitcasted integer logic operation that has one bitcasted
43714 // floating-point operand into a floating-point logic operation. This may
43715 // create a load of a constant, but that is cheaper than materializing the
43716 // constant in an integer register and transferring it to an SSE register or
43717 // transferring the SSE operand to integer register and back.
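// For example (types are illustrative only):
//   (f32 (bitcast (and (i32 (bitcast (f32 X))), (i32 Y))))
//     --> (X86ISD::FAND (f32 X), (f32 (bitcast (i32 Y))))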
43718 unsigned FPOpcode;
43719 switch (N0.getOpcode()) {
43720 // clang-format off
43721 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43722 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43723 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43724 default: return SDValue();
43725 // clang-format on
43726 }
43727
43728 // Check if we have a bitcast from another integer type as well.
43729 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43730 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43731 (Subtarget.hasFP16() && VT == MVT::f16) ||
43732 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43733 TLI.isTypeLegal(VT))))
43734 return SDValue();
43735
43736 SDValue LogicOp0 = N0.getOperand(0);
43737 SDValue LogicOp1 = N0.getOperand(1);
43738 SDLoc DL0(N0);
43739
43740 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43741 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43742 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43743 LogicOp0.getOperand(0).getValueType() == VT &&
43744 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43745 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43746 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43747 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
43748 }
43749 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
43750 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
43751 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
43752 LogicOp1.getOperand(0).getValueType() == VT &&
43753 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
43754 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
43755 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43756 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
43757 }
43758
43759 return SDValue();
43760}
43761
43762 // (mul (zext a), (sext b))
43763static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
43764 SDValue &Op1) {
43765 Op0 = Mul.getOperand(0);
43766 Op1 = Mul.getOperand(1);
43767
43768 // Canonicalize so that the sign-extended operand ends up in Op1.
43769 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
43770 std::swap(Op0, Op1);
43771
43772 auto IsFreeTruncation = [](SDValue &Op) -> bool {
43773 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
43774 Op.getOpcode() == ISD::SIGN_EXTEND) &&
43775 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
43776 return true;
43777
43778 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
43779 return (BV && BV->isConstant());
43780 };
43781
43782 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
43783 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
43784 // signed value, so we just check its number of significant bits.
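// For example (illustrative), accept:
//   (mul (zext (v16i8 A) to v16i32), (sext (v16i8 B) to v16i32))
// i.e. Op0 must fit in 8 unsigned bits and Op1 in 8 signed bits.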
43785 if ((IsFreeTruncation(Op0) &&
43786 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
43787 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
43788 return true;
43789
43790 return false;
43791}
43792
43793 // Given an ABS node, detect the following pattern:
43794// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
43795// This is useful as it is the input into a SAD pattern.
43796static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
43797 SDValue AbsOp1 = Abs->getOperand(0);
43798 if (AbsOp1.getOpcode() != ISD::SUB)
43799 return false;
43800
43801 Op0 = AbsOp1.getOperand(0);
43802 Op1 = AbsOp1.getOperand(1);
43803
43804 // Check if the operands of the sub are zero-extended from vectors of i8.
43805 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
43806 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
43807 Op1.getOpcode() != ISD::ZERO_EXTEND ||
43808 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
43809 return false;
43810
43811 return true;
43812}
43813
43814 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
43815 unsigned &LogBias, const SDLoc &DL,
43816 const X86Subtarget &Subtarget) {
43817 // Extend or truncate to MVT::i8 first.
43818 MVT Vi8VT =
43819 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43820 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43821 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43822
43823 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
43824 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
43825 // The src A, B element type is i8, but the dst C element type is i32.
43826 // The reduction stage count is computed from the vXi8 source type, so a
43827 // log-bias of 2 is used to skip the two stages the dot product already does.
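// For example (illustrative), a 16-element reduction needs log2(16) = 4
// shuffle+add stages, but VPDPBUSD already sums each group of 4 byte products
// into one i32 lane, covering log2(4) = 2 of those stages - hence the bias.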
43828 LogBias = 2;
43829
43830 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43831 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43832 RegSize = std::max(512u, RegSize);
43833
43834 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43835 // fill in the missing vector elements with 0.
43836 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43837 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43838 Ops[0] = LHS;
43839 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43840 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43841 Ops[0] = RHS;
43842 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43843
43844 // Actually build the DotProduct, split as 256/512 bits for
43845 // AVXVNNI/AVX512VNNI.
43846 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43847 ArrayRef<SDValue> Ops) {
43848 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43849 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43850 };
43851 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43852 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43853
43854 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43855 DpBuilder, false);
43856}
43857
43858// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43859// to these zexts.
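// PSADBW sums the absolute differences of each group of 8 bytes into the low
// 16 bits of the corresponding i64 lane, so (illustratively)
//   (v2i64 (X86ISD::PSADBW (v16i8 A), (v16i8 B)))
// holds sum|A[0..7]-B[0..7]| in lane 0 and sum|A[8..15]-B[8..15]| in lane 1.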
43860static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43861 const SDValue &Zext1, const SDLoc &DL,
43862 const X86Subtarget &Subtarget) {
43863 // Find the appropriate width for the PSADBW.
43864 EVT InVT = Zext0.getOperand(0).getValueType();
43865 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43866
43867 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43868 // fill in the missing vector elements with 0.
43869 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43870 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43871 Ops[0] = Zext0.getOperand(0);
43872 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43873 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43874 Ops[0] = Zext1.getOperand(0);
43875 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43876
43877 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43878 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43879 ArrayRef<SDValue> Ops) {
43880 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43881 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43882 };
43883 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43884 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43885 PSADBWBuilder);
43886}
43887
43888 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43889// PHMINPOSUW.
43890 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43891 const X86Subtarget &Subtarget) {
43892 // Bail without SSE41.
43893 if (!Subtarget.hasSSE41())
43894 return SDValue();
43895
43896 EVT ExtractVT = Extract->getValueType(0);
43897 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43898 return SDValue();
43899
43900 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43901 ISD::NodeType BinOp;
43902 SDValue Src = DAG.matchBinOpReduction(
43903 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43904 if (!Src)
43905 return SDValue();
43906
43907 EVT SrcVT = Src.getValueType();
43908 EVT SrcSVT = SrcVT.getScalarType();
43909 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43910 return SDValue();
43911
43912 SDLoc DL(Extract);
43913 SDValue MinPos = Src;
43914
43915 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43916 while (SrcVT.getSizeInBits() > 128) {
43917 SDValue Lo, Hi;
43918 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43919 SrcVT = Lo.getValueType();
43920 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43921 }
43922 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43923 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43924 "Unexpected value type");
43925
43926 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
43927 // to flip the value accordingly.
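// For example (illustrative), for SMAX:
//   smax(X) == xor(umin(xor(X, 0x7FFF)), 0x7FFF)
// since XOR with 0x7FFF reverses signed order into unsigned order
// (0x7FFF -> 0x0000, 0x8000 -> 0xFFFF); UMAX uses an all-ones mask (a NOT).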
43928 SDValue Mask;
43929 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43930 if (BinOp == ISD::SMAX)
43931 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43932 else if (BinOp == ISD::SMIN)
43933 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43934 else if (BinOp == ISD::UMAX)
43935 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43936
43937 if (Mask)
43938 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43939
43940 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43941 // shuffling each upper element down and inserting zeros. This means that the
43942 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
43943 // ready for the PHMINPOS.
43944 if (ExtractVT == MVT::i8) {
43945 SDValue Upper = DAG.getVectorShuffle(
43946 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43947 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43948 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43949 }
43950
43951 // Perform the PHMINPOS on a v8i16 vector.
43952 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43953 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43954 MinPos = DAG.getBitcast(SrcVT, MinPos);
43955
43956 if (Mask)
43957 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43958
43959 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43960 DAG.getIntPtrConstant(0, DL));
43961}
43962
43963// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
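// For example (illustrative), a v4f32 all_of compare reduction becomes
//   (movmskps (cmpps A, B)) == 0xF
// while any_of becomes MOVMSK != 0 and parity becomes PARITY(MOVMSK).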
43964 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43965 const X86Subtarget &Subtarget) {
43966 // Bail without SSE2.
43967 if (!Subtarget.hasSSE2())
43968 return SDValue();
43969
43970 EVT ExtractVT = Extract->getValueType(0);
43971 unsigned BitWidth = ExtractVT.getSizeInBits();
43972 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43973 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43974 return SDValue();
43975
43976 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43977 ISD::NodeType BinOp;
43978 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43979 if (!Match && ExtractVT == MVT::i1)
43980 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43981 if (!Match)
43982 return SDValue();
43983
43984 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
43985 // which we can't support here for now.
43986 if (Match.getScalarValueSizeInBits() != BitWidth)
43987 return SDValue();
43988
43989 SDValue Movmsk;
43990 SDLoc DL(Extract);
43991 EVT MatchVT = Match.getValueType();
43992 unsigned NumElts = MatchVT.getVectorNumElements();
43993 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43995 LLVMContext &Ctx = *DAG.getContext();
43996
43997 if (ExtractVT == MVT::i1) {
43998 // Special case for (pre-legalization) vXi1 reductions.
43999 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44000 return SDValue();
44001 if (Match.getOpcode() == ISD::SETCC) {
44002 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
44003 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
44004 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
44005 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
44006 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
44007 X86::CondCode X86CC;
44008 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
44009 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
44010 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
44011 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
44012 DAG, X86CC))
44013 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
44014 getSETCC(X86CC, V, DL, DAG));
44015 }
44016 }
44017 if (TLI.isTypeLegal(MatchVT)) {
44018 // If this is a legal AVX512 predicate type then we can just bitcast.
44019 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44020 Movmsk = DAG.getBitcast(MovmskVT, Match);
44021 } else {
44022 // Use combineBitcastvxi1 to create the MOVMSK.
44023 while (NumElts > MaxElts) {
44024 SDValue Lo, Hi;
44025 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44026 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44027 NumElts /= 2;
44028 }
44029 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
44030 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44031 }
44032 if (!Movmsk)
44033 return SDValue();
44034 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44035 } else {
44036 // FIXME: Better handling of k-registers or 512-bit vectors?
44037 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44038 if (!(MatchSizeInBits == 128 ||
44039 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44040 return SDValue();
44041
44042 // Make sure this isn't a vector of 1 element. The perf win from using
44043 // MOVMSK diminishes with fewer elements in the reduction, but it is
44044 // generally better to get the comparison over to the GPRs as soon as
44045 // possible to reduce the number of vector ops.
44046 if (Match.getValueType().getVectorNumElements() < 2)
44047 return SDValue();
44048
44049 // Check that we are extracting a reduction of all sign bits.
44050 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44051 return SDValue();
44052
44053 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44054 SDValue Lo, Hi;
44055 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44056 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44057 MatchSizeInBits = Match.getValueSizeInBits();
44058 }
44059
44060 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44061 MVT MaskSrcVT;
44062 if (64 == BitWidth || 32 == BitWidth)
44063 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44064 MatchSizeInBits / BitWidth);
44065 else
44066 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44067
44068 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44069 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44070 NumElts = MaskSrcVT.getVectorNumElements();
44071 }
44072 assert((NumElts <= 32 || NumElts == 64) &&
44073 "Not expecting more than 64 elements");
44074
44075 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44076 if (BinOp == ISD::XOR) {
44077 // parity -> (PARITY(MOVMSK X))
44078 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44079 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44080 }
44081
44082 SDValue CmpC;
44083 ISD::CondCode CondCode;
44084 if (BinOp == ISD::OR) {
44085 // any_of -> MOVMSK != 0
44086 CmpC = DAG.getConstant(0, DL, CmpVT);
44087 CondCode = ISD::CondCode::SETNE;
44088 } else {
44089 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44090 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44091 DL, CmpVT);
44092 CondCode = ISD::CondCode::SETEQ;
44093 }
44094
44095 // The setcc produces an i8 of 0/1, so extend that to the result width and
44096 // negate to get the final 0/-1 mask value.
44097 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
44098 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44099 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44100 return DAG.getNegative(Zext, DL, ExtractVT);
44101}
44102
44103 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44104 const X86Subtarget &Subtarget) {
44105 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44106 return SDValue();
44107
44108 EVT ExtractVT = Extract->getValueType(0);
44109 // Verify the type we're extracting is i32, as the output element type of
44110 // vpdpbusd is i32.
44111 if (ExtractVT != MVT::i32)
44112 return SDValue();
44113
44114 EVT VT = Extract->getOperand(0).getValueType();
44115 if (!isPowerOf2_32(VT.getVectorNumElements()))
44116 return SDValue();
44117
44118 // Match shuffle + add pyramid.
44119 ISD::NodeType BinOp;
44120 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44121
44122 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44123 // done by vpdpbusd computes a signed 16-bit product that will be sign extended
44124 // before adding into the accumulator.
44125 // TODO:
44126 // We also need to verify that the multiply has at least 2x the number of bits
44127 // of the input. We shouldn't match
44128 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
44129 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44130 // Root = Root.getOperand(0);
44131
44132 // If there was a match, we want Root to be a mul.
44133 if (!Root || Root.getOpcode() != ISD::MUL)
44134 return SDValue();
44135
44136 // Check whether we have an extend and mul pattern
44137 SDValue LHS, RHS;
44138 if (!detectExtMul(DAG, Root, LHS, RHS))
44139 return SDValue();
44140
44141 // Create the dot product instruction.
44142 SDLoc DL(Extract);
44143 unsigned StageBias;
44144 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44145
44146 // If the original vector was wider than 4 elements, sum over the results
44147 // in the DP vector.
44148 unsigned Stages = Log2_32(VT.getVectorNumElements());
44149 EVT DpVT = DP.getValueType();
44150
44151 if (Stages > StageBias) {
44152 unsigned DpElems = DpVT.getVectorNumElements();
44153
44154 for (unsigned i = Stages - StageBias; i > 0; --i) {
44155 SmallVector<int, 16> Mask(DpElems, -1);
44156 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44157 Mask[j] = MaskEnd + j;
44158
44159 SDValue Shuffle =
44160 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44161 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44162 }
44163 }
44164
44165 // Return the lowest ExtractSizeInBits bits.
44166 EVT ResVT =
44167 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44168 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44169 DP = DAG.getBitcast(ResVT, DP);
44170 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44171 Extract->getOperand(1));
44172}
44173
44174 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44175 const X86Subtarget &Subtarget) {
44176 // PSADBW is only supported on SSE2 and up.
44177 if (!Subtarget.hasSSE2())
44178 return SDValue();
44179
44180 EVT ExtractVT = Extract->getValueType(0);
44181 // Verify the type we're extracting is either i32 or i64.
44182 // FIXME: Could support other types, but this is what we have coverage for.
44183 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44184 return SDValue();
44185
44186 EVT VT = Extract->getOperand(0).getValueType();
44187 if (!isPowerOf2_32(VT.getVectorNumElements()))
44188 return SDValue();
44189
44190 // Match shuffle + add pyramid.
44191 ISD::NodeType BinOp;
44192 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44193
44194 // The operand is expected to be zero extended from i8
44195 // (verified in detectZextAbsDiff).
44196 // In order to convert to i64 and above, an additional any/zero/sign
44197 // extend is expected.
44198 // The zero extend from 32 bits has no mathematical effect on the result.
44199 // The sign extend is also effectively a zero extend
44200 // (it extends the sign bit, which is zero).
44201 // So it is correct to skip the sign/zero extend instruction.
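// For example, each PSADBW lane sums at most 8 * 255 = 2040, so the upper
// bits (including the sign bit) of the partial result are already zero and
// a later sext/zext to i64 cannot change the value.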
44202 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44203 Root.getOpcode() == ISD::ZERO_EXTEND ||
44204 Root.getOpcode() == ISD::ANY_EXTEND))
44205 Root = Root.getOperand(0);
44206
44207 // If there was a match, we want Root to be a select that is the root of an
44208 // abs-diff pattern.
44209 if (!Root || Root.getOpcode() != ISD::ABS)
44210 return SDValue();
44211
44212 // Check whether we have an abs-diff pattern feeding into the select.
44213 SDValue Zext0, Zext1;
44214 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44215 return SDValue();
44216
44217 // Create the SAD instruction.
44218 SDLoc DL(Extract);
44219 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44220
44221 // If the original vector was wider than 8 elements, sum over the results
44222 // in the SAD vector.
44223 unsigned Stages = Log2_32(VT.getVectorNumElements());
44224 EVT SadVT = SAD.getValueType();
44225 if (Stages > 3) {
44226 unsigned SadElems = SadVT.getVectorNumElements();
44227
44228 for(unsigned i = Stages - 3; i > 0; --i) {
44229 SmallVector<int, 16> Mask(SadElems, -1);
44230 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44231 Mask[j] = MaskEnd + j;
44232
44233 SDValue Shuffle =
44234 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44235 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44236 }
44237 }
44238
44239 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44240 // Return the lowest ExtractSizeInBits bits.
44241 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44242 SadVT.getSizeInBits() / ExtractSizeInBits);
44243 SAD = DAG.getBitcast(ResVT, SAD);
44244 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44245 Extract->getOperand(1));
44246}
44247
44248// If this extract is from a loaded vector value and will be used as an
44249// integer, that requires a potentially expensive XMM -> GPR transfer.
44250// Additionally, if we can convert to a scalar integer load, that will likely
44251// be folded into a subsequent integer op.
44252// Note: SrcVec might not have a VecVT type, but it must be the same size.
44253// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44254// to a single-use of the loaded vector. For the reasons above, we
44255// expect this to be profitable even if it creates an extra load.
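// For example (illustrative):
//   (i32 (extract_vector_elt (load <4 x i32>, %p), 2))
//     --> (i32 (load %p + 8)), reusing the chain of the original vector load.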
44256static SDValue
44257 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44258 const SDLoc &dl, SelectionDAG &DAG,
44259 TargetLowering::DAGCombinerInfo &DCI) {
44260 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44261 "Only EXTRACT_VECTOR_ELT supported so far");
44262
44263 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44264 EVT VT = N->getValueType(0);
44265
44266 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44267 return Use->getOpcode() == ISD::STORE ||
44268 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44269 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44270 });
44271
44272 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44273 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44274 VecVT.getVectorElementType() == VT &&
44275 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44276 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44277 SDValue NewPtr = TLI.getVectorElementPointer(
44278 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44279 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44280 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44281 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44282 SDValue Load =
44283 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44284 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44285 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44286 return Load;
44287 }
44288
44289 return SDValue();
44290}
44291
44292// Attempt to peek through a target shuffle and extract the scalar from the
44293// source.
44294 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44295 TargetLowering::DAGCombinerInfo &DCI,
44296 const X86Subtarget &Subtarget) {
44297 if (DCI.isBeforeLegalizeOps())
44298 return SDValue();
44299
44300 SDLoc dl(N);
44301 SDValue Src = N->getOperand(0);
44302 SDValue Idx = N->getOperand(1);
44303
44304 EVT VT = N->getValueType(0);
44305 EVT SrcVT = Src.getValueType();
44306 EVT SrcSVT = SrcVT.getVectorElementType();
44307 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44308 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44309
44310 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44311 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44312 return SDValue();
44313
44314 const APInt &IdxC = N->getConstantOperandAPInt(1);
44315 if (IdxC.uge(NumSrcElts))
44316 return SDValue();
44317
44318 SDValue SrcBC = peekThroughBitcasts(Src);
44319
44320 // Handle extract(bitcast(broadcast(scalar_value))).
44321 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44322 SDValue SrcOp = SrcBC.getOperand(0);
44323 EVT SrcOpVT = SrcOp.getValueType();
44324 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44325 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44326 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44327 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44328 // TODO support non-zero offsets.
44329 if (Offset == 0) {
44330 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44331 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44332 return SrcOp;
44333 }
44334 }
44335 }
44336
44337 // If we're extracting a single element from a broadcast load and there are
44338 // no other users, just create a single load.
44339 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44340 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44341 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44342 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44343 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44344 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44345 MemIntr->getBasePtr(),
44346 MemIntr->getPointerInfo(),
44347 MemIntr->getOriginalAlign(),
44348 MemIntr->getMemOperand()->getFlags());
44349 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44350 return Load;
44351 }
44352 }
44353
44354 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44355 // TODO: Move to DAGCombine?
44356 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44357 SrcBC.getValueType().isInteger() &&
44358 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44359 SrcBC.getScalarValueSizeInBits() ==
44360 SrcBC.getOperand(0).getValueSizeInBits()) {
44361 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44362 if (IdxC.ult(Scale)) {
44363 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44364 SDValue Scl = SrcBC.getOperand(0);
44365 EVT SclVT = Scl.getValueType();
44366 if (Offset) {
44367 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44368 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44369 }
44370 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44371 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44372 return Scl;
44373 }
44374 }
44375
44376 // Handle extract(truncate(x)) for 0'th index.
44377 // TODO: Treat this as a faux shuffle?
44378 // TODO: When can we use this for general indices?
44379 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44380 (SrcVT.getSizeInBits() % 128) == 0) {
44381 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44382 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44383 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44384 Idx);
44385 }
44386
44387 // We can only legally extract other elements from 128-bit vectors and in
44388 // certain circumstances, depending on SSE-level.
44389 // TODO: Investigate float/double extraction if it will be just stored.
44390 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44391 unsigned Idx) {
44392 EVT VecSVT = VecVT.getScalarType();
44393 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44394 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44395 VecSVT == MVT::i64)) {
44396 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44397 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44398 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44399 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44400 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44401 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44402 Idx &= (NumEltsPerLane - 1);
44403 }
44404 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44405 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44406 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44407 DAG.getBitcast(VecVT, Vec),
44408 DAG.getIntPtrConstant(Idx, dl));
44409 }
44410 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44411 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44412 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44413 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44414 DAG.getTargetConstant(Idx, dl, MVT::i8));
44415 }
44416 return SDValue();
44417 };
44418
44419 // Resolve the target shuffle inputs and mask.
44420 SmallVector<int, 16> Mask;
44421 SmallVector<SDValue, 2> Ops;
44422 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44423 return SDValue();
44424
44425 // Shuffle inputs must be the same size as the result.
44426 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44427 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44428 }))
44429 return SDValue();
44430
44431 // Attempt to narrow/widen the shuffle mask to the correct size.
44432 if (Mask.size() != NumSrcElts) {
44433 if ((NumSrcElts % Mask.size()) == 0) {
44434 SmallVector<int, 16> ScaledMask;
44435 int Scale = NumSrcElts / Mask.size();
44436 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44437 Mask = std::move(ScaledMask);
44438 } else if ((Mask.size() % NumSrcElts) == 0) {
44439 // Simplify Mask based on demanded element.
44440 int ExtractIdx = (int)IdxC.getZExtValue();
44441 int Scale = Mask.size() / NumSrcElts;
44442 int Lo = Scale * ExtractIdx;
44443 int Hi = Scale * (ExtractIdx + 1);
44444 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44445 if (i < Lo || Hi <= i)
44446 Mask[i] = SM_SentinelUndef;
44447
44448 SmallVector<int, 16> WidenedMask;
44449 while (Mask.size() > NumSrcElts &&
44450 canWidenShuffleElements(Mask, WidenedMask))
44451 Mask = std::move(WidenedMask);
44452 }
44453 }
44454
44455 // If narrowing/widening failed, see if we can extract+zero-extend.
44456 int ExtractIdx;
44457 EVT ExtractVT;
44458 if (Mask.size() == NumSrcElts) {
44459 ExtractIdx = Mask[IdxC.getZExtValue()];
44460 ExtractVT = SrcVT;
44461 } else {
44462 unsigned Scale = Mask.size() / NumSrcElts;
44463 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44464 return SDValue();
44465 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44466 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44467 return SDValue();
44468 ExtractIdx = Mask[ScaledIdx];
44469 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44470 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44471 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44472 "Failed to widen vector type");
44473 }
44474
44475 // If the shuffle source element is undef/zero then we can just accept it.
44476 if (ExtractIdx == SM_SentinelUndef)
44477 return DAG.getUNDEF(VT);
44478
44479 if (ExtractIdx == SM_SentinelZero)
44480 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44481 : DAG.getConstant(0, dl, VT);
44482
44483 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44484 ExtractIdx = ExtractIdx % Mask.size();
44485 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44486 return DAG.getZExtOrTrunc(V, dl, VT);
44487
44488 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44489 if (SDValue V = combineExtractFromVectorLoad(
44490 N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44491 return V;
44492
44493 return SDValue();
44494}
44495
44496/// Extracting a scalar FP value from vector element 0 is free, so extract each
44497/// operand first, then perform the math as a scalar op.
44498 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44499 const X86Subtarget &Subtarget) {
44500 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44501 SDValue Vec = ExtElt->getOperand(0);
44502 SDValue Index = ExtElt->getOperand(1);
44503 EVT VT = ExtElt->getValueType(0);
44504 EVT VecVT = Vec.getValueType();
44505
44506 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44507 // non-zero element because the shuffle+scalar op will be cheaper?
44508 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44509 return SDValue();
44510
44511 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44512 // extract, the condition code), so deal with those as a special-case.
44513 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44514 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44515 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44516 return SDValue();
44517
44518 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44519 SDLoc DL(ExtElt);
44520 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44521 Vec.getOperand(0), Index);
44522 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44523 Vec.getOperand(1), Index);
44524 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44525 }
44526
44527 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44528 VT != MVT::f64)
44529 return SDValue();
44530
44531 // Vector FP selects don't fit the pattern of FP math ops (because the
44532 // condition has a different type and we have to change the opcode), so deal
44533 // with those here.
44534 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44535 // has i1 elements. If we loosen this we need to convert vector bool to a
44536 // scalar bool.
44537 if (Vec.getOpcode() == ISD::VSELECT &&
44538 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44539 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44540 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44541 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44542 SDLoc DL(ExtElt);
44543 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44544 Vec.getOperand(0).getValueType().getScalarType(),
44545 Vec.getOperand(0), Index);
44546 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44547 Vec.getOperand(1), Index);
44548 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44549 Vec.getOperand(2), Index);
44550 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44551 }
44552
44553 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44554 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44555 // missed load folding and fma+fneg combining.
44556 switch (Vec.getOpcode()) {
44557 case ISD::FMA: // Begin 3 operands
44558 case ISD::FMAD:
44559 case ISD::FADD: // Begin 2 operands
44560 case ISD::FSUB:
44561 case ISD::FMUL:
44562 case ISD::FDIV:
44563 case ISD::FREM:
44564 case ISD::FCOPYSIGN:
44565 case ISD::FMINNUM:
44566 case ISD::FMAXNUM:
44567 case ISD::FMINNUM_IEEE:
44568 case ISD::FMAXNUM_IEEE:
44569 case ISD::FMAXIMUM:
44570 case ISD::FMINIMUM:
44571 case X86ISD::FMAX:
44572 case X86ISD::FMIN:
44573 case ISD::FABS: // Begin 1 operand
44574 case ISD::FSQRT:
44575 case ISD::FRINT:
44576 case ISD::FCEIL:
44577 case ISD::FTRUNC:
44578 case ISD::FNEARBYINT:
44579 case ISD::FROUNDEVEN:
44580 case ISD::FROUND:
44581 case ISD::FFLOOR:
44582 case X86ISD::FRCP:
44583 case X86ISD::FRSQRT: {
44584 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44585 SDLoc DL(ExtElt);
44586 SmallVector<SDValue, 4> ExtOps;
44587 for (SDValue Op : Vec->ops())
44588 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44589 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44590 }
44591 default:
44592 return SDValue();
44593 }
44594 llvm_unreachable("All opcodes should return within switch");
44595}
44596
44597/// Try to convert a vector reduction sequence composed of binops and shuffles
44598/// into horizontal ops.
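/// For example (illustrative), a v4f32 fadd reduction rooted at
///   (extract (fadd (shuffle ...), ...), 0)
/// becomes (extract (haddps (haddps X, X), (haddps X, X)), 0),
/// i.e. log2(4) = 2 horizontal-add steps followed by an element-0 extract.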
44599 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44600 const X86Subtarget &Subtarget) {
44601 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44602
44603 // We need at least SSE2 to do anything here.
44604 if (!Subtarget.hasSSE2())
44605 return SDValue();
44606
44607 ISD::NodeType Opc;
44608 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44609 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44610 if (!Rdx)
44611 return SDValue();
44612
44613 SDValue Index = ExtElt->getOperand(1);
44614 assert(isNullConstant(Index) &&
44615 "Reduction doesn't end in an extract from index 0");
44616
44617 EVT VT = ExtElt->getValueType(0);
44618 EVT VecVT = Rdx.getValueType();
44619 if (VecVT.getScalarType() != VT)
44620 return SDValue();
44621
44622 SDLoc DL(ExtElt);
44623 unsigned NumElts = VecVT.getVectorNumElements();
44624 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44625
44626 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44627 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44628 if (V.getValueType() == MVT::v4i8) {
44629 if (ZeroExtend && Subtarget.hasSSE41()) {
44630 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44631 DAG.getConstant(0, DL, MVT::v4i32),
44632 DAG.getBitcast(MVT::i32, V),
44633 DAG.getIntPtrConstant(0, DL));
44634 return DAG.getBitcast(MVT::v16i8, V);
44635 }
44636 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44637 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44638 : DAG.getUNDEF(MVT::v4i8));
44639 }
44640 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44641 DAG.getUNDEF(MVT::v8i8));
44642 };
44643
44644 // vXi8 mul reduction - promote to vXi16 mul reduction.
44645 if (Opc == ISD::MUL) {
44646 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44647 return SDValue();
44648 if (VecVT.getSizeInBits() >= 128) {
44649 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44650 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44651 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44652 Lo = DAG.getBitcast(WideVT, Lo);
44653 Hi = DAG.getBitcast(WideVT, Hi);
44654 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44655 while (Rdx.getValueSizeInBits() > 128) {
44656 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44657 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44658 }
44659 } else {
44660 Rdx = WidenToV16I8(Rdx, false);
44661 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44662 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44663 }
44664 if (NumElts >= 8)
44665 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44666 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44667 {4, 5, 6, 7, -1, -1, -1, -1}));
44668 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44669 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44670 {2, 3, -1, -1, -1, -1, -1, -1}));
44671 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44672 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44673 {1, -1, -1, -1, -1, -1, -1, -1}));
44674 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44675 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44676 }
44677
44678 // vXi8 add reduction - sub 128-bit vector.
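// PSADBW against an all-zeros vector sums |x[i] - 0| = x[i], i.e. it
// horizontally adds each group of 8 bytes into the low bits of an i64 lane.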
44679 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44680 Rdx = WidenToV16I8(Rdx, true);
44681 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44682 DAG.getConstant(0, DL, MVT::v16i8));
44683 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44684 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44685 }
44686
44687 // Must be a >=128-bit vector with pow2 elements.
44688 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44689 return SDValue();
44690
44691 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44692 if (VT == MVT::i8) {
44693 while (Rdx.getValueSizeInBits() > 128) {
44694 SDValue Lo, Hi;
44695 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44696 VecVT = Lo.getValueType();
44697 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44698 }
44699 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44700
44701 SDValue Hi = DAG.getVectorShuffle(
44702 MVT::v16i8, DL, Rdx, Rdx,
44703 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44704 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44705 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44706 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44707 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44708 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44709 }
44710
44711 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44712 // If the source vector values are 0-255, then we can use PSADBW to
44713 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44714 // TODO: See if it's worth avoiding vXi16/i32 truncations?
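// For example (illustrative), a v16i32 add reduction whose elements are known
// to be <= 255: truncate to v16i8, PSADBW against zero to get v2i64 partial
// sums, add the two i64 lanes, then truncate the final sum back to i32.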
44715 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44716 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44717 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44718 Subtarget.hasAVX512())) {
44719 if (Rdx.getValueType() == MVT::v8i16) {
44720 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
44721 DAG.getUNDEF(MVT::v8i16));
44722 } else {
44723 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44724 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44725 if (ByteVT.getSizeInBits() < 128)
44726 Rdx = WidenToV16I8(Rdx, true);
44727 }
44728
44729 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44730 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44731 ArrayRef<SDValue> Ops) {
44732 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44733 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44734 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44735 };
44736 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44737 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44738
44739 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44740 while (Rdx.getValueSizeInBits() > 128) {
44741 SDValue Lo, Hi;
44742 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44743 VecVT = Lo.getValueType();
44744 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44745 }
44746 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44747
44748 if (NumElts > 8) {
44749 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44750 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44751 }
44752
44753 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44754 Rdx = DAG.getBitcast(VecVT, Rdx);
44755 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44756 }
44757
44758 // Only use (F)HADD opcodes if they aren't microcoded or we are minimizing codesize.
44759 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44760 return SDValue();
44761
44762 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44763
44764 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44765 // across the whole vector, so we need an extract + hop preliminary stage.
44766 // This is the only step where the operands of the hop are not the same value.
44767 // TODO: We could extend this to handle 512-bit or even longer vectors.
44768 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44769 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44770 unsigned NumElts = VecVT.getVectorNumElements();
44771 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44772 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44773 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44774 VecVT = Rdx.getValueType();
44775 }
44776 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44777 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44778 return SDValue();
44779
44780 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44781 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44782 for (unsigned i = 0; i != ReductionSteps; ++i)
44783 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44784
44785 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44786}
44787
44788/// Detect vector gather/scatter index generation and convert it from being a
44789/// bunch of shuffles and extracts into a somewhat faster sequence.
44790/// For i686, the best sequence is apparently storing the value and loading
44791/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44792 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44793 TargetLowering::DAGCombinerInfo &DCI,
44794 const X86Subtarget &Subtarget) {
44795 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44796 return NewOp;
44797
44798 SDValue InputVector = N->getOperand(0);
44799 SDValue EltIdx = N->getOperand(1);
44800 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44801
44802 EVT SrcVT = InputVector.getValueType();
44803 EVT VT = N->getValueType(0);
44804 SDLoc dl(InputVector);
44805 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
44806 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44807 unsigned NumEltBits = VT.getScalarSizeInBits();
44808 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44809
44810 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
44811 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44812
44813 // Integer Constant Folding.
44814 if (CIdx && VT.isInteger()) {
44815 APInt UndefVecElts;
44816 SmallVector<APInt, 16> EltBits;
44817 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
44818 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
44819 EltBits, /*AllowWholeUndefs*/ true,
44820 /*AllowPartialUndefs*/ false)) {
44821 uint64_t Idx = CIdx->getZExtValue();
44822 if (UndefVecElts[Idx])
44823 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44824 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
44825 }
44826
44827 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
44828 // This improves lowering of bool masks on Rust, which splits them into a byte array.
44829 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
44830 SDValue Src = peekThroughBitcasts(InputVector);
44831 if (Src.getValueType().getScalarType() == MVT::i1 &&
44832 TLI.isTypeLegal(Src.getValueType())) {
44833 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
44834 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
44835 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
44836 return DAG.getBitcast(VT, Sub);
44837 }
44838 }
44839 }
44840
44841 if (IsPextr) {
44842 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
44843 DCI))
44844 return SDValue(N, 0);
44845
44846 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
44847 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
44848 InputVector.getOpcode() == X86ISD::PINSRW) &&
44849 InputVector.getOperand(2) == EltIdx) {
44850 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
44851 "Vector type mismatch");
44852 SDValue Scl = InputVector.getOperand(1);
44853 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
44854 return DAG.getZExtOrTrunc(Scl, dl, VT);
44855 }
44856
44857 // TODO - Remove this once we can handle the implicit zero-extension of
44858 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
44859 // combineBasicSADPattern.
44860 return SDValue();
44861 }
44862
44863 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
44864 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
44865 InputVector.getOpcode() == ISD::BITCAST &&
44866 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44867 isNullConstant(EltIdx) && InputVector.hasOneUse())
44868 return DAG.getBitcast(VT, InputVector);
44869
44870 // Detect mmx to i32 conversion through a v2i32 elt extract.
44871 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44872 InputVector.getOpcode() == ISD::BITCAST &&
44873 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44874 isNullConstant(EltIdx) && InputVector.hasOneUse())
44875 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44876 InputVector.getOperand(0));
44877
44878 // Check whether this extract is the root of a sum of absolute differences
44879 // pattern. This has to be done here because we really want it to happen
44880 // pre-legalization.
44881 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44882 return SAD;
44883
44884 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44885 return VPDPBUSD;
44886
44887 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44888 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44889 return Cmp;
44890
44891 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44892 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44893 return MinMax;
44894
44895 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
44896 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44897 return V;
44898
44899 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44900 return V;
44901
44902 if (CIdx)
44903 if (SDValue V = combineExtractFromVectorLoad(
44904 N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
44905 dl, DAG, DCI))
44906 return V;
44907
44908 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44909 // and then testing the relevant element.
44910 //
44911 // Note that we only combine extracts on the *same* result number, i.e.
44912 // t0 = merge_values a0, a1, a2, a3
44913 // i1 = extract_vector_elt t0, Constant:i64<2>
44914 // i1 = extract_vector_elt t0, Constant:i64<3>
44915 // but not
44916 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44917 // since the latter would need its own MOVMSK.
44918 if (SrcVT.getScalarType() == MVT::i1) {
44919 bool IsVar = !CIdx;
44920 SmallVector<SDNode *, 16> BoolExtracts;
44921 unsigned ResNo = InputVector.getResNo();
44922 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44923 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44924 Use->getOperand(0).getResNo() == ResNo &&
44925 Use->getValueType(0) == MVT::i1) {
44926 BoolExtracts.push_back(Use);
44927 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44928 return true;
44929 }
44930 return false;
44931 };
44932 // TODO: Can we drop the oneuse check for constant extracts?
44933 if (all_of(InputVector->uses(), IsBoolExtract) &&
44934 (IsVar || BoolExtracts.size() > 1)) {
44935 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44936 if (SDValue BC =
44937 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44938 for (SDNode *Use : BoolExtracts) {
44939 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44940 // Mask = 1 << MaskIdx
44941 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44942 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44943 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44944 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44945 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44946 DCI.CombineTo(Use, Res);
44947 }
44948 return SDValue(N, 0);
44949 }
44950 }
44951 }
44952
44953 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
44954 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
44955 SDValue TruncSrc = InputVector.getOperand(0);
44956 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
44957 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
44958 SDValue NewExt =
44959 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
44960 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
44961 }
44962 }
44963
44964 return SDValue();
44965}
44966
44967// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44968// This is more or less the reverse of combineBitcastvxi1.
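// For example (illustrative):
//   (v8i16 (sext (v8i1 (bitcast (i8 X)))))
// broadcasts X to every lane, ANDs lane i with (1 << i), compares the result
// against that same bit mask and sign-extends the compare; ZERO_EXTEND then
// shifts the -1/0 result right to produce 1/0.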
44969 static SDValue combineToExtendBoolVectorInReg(
44970 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44971 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44972 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44973 Opcode != ISD::ANY_EXTEND)
44974 return SDValue();
44975 if (!DCI.isBeforeLegalizeOps())
44976 return SDValue();
44977 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44978 return SDValue();
44979
44980 EVT SVT = VT.getScalarType();
44981 EVT InSVT = N0.getValueType().getScalarType();
44982 unsigned EltSizeInBits = SVT.getSizeInBits();
44983
44984 // Input type must be extending a bool vector (bit-casted from a scalar
44985 // integer) to legal integer types.
44986 if (!VT.isVector())
44987 return SDValue();
44988 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44989 return SDValue();
44990 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44991 return SDValue();
44992
44993 SDValue N00 = N0.getOperand(0);
44994 EVT SclVT = N00.getValueType();
44995 if (!SclVT.isScalarInteger())
44996 return SDValue();
44997
44998 SDValue Vec;
44999 SmallVector<int> ShuffleMask;
45000 unsigned NumElts = VT.getVectorNumElements();
45001 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45002
45003 // Broadcast the scalar integer to the vector elements.
45004 if (NumElts > EltSizeInBits) {
45005 // If the scalar integer is greater than the vector element size, then we
45006 // must split it down into sub-sections for broadcasting. For example:
45007 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45008 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45009 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45010 unsigned Scale = NumElts / EltSizeInBits;
45011 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45012 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45013 Vec = DAG.getBitcast(VT, Vec);
45014
45015 for (unsigned i = 0; i != Scale; ++i)
45016 ShuffleMask.append(EltSizeInBits, i);
45017 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45018 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45019 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45020 // If we have register broadcast instructions, use the scalar size as the
45021 // element type for the shuffle. Then cast to the wider element type. The
45022 // widened bits won't be used, and this might allow the use of a broadcast
45023 // load.
45024 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45025 unsigned Scale = EltSizeInBits / NumElts;
45026 EVT BroadcastVT =
45027 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45028 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45029 ShuffleMask.append(NumElts * Scale, 0);
45030 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45031 Vec = DAG.getBitcast(VT, Vec);
45032 } else {
45033 // For a smaller scalar integer, we can simply any-extend it to the vector
45034 // element size (we don't care about the upper bits) and broadcast it to all
45035 // elements.
45036 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45037 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45038 ShuffleMask.append(NumElts, 0);
45039 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45040 }
45041
45042 // Now, mask the relevant bit in each element.
45043 SmallVector<SDValue, 32> Bits;
45044 for (unsigned i = 0; i != NumElts; ++i) {
45045 int BitIdx = (i % EltSizeInBits);
45046 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45047 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45048 }
45049 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45050 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45051
45052 // Compare against the bitmask and extend the result.
45053 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
45054 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45055 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45056
45057 // For SEXT, this is now done, otherwise shift the result down for
45058 // zero-extension.
45059 if (Opcode == ISD::SIGN_EXTEND)
45060 return Vec;
45061 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45062 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45063}
45064
45065/// If a vector select has an operand that is -1 or 0, try to simplify the
45066/// select to a bitwise logic operation.
45067/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
45068static SDValue
45069 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
45070 TargetLowering::DAGCombinerInfo &DCI,
45071 const X86Subtarget &Subtarget) {
45072 SDValue Cond = N->getOperand(0);
45073 SDValue LHS = N->getOperand(1);
45074 SDValue RHS = N->getOperand(2);
45075 EVT VT = LHS.getValueType();
45076 EVT CondVT = Cond.getValueType();
45077 SDLoc DL(N);
45078 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45079
45080 if (N->getOpcode() != ISD::VSELECT)
45081 return SDValue();
45082
45083 assert(CondVT.isVector() && "Vector select expects a vector selector!");
45084
45085 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45086 // TODO: Can we assert that both operands are not zeros (because that should
45087 // get simplified at node creation time)?
45088 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45089 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45090
45091 // If both inputs are 0/undef, create a complete zero vector.
45092 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45093 if (TValIsAllZeros && FValIsAllZeros) {
45094 if (VT.isFloatingPoint())
45095 return DAG.getConstantFP(0.0, DL, VT);
45096 return DAG.getConstant(0, DL, VT);
45097 }
45098
45099 // To use the condition operand as a bitwise mask, it must have elements that
45100 // are the same size as the select elements. I.e., the condition operand must
45101 // have already been promoted from the IR select condition type <N x i1>.
45102 // Don't check if the types themselves are equal because that excludes
45103 // vector floating-point selects.
45104 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45105 return SDValue();
45106
45107 // Try to invert the condition if true value is not all 1s and false value is
45108 // not all 0s. Only do this if the condition has one use.
45109 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45110 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45111 // Check if the selector will be produced by CMPP*/PCMP*.
45112 Cond.getOpcode() == ISD::SETCC &&
45113 // Check if SETCC has already been promoted.
45114 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45115 CondVT) {
45116 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45117
45118 if (TValIsAllZeros || FValIsAllOnes) {
45119 SDValue CC = Cond.getOperand(2);
45120 ISD::CondCode NewCC = ISD::getSetCCInverse(
45121 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45122 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45123 NewCC);
45124 std::swap(LHS, RHS);
45125 TValIsAllOnes = FValIsAllOnes;
45126 FValIsAllZeros = TValIsAllZeros;
45127 }
45128 }
45129
45130 // Cond value must be 'sign splat' to be converted to a logical op.
45131 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45132 return SDValue();
45133
45134 // vselect Cond, 111..., 000... -> Cond
45135 if (TValIsAllOnes && FValIsAllZeros)
45136 return DAG.getBitcast(VT, Cond);
45137
45138 if (!TLI.isTypeLegal(CondVT))
45139 return SDValue();
45140
45141 // vselect Cond, 111..., X -> or Cond, X
45142 if (TValIsAllOnes) {
45143 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45144 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45145 return DAG.getBitcast(VT, Or);
45146 }
45147
45148 // vselect Cond, X, 000... -> and Cond, X
45149 if (FValIsAllZeros) {
45150 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45151 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45152 return DAG.getBitcast(VT, And);
45153 }
45154
45155 // vselect Cond, 000..., X -> andn Cond, X
45156 if (TValIsAllZeros) {
45157 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45158 SDValue AndN;
45159 // The canonical form differs for i1 vectors - x86andnp is not used
45160 if (CondVT.getScalarType() == MVT::i1)
45161 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45162 CastRHS);
45163 else
45164 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45165 return DAG.getBitcast(VT, AndN);
45166 }
45167
45168 return SDValue();
45169}
45170
45171/// If both arms of a vector select are concatenated vectors, split the select,
45172/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45173/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45174/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45175 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45176 const X86Subtarget &Subtarget) {
45177 unsigned Opcode = N->getOpcode();
45178 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45179 return SDValue();
45180
45181 // TODO: Split 512-bit vectors too?
45182 EVT VT = N->getValueType(0);
45183 if (!VT.is256BitVector())
45184 return SDValue();
45185
45186 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45187 SDValue Cond = N->getOperand(0);
45188 SDValue TVal = N->getOperand(1);
45189 SDValue FVal = N->getOperand(2);
45190 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45191 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45192 !isFreeToSplitVector(FVal.getNode(), DAG))
45193 return SDValue();
45194
45195 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45196 ArrayRef<SDValue> Ops) {
45197 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45198 };
45199 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45200 makeBlend, /*CheckBWI*/ false);
45201}
45202
45203 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45204 SDValue Cond = N->getOperand(0);
45205 SDValue LHS = N->getOperand(1);
45206 SDValue RHS = N->getOperand(2);
45207 SDLoc DL(N);
45208
45209 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45210 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45211 if (!TrueC || !FalseC)
45212 return SDValue();
45213
45214 // Don't do this for crazy integer types.
45215 EVT VT = N->getValueType(0);
45216 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45217 return SDValue();
45218
45219 // We're going to use the condition bit in math or logic ops. We could allow
45220 // this with a wider condition value (post-legalization it becomes an i8),
45221 // but if nothing is creating selects that late, it doesn't matter.
45222 if (Cond.getValueType() != MVT::i1)
45223 return SDValue();
45224
45225 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45226 // 3, 5, or 9 with i32/i64, so those get transformed too.
45227 // TODO: For constants that overflow or do not differ by power-of-2 or small
45228 // multiplier, convert to 'and' + 'add'.
45229 const APInt &TrueVal = TrueC->getAPIntValue();
45230 const APInt &FalseVal = FalseC->getAPIntValue();
45231
45232 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45233 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45234 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45235 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45236 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45237 return SDValue();
45238 }
45239
45240 bool OV;
45241 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45242 if (OV)
45243 return SDValue();
45244
45245 APInt AbsDiff = Diff.abs();
45246 if (AbsDiff.isPowerOf2() ||
45247 ((VT == MVT::i32 || VT == MVT::i64) &&
45248 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45249
45250 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45251 // of the condition can usually be folded into a compare predicate, but even
45252 // without that, the sequence should be cheaper than a CMOV alternative.
45253 if (TrueVal.slt(FalseVal)) {
45254 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45255 std::swap(TrueC, FalseC);
45256 }
45257
45258 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
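// e.g. (select Cond, 7, 3) --> (zext(Cond) * 4) + 3, which lowers to a
// shift-by-2 plus add (illustrative constants, not from a specific test).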
45259 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45260
45261 // Multiply condition by the difference if non-one.
45262 if (!AbsDiff.isOne())
45263 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45264
45265 // Add the base if non-zero.
45266 if (!FalseC->isZero())
45267 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45268
45269 return R;
45270 }
45271
45272 return SDValue();
45273}
45274
45275/// If this is a *dynamic* select (non-constant condition) and we can match
45276/// this node with one of the variable blend instructions, restructure the
45277/// condition so that blends can use the high (sign) bit of each element.
45278/// This function will also call SimplifyDemandedBits on already created
45279/// BLENDV to perform additional simplifications.
45280 static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45281 TargetLowering::DAGCombinerInfo &DCI,
45282 const X86Subtarget &Subtarget) {
45283 SDValue Cond = N->getOperand(0);
45284 if ((N->getOpcode() != ISD::VSELECT &&
45285 N->getOpcode() != X86ISD::BLENDV) ||
45286 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45287 return SDValue();
45288
45289 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45290 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45291 EVT VT = N->getValueType(0);
45292
45293 // We can only handle the cases where VSELECT is directly legal on the
45294 // subtarget. We custom lower VSELECT nodes with constant conditions and
45295 // this makes it hard to see whether a dynamic VSELECT will correctly
45296 // lower, so we both check the operation's status and explicitly handle the
45297 // cases where a *dynamic* blend will fail even though a constant-condition
45298 // blend could be custom lowered.
45299 // FIXME: We should find a better way to handle this class of problems.
45300 // Potentially, we should combine constant-condition vselect nodes
45301 // pre-legalization into shuffles and not mark as many types as custom
45302 // lowered.
45303 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45304 return SDValue();
45305 // FIXME: We don't support i16-element blends currently. We could and
45306 // should support them by making *all* the bits in the condition be set
45307 // rather than just the high bit and using an i8-element blend.
45308 if (VT.getVectorElementType() == MVT::i16)
45309 return SDValue();
45310 // Dynamic blending was only available from SSE4.1 onward.
45311 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45312 return SDValue();
45313 // Byte blends are only available in AVX2
45314 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45315 return SDValue();
45316 // There are no 512-bit blend instructions that use sign bits.
45317 if (VT.is512BitVector())
45318 return SDValue();
45319
45320 // Don't optimize before the condition has been transformed to a legal type
45321 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45322 if (BitWidth < 8 || BitWidth > 64)
45323 return SDValue();
45324
45325 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45326 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45327 UI != UE; ++UI)
45328 if ((UI->getOpcode() != ISD::VSELECT &&
45329 UI->getOpcode() != X86ISD::BLENDV) ||
45330 UI.getOperandNo() != 0)
45331 return false;
45332
45333 return true;
45334 };
45335
45336 APInt DemandedBits(APInt::getSignMask(BitWidth));
45337
45338 if (OnlyUsedAsSelectCond(Cond)) {
45339 KnownBits Known;
45340 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45341 !DCI.isBeforeLegalizeOps());
45342 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45343 return SDValue();
45344
45345 // If we changed the computation somewhere in the DAG, this change will
45346 // affect all users of Cond. Update all the nodes so that we do not use
45347 // the generic VSELECT anymore. Otherwise, we may perform wrong
45348 // optimizations as we messed with the actual expectation for the vector
45349 // boolean values.
45350 for (SDNode *U : Cond->uses()) {
45351 if (U->getOpcode() == X86ISD::BLENDV)
45352 continue;
45353
45354 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45355 Cond, U->getOperand(1), U->getOperand(2));
45356 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45357 DCI.AddToWorklist(U);
45358 }
45359 DCI.CommitTargetLoweringOpt(TLO);
45360 return SDValue(N, 0);
45361 }
45362
45363 // Otherwise we can still at least try to simplify multiple use bits.
45364 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45365 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45366 N->getOperand(1), N->getOperand(2));
45367
45368 return SDValue();
45369}
45370
45371// Try to match:
45372// (or (and (M, (sub 0, X)), (pandn M, X)))
45373// which is a special case of:
45374// (select M, (sub 0, X), X)
45375// Per:
45376// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45377// We know that, if fNegate is 0 or 1:
45378// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45379//
45380// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45381// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45382// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45383// This lets us transform our vselect to:
45384// (add (xor X, M), (and M, 1))
45385// And further to:
45386// (sub (xor X, M), M)
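// A quick scalar sanity check of the identity (illustrative only): with
// i8 X = 5 and M = all-ones (-1), (X ^ M) - M = -6 - (-1) = -5 = -X; with
// M = 0, (X ^ 0) - 0 = X.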
45387 static SDValue combineLogicBlendIntoConditionalNegate(
45388 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45389 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45390 EVT MaskVT = Mask.getValueType();
45391 assert(MaskVT.isInteger() &&
45392 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45393 "Mask must be zero/all-bits");
45394
45395 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45396 return SDValue();
45397 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45398 return SDValue();
45399
45400 auto IsNegV = [](SDNode *N, SDValue V) {
45401 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45402 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45403 };
45404
45405 SDValue V;
45406 if (IsNegV(Y.getNode(), X))
45407 V = X;
45408 else if (IsNegV(X.getNode(), Y))
45409 V = Y;
45410 else
45411 return SDValue();
45412
45413 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45414 SDValue SubOp2 = Mask;
45415
45416 // If the negate was on the false side of the select, then
45417 // the operands of the SUB need to be swapped. PR 27251.
45418 // This is because the pattern being matched above is
45419 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45420 // but if the pattern matched was
45421 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45422 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45423 // pattern also needs to be a negation of the replacement pattern above.
45424 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45425 // sub accomplishes the negation of the replacement pattern.
45426 if (V == Y)
45427 std::swap(SubOp1, SubOp2);
45428
45429 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45430 return DAG.getBitcast(VT, Res);
45431}
45432
45433 static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
45434 const X86Subtarget &Subtarget) {
45435 if (!Subtarget.hasAVX512())
45436 return SDValue();
45437 if (N->getOpcode() != ISD::VSELECT)
45438 return SDValue();
45439
45440 SDLoc DL(N);
45441 SDValue Cond = N->getOperand(0);
45442 SDValue LHS = N->getOperand(1);
45443 SDValue RHS = N->getOperand(2);
45444
45445 if (canCombineAsMaskOperation(LHS, Subtarget))
45446 return SDValue();
45447
45448 if (!canCombineAsMaskOperation(RHS, Subtarget))
45449 return SDValue();
45450
45451 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45452 return SDValue();
45453
45454 // Commute LHS and RHS to create opportunity to select mask instruction.
45455 // (vselect M, L, R) -> (vselect ~M, R, L)
45456 ISD::CondCode NewCC =
45457 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45458 Cond.getOperand(0).getValueType());
45459 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45460 Cond.getOperand(1), NewCC);
45461 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45462}
45463
45464/// Do target-specific dag combines on SELECT and VSELECT nodes.
45465 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45466 TargetLowering::DAGCombinerInfo &DCI,
45467 const X86Subtarget &Subtarget) {
45468 SDLoc DL(N);
45469 SDValue Cond = N->getOperand(0);
45470 SDValue LHS = N->getOperand(1);
45471 SDValue RHS = N->getOperand(2);
45472
45473 // Try simplification again because we use this function to optimize
45474 // BLENDV nodes that are not handled by the generic combiner.
45475 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45476 return V;
45477
45478 // When AVX512 is available, the LHS operand of a select instruction can be
45479 // folded into a masked instruction, while the RHS operand can't. Commute the
45480 // LHS and RHS of the select instruction to create the opportunity for
45481 // folding.
45482 if (SDValue V = commuteSelect(N, DAG, Subtarget))
45483 return V;
45484
45485 EVT VT = LHS.getValueType();
45486 EVT CondVT = Cond.getValueType();
45487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45488 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45489
45490 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45491 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45492 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45493 if (CondVT.isVector() && CondVT.isInteger() &&
45494 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45495 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45496 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45497 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45498 DL, DAG, Subtarget))
45499 return V;
45500
45501 // Convert vselects with constant condition into shuffles.
45502 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45503 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45504 SmallVector<int, 64> Mask;
45505 if (createShuffleMaskFromVSELECT(Mask, Cond,
45506 N->getOpcode() == X86ISD::BLENDV))
45507 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45508 }
45509
45510 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45511 // by forcing the unselected elements to zero.
45512 // TODO: Can we handle more shuffles with this?
45513 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45514 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45515 LHS.hasOneUse() && RHS.hasOneUse()) {
45516 MVT SimpleVT = VT.getSimpleVT();
45517 SmallVector<SDValue, 1> LHSOps, RHSOps;
45518 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45519 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45520 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45521 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45522 int NumElts = VT.getVectorNumElements();
45523 for (int i = 0; i != NumElts; ++i) {
45524 // getConstVector sets negative shuffle mask values as undef, so ensure
45525 // we hardcode SM_SentinelZero values to zero (0x80).
45526 if (CondMask[i] < NumElts) {
45527 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45528 RHSMask[i] = 0x80;
45529 } else {
45530 LHSMask[i] = 0x80;
45531 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45532 }
45533 }
45534 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45535 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45536 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45537 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45538 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45539 }
45540 }
45541
45542 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45543 // instructions match the semantics of the common C idiom x<y?x:y but not
45544 // x<=y?x:y, because of how they handle negative zero (which can be
45545 // ignored in unsafe-math mode).
45546 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45547 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45548 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45549 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45550 (Subtarget.hasSSE2() ||
45551 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45552 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45553
45554 unsigned Opcode = 0;
45555 // Check for x CC y ? x : y.
45556 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45557 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45558 switch (CC) {
45559 default: break;
45560 case ISD::SETULT:
45561 // Converting this to a min would handle NaNs incorrectly, and swapping
45562 // the operands would cause it to handle comparisons between positive
45563 // and negative zero incorrectly.
45564 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45565 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45566 !(DAG.isKnownNeverZeroFloat(LHS) ||
45567 DAG.isKnownNeverZeroFloat(RHS)))
45568 break;
45569 std::swap(LHS, RHS);
45570 }
45571 Opcode = X86ISD::FMIN;
45572 break;
45573 case ISD::SETOLE:
45574 // Converting this to a min would handle comparisons between positive
45575 // and negative zero incorrectly.
45576 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45577 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45578 break;
45579 Opcode = X86ISD::FMIN;
45580 break;
45581 case ISD::SETULE:
45582 // Converting this to a min would handle both negative zeros and NaNs
45583 // incorrectly, but we can swap the operands to fix both.
45584 std::swap(LHS, RHS);
45585 [[fallthrough]];
45586 case ISD::SETOLT:
45587 case ISD::SETLT:
45588 case ISD::SETLE:
45589 Opcode = X86ISD::FMIN;
45590 break;
45591
45592 case ISD::SETOGE:
45593 // Converting this to a max would handle comparisons between positive
45594 // and negative zero incorrectly.
45595 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45596 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45597 break;
45598 Opcode = X86ISD::FMAX;
45599 break;
45600 case ISD::SETUGT:
45601 // Converting this to a max would handle NaNs incorrectly, and swapping
45602 // the operands would cause it to handle comparisons between positive
45603 // and negative zero incorrectly.
45604 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45605 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45606 !(DAG.isKnownNeverZeroFloat(LHS) ||
45607 DAG.isKnownNeverZeroFloat(RHS)))
45608 break;
45609 std::swap(LHS, RHS);
45610 }
45611 Opcode = X86ISD::FMAX;
45612 break;
45613 case ISD::SETUGE:
45614 // Converting this to a max would handle both negative zeros and NaNs
45615 // incorrectly, but we can swap the operands to fix both.
45616 std::swap(LHS, RHS);
45617 [[fallthrough]];
45618 case ISD::SETOGT:
45619 case ISD::SETGT:
45620 case ISD::SETGE:
45621 Opcode = X86ISD::FMAX;
45622 break;
45623 }
45624 // Check for x CC y ? y : x -- a min/max with reversed arms.
45625 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45626 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45627 switch (CC) {
45628 default: break;
45629 case ISD::SETOGE:
45630 // Converting this to a min would handle comparisons between positive
45631 // and negative zero incorrectly, and swapping the operands would
45632 // cause it to handle NaNs incorrectly.
45633 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45634 !(DAG.isKnownNeverZeroFloat(LHS) ||
45635 DAG.isKnownNeverZeroFloat(RHS))) {
45636 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45637 break;
45638 std::swap(LHS, RHS);
45639 }
45640 Opcode = X86ISD::FMIN;
45641 break;
45642 case ISD::SETUGT:
45643 // Converting this to a min would handle NaNs incorrectly.
45644 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45645 break;
45646 Opcode = X86ISD::FMIN;
45647 break;
45648 case ISD::SETUGE:
45649 // Converting this to a min would handle both negative zeros and NaNs
45650 // incorrectly, but we can swap the operands to fix both.
45651 std::swap(LHS, RHS);
45652 [[fallthrough]];
45653 case ISD::SETOGT:
45654 case ISD::SETGT:
45655 case ISD::SETGE:
45656 Opcode = X86ISD::FMIN;
45657 break;
45658
45659 case ISD::SETULT:
45660 // Converting this to a max would handle NaNs incorrectly.
45661 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45662 break;
45663 Opcode = X86ISD::FMAX;
45664 break;
45665 case ISD::SETOLE:
45666 // Converting this to a max would handle comparisons between positive
45667 // and negative zero incorrectly, and swapping the operands would
45668 // cause it to handle NaNs incorrectly.
45669 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45670 !DAG.isKnownNeverZeroFloat(LHS) &&
45671 !DAG.isKnownNeverZeroFloat(RHS)) {
45672 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45673 break;
45674 std::swap(LHS, RHS);
45675 }
45676 Opcode = X86ISD::FMAX;
45677 break;
45678 case ISD::SETULE:
45679 // Converting this to a max would handle both negative zeros and NaNs
45680 // incorrectly, but we can swap the operands to fix both.
45681 std::swap(LHS, RHS);
45682 [[fallthrough]];
45683 case ISD::SETOLT:
45684 case ISD::SETLT:
45685 case ISD::SETLE:
45686 Opcode = X86ISD::FMAX;
45687 break;
45688 }
45689 }
45690
45691 if (Opcode)
45692 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45693 }
45694
45695 // Some mask scalar intrinsics rely on checking if only one bit is set
45696 // and implement it in C code like this:
45697 // A[0] = (U & 1) ? A[0] : W[0];
45698 // This creates some redundant instructions that break pattern matching.
45699 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
45700 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45701 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45702 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45703 SDValue AndNode = Cond.getOperand(0);
45704 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45705 isNullConstant(Cond.getOperand(1)) &&
45706 isOneConstant(AndNode.getOperand(1))) {
45707 // LHS and RHS swapped due to
45708 // setcc outputting 1 when AND resulted in 0 and vice versa.
45709 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45710 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45711 }
45712 }
45713
45714 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45715 // lowering on KNL. In this case we convert it to
45716 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
45717 // The same applies to all vectors of i8 and i16 elements without BWI.
45718 // Make sure we extend these even before type legalization gets a chance to
45719 // split wide vectors.
45720 // Since SKX these selects have a proper lowering.
45721 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45722 CondVT.getVectorElementType() == MVT::i1 &&
45723 (VT.getVectorElementType() == MVT::i8 ||
45724 VT.getVectorElementType() == MVT::i16)) {
45725 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45726 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45727 }
45728
45729 // AVX512 - Extend select with zero to merge with target shuffle.
45730 // select(mask, extract_subvector(shuffle(x)), zero) -->
45731 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45732 // TODO - support non target shuffles as well.
45733 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45734 CondVT.getVectorElementType() == MVT::i1) {
45735 auto SelectableOp = [&TLI](SDValue Op) {
45736 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45737 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45738 isNullConstant(Op.getOperand(1)) &&
45739 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45740 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45741 };
45742
45743 bool SelectableLHS = SelectableOp(LHS);
45744 bool SelectableRHS = SelectableOp(RHS);
45745 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45746 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45747
45748 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45749 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45750 : RHS.getOperand(0).getValueType();
45751 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45752 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45753 VT.getSizeInBits());
45754 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45755 VT.getSizeInBits());
45756 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45757 DAG.getUNDEF(SrcCondVT), Cond,
45758 DAG.getIntPtrConstant(0, DL));
45759 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45760 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45761 }
45762 }
45763
45764 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45765 return V;
45766
45767 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45768 Cond.hasOneUse()) {
45769 EVT CondVT = Cond.getValueType();
45770 SDValue Cond0 = Cond.getOperand(0);
45771 SDValue Cond1 = Cond.getOperand(1);
45772 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45773
45774 // Canonicalize min/max:
45775 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45776 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45777 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45778 // the need for an extra compare against zero. e.g.
45779 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
45780 // subl %esi, %edi
45781 // testl %edi, %edi
45782 // movl $0, %eax
45783 // cmovgl %edi, %eax
45784 // =>
45785 // xorl %eax, %eax
45786 // subl %esi, %edi
45787 // cmovsl %eax, %edi
45788 //
45789 // We can also canonicalize
45790 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45791 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45792 // This allows the use of a test instruction for the compare.
45793 if (LHS == Cond0 && RHS == Cond1) {
45794 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45795 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45796 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45797 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45798 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45799 }
45800 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45801 ISD::CondCode NewCC = ISD::SETUGE;
45802 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45803 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45804 }
45805 }
45806
45807 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45808 // fold eq + gt/lt nested selects into ge/le selects
45809 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45810 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45811 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45812 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45813 // .. etc ..
45814 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45815 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45816 SDValue InnerSetCC = RHS.getOperand(0);
45817 ISD::CondCode InnerCC =
45818 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45819 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45820 Cond0 == InnerSetCC.getOperand(0) &&
45821 Cond1 == InnerSetCC.getOperand(1)) {
45822 ISD::CondCode NewCC;
45823 switch (CC == ISD::SETEQ ? InnerCC : CC) {
45824 // clang-format off
45825 case ISD::SETGT: NewCC = ISD::SETGE; break;
45826 case ISD::SETLT: NewCC = ISD::SETLE; break;
45827 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
45828 case ISD::SETULT: NewCC = ISD::SETULE; break;
45829 default: NewCC = ISD::SETCC_INVALID; break;
45830 // clang-format on
45831 }
45832 if (NewCC != ISD::SETCC_INVALID) {
45833 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
45834 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
45835 }
45836 }
45837 }
45838 }
45839
45840 // Check if the first operand is all zeros and Cond type is vXi1.
45841 // If this is an AVX512 target we can improve the use of zero masking by
45842 // swapping the operands and inverting the condition.
45843 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
45844 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
45845 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
45846 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
45847 // Invert the cond to not(cond) : xor(op,allones)=not(op)
45848 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
45849 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
45850 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
45851 }
45852
45853 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
45854 // get split by legalization.
45855 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
45856 CondVT.getVectorElementType() == MVT::i1 &&
45857 TLI.isTypeLegal(VT.getScalarType())) {
45858 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
45859 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45860 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45861 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45862 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45863 }
45864 }
45865
45866 // Early exit check
45867 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45868 return SDValue();
45869
45870 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45871 return V;
45872
45873 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45874 return V;
45875
45876 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45877 return V;
45878
45879 // select(~Cond, X, Y) -> select(Cond, Y, X)
45880 if (CondVT.getScalarType() != MVT::i1) {
45881 if (SDValue CondNot = IsNOT(Cond, DAG))
45882 return DAG.getNode(N->getOpcode(), DL, VT,
45883 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45884
45885 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45886 // signbit.
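// This works because pcmpgt(X, -1) sets lanes where X >= 0, while
// pcmpgt(0, X) sets lanes where X < 0 (the inverted mask), so the select
// arms are swapped below (informal reasoning for the transform that follows).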
45887 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45888 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45889 Cond.hasOneUse()) {
45890 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45891 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45892 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45893 }
45894 }
45895
45896 // Try to optimize vXi1 selects if both operands are either all constants or
45897 // bitcasts from scalar integer type. In that case we can convert the operands
45898 // to integer and use an integer select which will be converted to a CMOV.
45899 // We need to take a little bit of care to avoid creating an i64 type after
45900 // type legalization.
45901 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45902 VT.getVectorElementType() == MVT::i1 &&
45903 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45904 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45905 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45906 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45907 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45908
45909 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45910 LHS.getOperand(0).getValueType() == IntVT)) &&
45911 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45912 RHS.getOperand(0).getValueType() == IntVT))) {
45913 if (LHSIsConst)
45914 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45915 else
45916 LHS = LHS.getOperand(0);
45917
45918 if (RHSIsConst)
45919 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45920 else
45921 RHS = RHS.getOperand(0);
45922
45923 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45924 return DAG.getBitcast(VT, Select);
45925 }
45926 }
45927 }
45928
45929 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45930 // single bits, then invert the predicate and swap the select operands.
45931 // This can lower using a vector shift bit-hack rather than mask and compare.
45932 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45933 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45934 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45935 Cond.getOperand(0).getOpcode() == ISD::AND &&
45936 isNullOrNullSplat(Cond.getOperand(1)) &&
45937 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45938 Cond.getOperand(0).getValueType() == VT) {
45939 // The 'and' mask must be composed of power-of-2 constants.
45940 SDValue And = Cond.getOperand(0);
45941 auto *C = isConstOrConstSplat(And.getOperand(1));
45942 if (C && C->getAPIntValue().isPowerOf2()) {
45943 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45944 SDValue NotCond =
45945 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45946 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45947 }
45948
45949 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45950 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45951 // 16-bit lacks a proper blendv.
45952 unsigned EltBitWidth = VT.getScalarSizeInBits();
45953 bool CanShiftBlend =
45954 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45955 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45956 (Subtarget.hasXOP()));
45957 if (CanShiftBlend &&
45958 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45959 return C->getAPIntValue().isPowerOf2();
45960 })) {
45961 // Create a left-shift constant to get the mask bits over to the sign-bit.
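// e.g. for a v4i32 mask of <1,2,4,8> the shift amounts come out as
// <31,30,29,28> (illustrative; ShlVals below is EltBitWidth - 1 - log2(C)).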
45962 SDValue Mask = And.getOperand(1);
45963 SmallVector<int, 32> ShlVals;
45964 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45965 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45966 ShlVals.push_back(EltBitWidth - 1 -
45967 MaskVal->getAPIntValue().exactLogBase2());
45968 }
45969 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45970 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45971 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45972 SDValue NewCond =
45973 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45974 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45975 }
45976 }
45977
45978 return SDValue();
45979}
45980
45981/// Combine:
45982/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45983/// to:
45984/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45985/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45986/// Note that this is only legal for some op/cc combinations.
45988 SelectionDAG &DAG,
45989 const X86Subtarget &Subtarget) {
45990 // This combine only operates on CMP-like nodes.
45991 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45992 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45993 return SDValue();
45994
45995 // Can't replace the cmp if it has more uses than the one we're looking at.
45996 // FIXME: We would like to be able to handle this, but would need to make sure
45997 // all uses were updated.
45998 if (!Cmp.hasOneUse())
45999 return SDValue();
46000
46001 // This only applies to variations of the common case:
46002 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46003 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46004 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46005 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46006 // Using the proper condcodes (see below), overflow is checked for.
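// Roughly speaking, a test like "if (atomic_fetch_add(&x, 1) < 0)" can reuse
// the flags of the LOCK ADD itself with COND_LE instead of re-comparing the
// loaded value (informal example, not taken from a specific test).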
46007
46008 // FIXME: We can generalize both constraints:
46009 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46010 // - LHS != 1
46011 // if the result is compared.
46012
46013 SDValue CmpLHS = Cmp.getOperand(0);
46014 SDValue CmpRHS = Cmp.getOperand(1);
46015 EVT CmpVT = CmpLHS.getValueType();
46016
46017 if (!CmpLHS.hasOneUse())
46018 return SDValue();
46019
46020 unsigned Opc = CmpLHS.getOpcode();
46021 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46022 return SDValue();
46023
46024 SDValue OpRHS = CmpLHS.getOperand(2);
46025 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46026 if (!OpRHSC)
46027 return SDValue();
46028
46029 APInt Addend = OpRHSC->getAPIntValue();
46030 if (Opc == ISD::ATOMIC_LOAD_SUB)
46031 Addend = -Addend;
46032
46033 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46034 if (!CmpRHSC)
46035 return SDValue();
46036
46037 APInt Comparison = CmpRHSC->getAPIntValue();
46038 APInt NegAddend = -Addend;
46039
46040 // See if we can adjust the CC to make the comparison match the negated
46041 // addend.
46042 if (Comparison != NegAddend) {
46043 APInt IncComparison = Comparison + 1;
46044 if (IncComparison == NegAddend) {
46045 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46046 Comparison = IncComparison;
46047 CC = X86::COND_AE;
46048 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46049 Comparison = IncComparison;
46050 CC = X86::COND_L;
46051 }
46052 }
46053 APInt DecComparison = Comparison - 1;
46054 if (DecComparison == NegAddend) {
46055 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46056 Comparison = DecComparison;
46057 CC = X86::COND_A;
46058 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46059 Comparison = DecComparison;
46060 CC = X86::COND_LE;
46061 }
46062 }
46063 }
46064
46065 // If the addend is the negation of the comparison value, then we can do
46066 // a full comparison by emitting the atomic arithmetic as a locked sub.
46067 if (Comparison == NegAddend) {
46068 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46069 // atomic sub.
46070 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46071 auto AtomicSub = DAG.getAtomic(
46072 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46073 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46074 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46075 AN->getMemOperand());
46076 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46077 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46078 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46079 return LockOp;
46080 }
46081
46082 // We can handle comparisons with zero in a number of cases by manipulating
46083 // the CC used.
46084 if (!Comparison.isZero())
46085 return SDValue();
46086
46087 if (CC == X86::COND_S && Addend == 1)
46088 CC = X86::COND_LE;
46089 else if (CC == X86::COND_NS && Addend == 1)
46090 CC = X86::COND_G;
46091 else if (CC == X86::COND_G && Addend == -1)
46092 CC = X86::COND_GE;
46093 else if (CC == X86::COND_LE && Addend == -1)
46094 CC = X86::COND_L;
46095 else
46096 return SDValue();
46097
46098 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46099 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46100 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46101 return LockOp;
46102}
46103
46104// Check whether a boolean test is testing a boolean value generated by
46105// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46106// code.
46107//
46108// Simplify the following patterns:
46109// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46110// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46111// to (Op EFLAGS Cond)
46112//
46113// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46114// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46115// to (Op EFLAGS !Cond)
46116//
46117// where Op could be BRCOND or CMOV.
46118//
46119 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46120 // This combine only operates on CMP-like nodes.
46121 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46122 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46123 return SDValue();
46124
46125 // Quit if not used as a boolean value.
46126 if (CC != X86::COND_E && CC != X86::COND_NE)
46127 return SDValue();
46128
46129 // Check CMP operands. One of them should be 0 or 1 and the other should be
46130 // an SetCC or extended from it.
46131 SDValue Op1 = Cmp.getOperand(0);
46132 SDValue Op2 = Cmp.getOperand(1);
46133
46134 SDValue SetCC;
46135 const ConstantSDNode* C = nullptr;
46136 bool needOppositeCond = (CC == X86::COND_E);
46137 bool checkAgainstTrue = false; // Is it a comparison against 1?
46138
46139 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46140 SetCC = Op2;
46141 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46142 SetCC = Op1;
46143 else // Quit if neither operand is a constant.
46144 return SDValue();
46145
46146 if (C->getZExtValue() == 1) {
46147 needOppositeCond = !needOppositeCond;
46148 checkAgainstTrue = true;
46149 } else if (C->getZExtValue() != 0)
46150 // Quit if the constant is neither 0 nor 1.
46151 return SDValue();
46152
46153 bool truncatedToBoolWithAnd = false;
46154 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46155 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46156 SetCC.getOpcode() == ISD::TRUNCATE ||
46157 SetCC.getOpcode() == ISD::AND) {
46158 if (SetCC.getOpcode() == ISD::AND) {
46159 int OpIdx = -1;
46160 if (isOneConstant(SetCC.getOperand(0)))
46161 OpIdx = 1;
46162 if (isOneConstant(SetCC.getOperand(1)))
46163 OpIdx = 0;
46164 if (OpIdx < 0)
46165 break;
46166 SetCC = SetCC.getOperand(OpIdx);
46167 truncatedToBoolWithAnd = true;
46168 } else
46169 SetCC = SetCC.getOperand(0);
46170 }
46171
46172 switch (SetCC.getOpcode()) {
46173 case X86ISD::SETCC_CARRY:
46174 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46175 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46176 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46177 // truncated to i1 using 'and'.
46178 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46179 break;
46181 "Invalid use of SETCC_CARRY!");
46182 [[fallthrough]];
46183 case X86ISD::SETCC:
46184 // Set the condition code or opposite one if necessary.
46185 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46186 if (needOppositeCond)
46187 CC = X86::GetOppositeBranchCondition(CC);
46188 return SetCC.getOperand(1);
46189 case X86ISD::CMOV: {
46190 // Check whether false/true value has canonical one, i.e. 0 or 1.
46191 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46192 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46193 // Quit if true value is not a constant.
46194 if (!TVal)
46195 return SDValue();
46196 // Quit if false value is not a constant.
46197 if (!FVal) {
46198 SDValue Op = SetCC.getOperand(0);
46199 // Skip 'zext' or 'trunc' node.
46200 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46201 Op.getOpcode() == ISD::TRUNCATE)
46202 Op = Op.getOperand(0);
46203 // A special case for rdrand/rdseed, where 0 is set if false cond is
46204 // found.
46205 if ((Op.getOpcode() != X86ISD::RDRAND &&
46206 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46207 return SDValue();
46208 }
46209 // Quit if false value is not the constant 0 or 1.
46210 bool FValIsFalse = true;
46211 if (FVal && FVal->getZExtValue() != 0) {
46212 if (FVal->getZExtValue() != 1)
46213 return SDValue();
46214 // If FVal is 1, opposite cond is needed.
46215 needOppositeCond = !needOppositeCond;
46216 FValIsFalse = false;
46217 }
46218 // Quit if TVal is not the constant opposite of FVal.
46219 if (FValIsFalse && TVal->getZExtValue() != 1)
46220 return SDValue();
46221 if (!FValIsFalse && TVal->getZExtValue() != 0)
46222 return SDValue();
46223 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46224 if (needOppositeCond)
46225 CC = X86::GetOppositeBranchCondition(CC);
46226 return SetCC.getOperand(3);
46227 }
46228 }
46229
46230 return SDValue();
46231}
46232
46233/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46234/// Match:
46235/// (X86or (X86setcc) (X86setcc))
46236/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46237 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46238 X86::CondCode &CC1, SDValue &Flags,
46239 bool &isAnd) {
46240 if (Cond->getOpcode() == X86ISD::CMP) {
46241 if (!isNullConstant(Cond->getOperand(1)))
46242 return false;
46243
46244 Cond = Cond->getOperand(0);
46245 }
46246
46247 isAnd = false;
46248
46249 SDValue SetCC0, SetCC1;
46250 switch (Cond->getOpcode()) {
46251 default: return false;
46252 case ISD::AND:
46253 case X86ISD::AND:
46254 isAnd = true;
46255 [[fallthrough]];
46256 case ISD::OR:
46257 case X86ISD::OR:
46258 SetCC0 = Cond->getOperand(0);
46259 SetCC1 = Cond->getOperand(1);
46260 break;
46261 };
46262
46263 // Make sure we have SETCC nodes, using the same flags value.
46264 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46265 SetCC1.getOpcode() != X86ISD::SETCC ||
46266 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46267 return false;
46268
46269 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46270 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46271 Flags = SetCC0->getOperand(1);
46272 return true;
46273}
46274
46275// When legalizing carry, we create carries via add X, -1
46276// If that comes from an actual carry, via setcc, we use the
46277// carry directly.
46278 static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46279 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46280 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46281 bool FoundAndLSB = false;
46282 SDValue Carry = EFLAGS.getOperand(0);
46283 while (Carry.getOpcode() == ISD::TRUNCATE ||
46284 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46285 (Carry.getOpcode() == ISD::AND &&
46286 isOneConstant(Carry.getOperand(1)))) {
46287 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46288 Carry = Carry.getOperand(0);
46289 }
46290 if (Carry.getOpcode() == X86ISD::SETCC ||
46291 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46292 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46293 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46294 SDValue CarryOp1 = Carry.getOperand(1);
46295 if (CarryCC == X86::COND_B)
46296 return CarryOp1;
46297 if (CarryCC == X86::COND_A) {
46298 // Try to convert COND_A into COND_B in an attempt to facilitate
46299 // materializing "setb reg".
46300 //
46301 // Do not flip "e > c", where "c" is a constant, because the Cmp
46302 // instruction cannot take an immediate as its first operand.
46303 //
46304 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46305 CarryOp1.getNode()->hasOneUse() &&
46306 CarryOp1.getValueType().isInteger() &&
46307 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46308 SDValue SubCommute =
46309 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46310 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46311 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46312 }
46313 }
46314 // If this is a check of the z flag of an add with 1, switch to the
46315 // C flag.
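// This holds because (x + 1) is zero exactly when x was all-ones, which is
// also exactly when adding 1 produces an unsigned carry (informal reasoning).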
46316 if (CarryCC == X86::COND_E &&
46317 CarryOp1.getOpcode() == X86ISD::ADD &&
46318 isOneConstant(CarryOp1.getOperand(1)))
46319 return CarryOp1;
46320 } else if (FoundAndLSB) {
46321 SDLoc DL(Carry);
46322 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46323 if (Carry.getOpcode() == ISD::SRL) {
46324 BitNo = Carry.getOperand(1);
46325 Carry = Carry.getOperand(0);
46326 }
46327 return getBT(Carry, BitNo, DL, DAG);
46328 }
46329 }
46330 }
46331
46332 return SDValue();
46333}
46334
46335/// If we are inverting an PTEST/TESTP operand, attempt to adjust the CC
46336/// to avoid the inversion.
46337 static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46338 SelectionDAG &DAG,
46339 const X86Subtarget &Subtarget) {
46340 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46341 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46342 EFLAGS.getOpcode() != X86ISD::TESTP)
46343 return SDValue();
46344
46345 // PTEST/TESTP sets EFLAGS as:
46346 // TESTZ: ZF = (Op0 & Op1) == 0
46347 // TESTC: CF = (~Op0 & Op1) == 0
46348 // TESTNZC: ZF == 0 && CF == 0
46349 MVT VT = EFLAGS.getSimpleValueType();
46350 SDValue Op0 = EFLAGS.getOperand(0);
46351 SDValue Op1 = EFLAGS.getOperand(1);
46352 MVT OpVT = Op0.getSimpleValueType();
46353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46354
46355 // TEST*(~X,Y) == TEST*(X,Y)
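// For instance, TESTZ(~X,Y) checks (~X & Y) == 0, which is exactly the CF
// that TESTC(X,Y) computes, so stripping the NOT just swaps the Z and C
// condition codes handled below (sketch of the reasoning).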
46356 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46357 X86::CondCode InvCC;
46358 switch (CC) {
46359 case X86::COND_B:
46360 // testc -> testz.
46361 InvCC = X86::COND_E;
46362 break;
46363 case X86::COND_AE:
46364 // !testc -> !testz.
46365 InvCC = X86::COND_NE;
46366 break;
46367 case X86::COND_E:
46368 // testz -> testc.
46369 InvCC = X86::COND_B;
46370 break;
46371 case X86::COND_NE:
46372 // !testz -> !testc.
46373 InvCC = X86::COND_AE;
46374 break;
46375 case X86::COND_A:
46376 case X86::COND_BE:
46377 // testnzc -> testnzc (no change).
46378 InvCC = CC;
46379 break;
46380 default:
46381 InvCC = X86::COND_INVALID;
46382 break;
46383 }
46384
46385 if (InvCC != X86::COND_INVALID) {
46386 CC = InvCC;
46387 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46388 DAG.getBitcast(OpVT, NotOp0), Op1);
46389 }
46390 }
46391
46392 if (CC == X86::COND_B || CC == X86::COND_AE) {
46393 // TESTC(X,~X) == TESTC(X,-1)
46394 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46395 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46396 SDLoc DL(EFLAGS);
46397 return DAG.getNode(
46398 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46399 DAG.getBitcast(OpVT,
46400 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46401 }
46402 }
46403 }
46404
46405 if (CC == X86::COND_E || CC == X86::COND_NE) {
46406 // TESTZ(X,~Y) == TESTC(Y,X)
46407 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46408 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46409 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46410 DAG.getBitcast(OpVT, NotOp1), Op0);
46411 }
46412
46413 if (Op0 == Op1) {
46414 SDValue BC = peekThroughBitcasts(Op0);
46415 EVT BCVT = BC.getValueType();
46416
46417 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46418 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46419 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46420 DAG.getBitcast(OpVT, BC.getOperand(0)),
46421 DAG.getBitcast(OpVT, BC.getOperand(1)));
46422 }
46423
46424 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46425 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46426 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46427 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46428 DAG.getBitcast(OpVT, BC.getOperand(0)),
46429 DAG.getBitcast(OpVT, BC.getOperand(1)));
46430 }
46431
46432 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46433 // to more efficiently extract the sign bits and compare that.
46434 // TODO: Handle TESTC with comparison inversion.
46435 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46436 // TESTP/MOVMSK combines to make sure its never worse than PTEST?
46437 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46438 unsigned EltBits = BCVT.getScalarSizeInBits();
46439 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46440 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46441 APInt SignMask = APInt::getSignMask(EltBits);
46442 if (SDValue Res =
46443 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46444 // For vXi16 cases we need to use pmovmskb and extract every other
46445 // sign bit.
46446 SDLoc DL(EFLAGS);
46447 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46448 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46449 MVT FloatVT =
46450 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46451 Res = DAG.getBitcast(FloatVT, Res);
46452 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46453 } else if (EltBits == 16) {
46454 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46455 Res = DAG.getBitcast(MovmskVT, Res);
46456 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46457 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46458 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46459 } else {
46460 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46461 }
46462 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46463 DAG.getConstant(0, DL, MVT::i32));
46464 }
46465 }
46466 }
46467 }
46468
46469 // TESTZ(-1,X) == TESTZ(X,X)
46470 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46471 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46472
46473 // TESTZ(X,-1) == TESTZ(X,X)
46474 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46475 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46476
46477 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46478 // TODO: Add COND_NE handling?
46479 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46480 SDValue Src0 = peekThroughBitcasts(Op0);
46481 SDValue Src1 = peekThroughBitcasts(Op1);
46482 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46483 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46484 peekThroughBitcasts(Src0.getOperand(1)), true);
46485 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46486 peekThroughBitcasts(Src1.getOperand(1)), true);
46487 if (Src0 && Src1) {
46488 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46489 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46490 DAG.getBitcast(OpVT2, Src0),
46491 DAG.getBitcast(OpVT2, Src1));
46492 }
46493 }
46494 }
46495 }
46496
46497 return SDValue();
46498}
46499
46500// Attempt to simplify the MOVMSK input based on the comparison type.
46501 static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46502 SelectionDAG &DAG,
46503 const X86Subtarget &Subtarget) {
46504 // Handle eq/ne against zero (any_of).
46505 // Handle eq/ne against -1 (all_of).
46506 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46507 return SDValue();
46508 if (EFLAGS.getValueType() != MVT::i32)
46509 return SDValue();
46510 unsigned CmpOpcode = EFLAGS.getOpcode();
46511 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46512 return SDValue();
46513 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46514 if (!CmpConstant)
46515 return SDValue();
46516 const APInt &CmpVal = CmpConstant->getAPIntValue();
46517
46518 SDValue CmpOp = EFLAGS.getOperand(0);
46519 unsigned CmpBits = CmpOp.getValueSizeInBits();
46520 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46521
46522 // Peek through any truncate.
46523 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46524 CmpOp = CmpOp.getOperand(0);
46525
46526 // Bail if we don't find a MOVMSK.
46527 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46528 return SDValue();
46529
46530 SDValue Vec = CmpOp.getOperand(0);
46531 MVT VecVT = Vec.getSimpleValueType();
46532 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46533 "Unexpected MOVMSK operand");
46534 unsigned NumElts = VecVT.getVectorNumElements();
46535 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46536
46537 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46538 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46539 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46540 if (!IsAnyOf && !IsAllOf)
46541 return SDValue();
46542
46543 // TODO: Check whether more combining cases can use this heuristic.
46544 // We use the number of uses of the CMP to decide whether to combine or not.
46545 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" folds
46546 // below are guarded by this one-use constraint.
46547 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46548
46549 // See if we can peek through to a vector with a wider element type, if the
46550 // signbits extend down to all the sub-elements as well.
46551 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46552 // potential SimplifyDemandedBits/Elts cases.
46553 // If we looked through a truncate that discards bits, we can't do this
46554 // transform.
46555 // FIXME: We could do this transform for truncates that discarded bits by
46556 // inserting an AND mask between the new MOVMSK and the CMP.
46557 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46558 SDValue BC = peekThroughBitcasts(Vec);
46559 MVT BCVT = BC.getSimpleValueType();
46560 unsigned BCNumElts = BCVT.getVectorNumElements();
46561 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46562 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46563 BCNumEltBits > NumEltBits &&
46564 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46565 SDLoc DL(EFLAGS);
46566 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46567 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46568 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46569 DAG.getConstant(CmpMask, DL, MVT::i32));
46570 }
46571 }
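// e.g. an all_of test MOVMSK(v16i8 BITCAST(v4i32 X)) == 0xFFFF can become
// MOVMSK(v4i32 X) == 0xF when the top 25 bits of each i32 element are sign
// copies, since then every byte's sign bit equals its element's sign bit.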
46572
46573 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46574 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46575 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46576 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
46577 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46578 SmallVector<SDValue> Ops;
46579 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46580 Ops.size() == 2) {
46581 SDLoc DL(EFLAGS);
46582 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46583 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46584 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46585 DAG.getBitcast(SubVT, Ops[0]),
46586 DAG.getBitcast(SubVT, Ops[1]));
46587 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46588 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46589 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46590 DAG.getConstant(CmpMask, DL, MVT::i32));
46591 }
46592 }
46593
46594 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46595 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46596 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
46597 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
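// PTEST sets ZF when (A & B) == 0, so PTESTZ(V,V) tests V == 0: all elements
// of PCMPEQ(X,Y) are true exactly when XOR(X,Y) is the zero vector.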
46598 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46599 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46600 SDValue BC = peekThroughBitcasts(Vec);
46601 // Ensure MOVMSK was testing every signbit of BC.
46602 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46603 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46604 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
46605 BC.getOperand(0), BC.getOperand(1));
46606 V = DAG.getBitcast(TestVT, V);
46607 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46608 }
46609 // Check for 256-bit split vector cases.
46610 if (BC.getOpcode() == ISD::AND &&
46611 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46612 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46613 SDValue LHS = BC.getOperand(0);
46614 SDValue RHS = BC.getOperand(1);
46615 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
46616 LHS.getOperand(0), LHS.getOperand(1));
46617 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
46618 RHS.getOperand(0), RHS.getOperand(1));
46619 LHS = DAG.getBitcast(TestVT, LHS);
46620 RHS = DAG.getBitcast(TestVT, RHS);
46621 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46622 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46623 }
46624 }
46625 }
46626
46627 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46628 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46629 // sign bits prior to the comparison with zero unless we know that
46630 // the vXi16 splats the sign bit down to the lower i8 half.
46631 // TODO: Handle all_of patterns.
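// (In the vXi16 -> v2Xi8 bitcasts below, each i16 sign bit lands in an odd
// byte, which is why the any_of paths mask the PMOVMSKB result with
// 0xAAAA / 0xAAAAAAAA unless the i16 sign bit is known to be splatted into the
// low byte as well.)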
46632 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46633 SDValue VecOp0 = Vec.getOperand(0);
46634 SDValue VecOp1 = Vec.getOperand(1);
46635 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46636 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46637 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46638 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46639 SDLoc DL(EFLAGS);
46640 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46641 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46642 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46643 if (!SignExt0) {
46644 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46645 DAG.getConstant(0xAAAA, DL, MVT::i16));
46646 }
46647 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46648 DAG.getConstant(0, DL, MVT::i16));
46649 }
46650 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46651 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46652 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46653 (IsAnyOf || (SignExt0 && SignExt1))) {
46654 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46655 SDLoc DL(EFLAGS);
46656 SDValue Result = peekThroughBitcasts(Src);
46657 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46658 Result.getValueType().getVectorNumElements() <= NumElts) {
46659 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
46660 Result.getOperand(0), Result.getOperand(1));
46661 V = DAG.getBitcast(MVT::v4i64, V);
46662 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46663 }
46664 Result = DAG.getBitcast(MVT::v32i8, Result);
46665 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46666 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46667 if (!SignExt0 || !SignExt1) {
46668 assert(IsAnyOf &&
46669 "Only perform v16i16 signmasks for any_of patterns");
46670 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46671 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46672 }
46673 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46674 DAG.getConstant(CmpMask, DL, MVT::i32));
46675 }
46676 }
46677 }
46678
46679 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46680 // Since we peek through a bitcast, we need to be careful if the base vector
46681 // type has smaller elements than the MOVMSK type. In that case, even if
46682 // all the elements are demanded by the shuffle mask, only the "high"
46683 // elements which have highbits that align with highbits in the MOVMSK vec
46684 // elements are actually demanded. A simplification of spurious operations
46685 // on the "low" elements takes place during other simplifications.
46686 //
46687 // For example:
46688 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): even though all the elements are
46689 // demanded, the result can change because the elements are swapped around.
46690 //
46691 // To address this, we check that we can scale the shuffle mask to MOVMSK
46692 // element width (this will ensure the "high" elements match). It's slightly
46693 // overly conservative, but fine for an edge case fold.
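// Note that the any_of/all_of comparisons themselves are permutation
// invariant: reordering which lane feeds which mask bit cannot change whether
// the mask is zero or all-ones.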
46694 SmallVector<int, 32> ShuffleMask;
46695 SmallVector<SDValue, 2> ShuffleInputs;
46696 if (NumElts <= CmpBits &&
46697 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46698 ShuffleMask, DAG) &&
46699 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
46700 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
46701 canScaleShuffleElements(ShuffleMask, NumElts)) {
46702 SDLoc DL(EFLAGS);
46703 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46704 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46705 Result =
46706 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46707 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
46708 }
46709
46710 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
46711 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
46712 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
46713 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
46714 // iff every element is referenced.
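// TESTP sets ZF from the sign bits of (LHS & RHS) and CF from (~LHS & RHS);
// with RHS set to all-ones, CF is set iff every sign bit of LHS is set, which
// is why the all_of case below remaps COND_E/COND_NE to COND_B/COND_AE.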
46715 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
46716 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
46717 (NumEltBits == 32 || NumEltBits == 64)) {
46718 SDLoc DL(EFLAGS);
46719 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
46720 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
46721 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
46722 SDValue LHS = Vec;
46723 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
46724 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46725 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
46726 DAG.getBitcast(FloatVT, LHS),
46727 DAG.getBitcast(FloatVT, RHS));
46728 }
46729
46730 return SDValue();
46731}
46732
46733/// Optimize an EFLAGS definition used according to the condition code \p CC
46734/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46735/// uses of chain values.
46736 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46737 SelectionDAG &DAG,
46738 const X86Subtarget &Subtarget) {
46739 if (CC == X86::COND_B)
46740 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46741 return Flags;
46742
46743 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46744 return R;
46745
46746 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46747 return R;
46748
46749 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46750 return R;
46751
46752 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46753}
46754
46755/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46756 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46757 TargetLowering::DAGCombinerInfo &DCI,
46758 const X86Subtarget &Subtarget) {
46759 SDLoc DL(N);
46760
46761 SDValue FalseOp = N->getOperand(0);
46762 SDValue TrueOp = N->getOperand(1);
46763 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46764 SDValue Cond = N->getOperand(3);
46765
46766 // cmov X, X, ?, ? --> X
46767 if (TrueOp == FalseOp)
46768 return TrueOp;
46769
46770 // Try to simplify the EFLAGS and condition code operands.
46771 // We can't always do this as FCMOV only supports a subset of X86 cond.
46772 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46773 if (!(FalseOp.getValueType() == MVT::f80 ||
46774 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46775 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46776 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46777 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46778 Flags};
46779 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46780 }
46781 }
46782
46783 // If this is a select between two integer constants, try to do some
46784 // optimizations. Note that the operands are ordered the opposite of SELECT
46785 // operands.
46786 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46787 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46788 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46789 // larger than FalseC (the false value).
46790 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46791 CC = X86::GetOppositeBranchCondition(CC);
46792 std::swap(TrueC, FalseC);
46793 std::swap(TrueOp, FalseOp);
46794 }
46795
46796 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46797 // This is efficient for any integer data type (including i8/i16) and
46798 // shift amount.
46799 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46800 Cond = getSETCC(CC, Cond, DL, DAG);
46801
46802 // Zero extend the condition if needed.
46803 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46804
46805 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46806 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46807 DAG.getConstant(ShAmt, DL, MVT::i8));
46808 return Cond;
46809 }
46810
46811 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
46812 // for any integer data type, including i8/i16.
46813 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46814 Cond = getSETCC(CC, Cond, DL, DAG);
46815
46816 // Zero extend the condition if needed.
46817 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46818 FalseC->getValueType(0), Cond);
46819 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46820 SDValue(FalseC, 0));
46821 return Cond;
46822 }
46823
46824 // Optimize cases that will turn into an LEA instruction. This requires
46825 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
46826 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46827 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46828 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46829 "Implicit constant truncation");
46830
46831 bool isFastMultiplier = false;
46832 if (Diff.ult(10)) {
46833 switch (Diff.getZExtValue()) {
46834 default: break;
46835 case 1: // result = add base, cond
46836 case 2: // result = lea base( , cond*2)
46837 case 3: // result = lea base(cond, cond*2)
46838 case 4: // result = lea base( , cond*4)
46839 case 5: // result = lea base(cond, cond*4)
46840 case 8: // result = lea base( , cond*8)
46841 case 9: // result = lea base(cond, cond*8)
46842 isFastMultiplier = true;
46843 break;
46844 }
46845 }
46846
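// e.g. cond ? 5 : 2 gives Diff == 3, producing zext(setcc) * 3 + 2 which
// selects to a single LEA (index + index*2 + disp 2).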
46847 if (isFastMultiplier) {
46848 Cond = getSETCC(CC, Cond, DL, DAG);
46849 // Zero extend the condition if needed.
46850 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46851 Cond);
46852 // Scale the condition by the difference.
46853 if (Diff != 1)
46854 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46855 DAG.getConstant(Diff, DL, Cond.getValueType()));
46856
46857 // Add the base if non-zero.
46858 if (FalseC->getAPIntValue() != 0)
46859 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46860 SDValue(FalseC, 0));
46861 return Cond;
46862 }
46863 }
46864 }
46865 }
46866
46867 // Handle these cases:
46868 // (select (x != c), e, c) -> (select (x != c), e, x),
46869 // (select (x == c), c, e) -> (select (x == c), x, e)
46870 // where c is an integer constant, and the "select" is the combination
46871 // of CMOV and CMP.
46872 //
46873 // The rationale for this change is that a conditional-move from a constant
46874 // needs two instructions, whereas a conditional-move from a register needs
46875 // only one instruction.
46876 //
46877 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
46878 // some instruction-combining opportunities. This opt needs to be
46879 // postponed as late as possible.
46880 //
46881 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46882 // the DCI.xxxx conditions are provided to postpone the optimization as
46883 // late as possible.
46884
46885 ConstantSDNode *CmpAgainst = nullptr;
46886 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46887 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46888 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46889
46890 if (CC == X86::COND_NE &&
46891 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46892 CC = X86::COND_E;
46893 std::swap(TrueOp, FalseOp);
46894 }
46895
46896 if (CC == X86::COND_E &&
46897 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46898 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46899 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46900 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46901 }
46902 }
46903 }
46904
46905 // Transform:
46906 //
46907 // (cmov 1 T (uge T 2))
46908 //
46909 // to:
46910 //
46911 // (adc T 0 (sub T 1))
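// SUB T, 1 sets CF only when T == 0, so ADC T, 0, CF computes T + (T == 0):
// 1 when T is 0 or 1, and T otherwise, matching what the CMOV selects.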
46912 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46913 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46914 SDValue Cond0 = Cond.getOperand(0);
46915 if (Cond0.getOpcode() == ISD::TRUNCATE)
46916 Cond0 = Cond0.getOperand(0);
46917 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46918 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46919 EVT CondVT = Cond->getValueType(0);
46920 EVT OuterVT = N->getValueType(0);
46921 // Subtract 1 and generate a carry.
46922 SDValue NewSub =
46923 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46924 DAG.getConstant(1, DL, CondVT));
46925 SDValue EFLAGS(NewSub.getNode(), 1);
46926 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46927 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46928 }
46929 }
46930
46931 // Fold and/or of setcc's to double CMOV:
46932 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46933 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46934 //
46935 // This combine lets us generate:
46936 // cmovcc1 (jcc1 if we don't have CMOV)
46937 // cmovcc2 (same)
46938 // instead of:
46939 // setcc1
46940 // setcc2
46941 // and/or
46942 // cmovne (jne if we don't have CMOV)
46943 // When we can't use the CMOV instruction, it might increase branch
46944 // mispredicts.
46945 // When we can use CMOV, or when there is no mispredict, this improves
46946 // throughput and reduces register pressure.
46947 //
46948 if (CC == X86::COND_NE) {
46949 SDValue Flags;
46950 X86::CondCode CC0, CC1;
46951 bool isAndSetCC;
46952 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46953 if (isAndSetCC) {
46954 std::swap(FalseOp, TrueOp);
46955 CC0 = X86::GetOppositeBranchCondition(CC0);
46956 CC1 = X86::GetOppositeBranchCondition(CC1);
46957 }
46958
46959 SDValue LOps[] = {FalseOp, TrueOp,
46960 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46961 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46962 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46963 Flags};
46964 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46965 return CMOV;
46966 }
46967 }
46968
46969 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46970 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46971 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46972 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
46973 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46974 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46975 SDValue Add = TrueOp;
46976 SDValue Const = FalseOp;
46977 // Canonicalize the condition code for easier matching and output.
46978 if (CC == X86::COND_E)
46979 std::swap(Add, Const);
46980
46981 // We might have replaced the constant in the cmov with the LHS of the
46982 // compare. If so change it to the RHS of the compare.
46983 if (Const == Cond.getOperand(0))
46984 Const = Cond.getOperand(1);
46985
46986 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46987 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46988 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46989 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46990 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46991 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46992 EVT VT = N->getValueType(0);
46993 // This should constant fold.
46994 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46995 SDValue CMov =
46996 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46997 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46998 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46999 }
47000 }
47001
47002 return SDValue();
47003}
47004
47005/// Different mul shrinking modes.
47006 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47007
47008 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47009 EVT VT = N->getOperand(0).getValueType();
47010 if (VT.getScalarSizeInBits() != 32)
47011 return false;
47012
47013 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47014 unsigned SignBits[2] = {1, 1};
47015 bool IsPositive[2] = {false, false};
47016 for (unsigned i = 0; i < 2; i++) {
47017 SDValue Opd = N->getOperand(i);
47018
47019 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47020 IsPositive[i] = DAG.SignBitIsZero(Opd);
47021 }
47022
47023 bool AllPositive = IsPositive[0] && IsPositive[1];
47024 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
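// For 32-bit operands, >= 25 sign bits means the value fits in a signed i8 and
// >= 17 sign bits means it fits in a signed i16; the unsigned modes also need
// a clear sign bit so the value fits in 8/16 unsigned bits.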
47025 // When ranges are from -128 ~ 127, use MULS8 mode.
47026 if (MinSignBits >= 25)
47027 Mode = ShrinkMode::MULS8;
47028 // When ranges are from 0 ~ 255, use MULU8 mode.
47029 else if (AllPositive && MinSignBits >= 24)
47030 Mode = ShrinkMode::MULU8;
47031 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47032 else if (MinSignBits >= 17)
47033 Mode = ShrinkMode::MULS16;
47034 // When ranges are from 0 ~ 65535, use MULU16 mode.
47035 else if (AllPositive && MinSignBits >= 16)
47036 Mode = ShrinkMode::MULU16;
47037 else
47038 return false;
47039 return true;
47040}
47041
47042/// When the operands of vector mul are extended from smaller size values,
47043 /// like i8 and i16, the type of mul may be shrunk to generate more
47044/// efficient code. Two typical patterns are handled:
47045/// Pattern1:
47046/// %2 = sext/zext <N x i8> %1 to <N x i32>
47047/// %4 = sext/zext <N x i8> %3 to <N x i32>
47048 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47049/// %5 = mul <N x i32> %2, %4
47050///
47051/// Pattern2:
47052/// %2 = zext/sext <N x i16> %1 to <N x i32>
47053/// %4 = zext/sext <N x i16> %3 to <N x i32>
47054/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47055/// %5 = mul <N x i32> %2, %4
47056///
47057/// There are four mul shrinking modes:
47058/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47059 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47060/// generate pmullw+sext32 for it (MULS8 mode).
47061/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47062/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47063/// generate pmullw+zext32 for it (MULU8 mode).
47064/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47065/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47066/// generate pmullw+pmulhw for it (MULS16 mode).
47067/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47068/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47069/// generate pmullw+pmulhuw for it (MULU16 mode).
47070 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47071 const X86Subtarget &Subtarget) {
47072 // Check for legality
47073 // pmullw/pmulhw require SSE2; they are not available with SSE1-only targets.
47074 if (!Subtarget.hasSSE2())
47075 return SDValue();
47076
47077 // Check for profitability
47078 // pmulld is supported since SSE41. It is better to use pmulld
47079 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47080 // the expansion.
47081 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47082 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47083 return SDValue();
47084
47085 ShrinkMode Mode;
47086 if (!canReduceVMulWidth(N, DAG, Mode))
47087 return SDValue();
47088
47089 SDValue N0 = N->getOperand(0);
47090 SDValue N1 = N->getOperand(1);
47091 EVT VT = N->getOperand(0).getValueType();
47092 unsigned NumElts = VT.getVectorNumElements();
47093 if ((NumElts % 2) != 0)
47094 return SDValue();
47095
47096 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47097
47098 // Shrink the operands of mul.
47099 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47100 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47101
47102 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47103 // lower part is needed.
47104 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47105 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47106 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47107 : ISD::SIGN_EXTEND,
47108 DL, VT, MulLo);
47109
47110 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47111 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47112 // the higher part is also needed.
47113 SDValue MulHi =
47114 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47115 ReducedVT, NewN0, NewN1);
47116
47117 // Repack the lower part and higher part result of mul into a wider
47118 // result.
47119 // Generate shuffle functioning as punpcklwd.
47120 SmallVector<int, 16> ShuffleMask(NumElts);
47121 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47122 ShuffleMask[2 * i] = i;
47123 ShuffleMask[2 * i + 1] = i + NumElts;
47124 }
47125 SDValue ResLo =
47126 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47127 ResLo = DAG.getBitcast(ResVT, ResLo);
47128 // Generate shuffle functioning as punpckhwd.
47129 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47130 ShuffleMask[2 * i] = i + NumElts / 2;
47131 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47132 }
47133 SDValue ResHi =
47134 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47135 ResHi = DAG.getBitcast(ResVT, ResHi);
47136 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47137}
47138
47139 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47140 EVT VT, const SDLoc &DL) {
47141
47142 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47143 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47144 DAG.getConstant(Mult, DL, VT));
47145 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47146 DAG.getConstant(Shift, DL, MVT::i8));
47147 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47148 N->getOperand(0));
47149 return Result;
47150 };
47151
47152 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47153 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47154 DAG.getConstant(Mul1, DL, VT));
47155 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47156 DAG.getConstant(Mul2, DL, VT));
47157 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47158 N->getOperand(0));
47159 return Result;
47160 };
47161
47162 switch (MulAmt) {
47163 default:
47164 break;
47165 case 11:
47166 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47167 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47168 case 21:
47169 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47170 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47171 case 41:
47172 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47173 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47174 case 22:
47175 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47176 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47177 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47178 case 19:
47179 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47180 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47181 case 37:
47182 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47183 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47184 case 73:
47185 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47186 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47187 case 13:
47188 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47189 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47190 case 23:
47191 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47192 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47193 case 26:
47194 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47195 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47196 case 28:
47197 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47198 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47199 case 29:
47200 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47201 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47202 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47203 }
47204
47205 // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
47206 // followed by a single LEA.
47207 // First check if this is a sum of two powers of 2 because that's easy. Then
47208 // count the trailing zeros up to the first set bit.
47209 // TODO: We can do this even without LEA at a cost of two shifts and an add.
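// e.g. mul x, 20: MulAmt & (MulAmt - 1) == 16 and countr_zero(20) == 2, giving
// (x << 4) + (x << 2), where the second term folds into the LEA index scale.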
47210 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47211 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47212 if (ScaleShift >= 1 && ScaleShift < 4) {
47213 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47214 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47215 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47216 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47217 DAG.getConstant(ScaleShift, DL, MVT::i8));
47218 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47219 }
47220 }
47221
47222 return SDValue();
47223}
47224
47225 // If the upper 17 bits of one operand's elements are zero and the other
47226 // operand's elements are all sign/zero extended from i16, then we can use
47227 // PMADDWD, which is always at least as quick as PMULLD, except on KNL.
47228 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
47229 SelectionDAG &DAG,
47230 const X86Subtarget &Subtarget) {
47231 if (!Subtarget.hasSSE2())
47232 return SDValue();
47233
47234 if (Subtarget.isPMADDWDSlow())
47235 return SDValue();
47236
47237 EVT VT = N->getValueType(0);
47238
47239 // Only support vXi32 vectors.
47240 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47241 return SDValue();
47242
47243 // Make sure the type is legal or can split/widen to a legal type.
47244 // With AVX512 but without BWI, we would need to split v32i16.
47245 unsigned NumElts = VT.getVectorNumElements();
47246 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47247 return SDValue();
47248
47249 // With AVX512 but without BWI, we would need to split v32i16.
47250 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47251 return SDValue();
47252
47253 SDValue N0 = N->getOperand(0);
47254 SDValue N1 = N->getOperand(1);
47255
47256 // If we are zero/sign extending two steps without SSE4.1, it's better to
47257 // reduce the vmul width instead.
47258 if (!Subtarget.hasSSE41() &&
47259 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47260 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47261 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47262 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47263 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47264 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47265 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47266 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47267 return SDValue();
47268
47269 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47270 // the vmul width instead.
47271 if (!Subtarget.hasSSE41() &&
47272 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47273 N0.getOperand(0).getValueSizeInBits() > 128) &&
47274 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47275 N1.getOperand(0).getValueSizeInBits() > 128))
47276 return SDValue();
47277
47278 // Sign bits must extend down to the lowest i16.
47279 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47280 DAG.ComputeMaxSignificantBits(N0) > 16)
47281 return SDValue();
47282
47283 // At least one of the elements must be zero in the upper 17 bits, or can be
47284 // safely made zero without altering the final result.
47285 auto GetZeroableOp = [&](SDValue Op) {
47286 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47287 if (DAG.MaskedValueIsZero(Op, Mask17))
47288 return Op;
47289 // Mask off upper 16-bits of sign-extended constants.
47290 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47291 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
47292 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47293 SDValue Src = Op.getOperand(0);
47294 // Convert sext(vXi16) to zext(vXi16).
47295 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47296 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47297 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47298 // which will expand the extension.
47299 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47300 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47301 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
47302 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47303 }
47304 }
47305 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47306 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47307 N->isOnlyUserOf(Op.getNode())) {
47308 SDValue Src = Op.getOperand(0);
47309 if (Src.getScalarValueSizeInBits() == 16)
47310 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
47311 }
47312 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47313 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47314 N->isOnlyUserOf(Op.getNode())) {
47315 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
47316 Op.getOperand(1));
47317 }
47318 return SDValue();
47319 };
47320 SDValue ZeroN0 = GetZeroableOp(N0);
47321 SDValue ZeroN1 = GetZeroableOp(N1);
47322 if (!ZeroN0 && !ZeroN1)
47323 return SDValue();
47324 N0 = ZeroN0 ? ZeroN0 : N0;
47325 N1 = ZeroN1 ? ZeroN1 : N1;
47326
47327 // Use SplitOpsAndApply to handle AVX splitting.
47328 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47329 ArrayRef<SDValue> Ops) {
47330 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47331 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47332 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47333 DAG.getBitcast(OpVT, Ops[0]),
47334 DAG.getBitcast(OpVT, Ops[1]));
47335 };
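// VPMADDWD computes lo16(a)*lo16(b) + hi16(a)*hi16(b) per i32 lane; with one
// operand's upper 17 bits known zero and the other sign-extended from i16, the
// hi16 product is zero and the lo16 product equals the full i32 multiply.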
47336 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
47337}
47338
47339 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47340 const X86Subtarget &Subtarget) {
47341 if (!Subtarget.hasSSE2())
47342 return SDValue();
47343
47344 EVT VT = N->getValueType(0);
47345
47346 // Only support vXi64 vectors.
47347 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47348 VT.getVectorNumElements() < 2 ||
47349 !isPowerOf2_32(VT.getVectorNumElements()))
47350 return SDValue();
47351
47352 SDValue N0 = N->getOperand(0);
47353 SDValue N1 = N->getOperand(1);
47354
47355 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47356 // 32 bits. We can lower with this if the sign bits stretch that far.
47357 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47358 DAG.ComputeNumSignBits(N1) > 32) {
47359 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47360 ArrayRef<SDValue> Ops) {
47361 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47362 };
47363 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
47364 /*CheckBWI*/ false);
47365 }
47366
47367 // If the upper bits are zero we can use a single pmuludq.
47368 APInt Mask = APInt::getHighBitsSet(64, 32);
47369 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47370 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47371 ArrayRef<SDValue> Ops) {
47372 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47373 };
47374 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
47375 /*CheckBWI*/ false);
47376 }
47377
47378 return SDValue();
47379}
47380
47381 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47382 TargetLowering::DAGCombinerInfo &DCI,
47383 const X86Subtarget &Subtarget) {
47384 EVT VT = N->getValueType(0);
47385 SDLoc DL(N);
47386
47387 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
47388 return V;
47389
47390 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
47391 return V;
47392
47393 if (DCI.isBeforeLegalize() && VT.isVector())
47394 return reduceVMULWidth(N, DL, DAG, Subtarget);
47395
47396 // Optimize a single multiply with constant into two operations in order to
47397 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47398 if (!MulConstantOptimization)
47399 return SDValue();
47400
47401 // An imul is usually smaller than the alternative sequence.
47402 if (DAG.getMachineFunction().getFunction().hasMinSize())
47403 return SDValue();
47404
47405 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47406 return SDValue();
47407
47408 if (VT != MVT::i64 && VT != MVT::i32 &&
47409 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47410 return SDValue();
47411
47412 ConstantSDNode *CNode = isConstOrConstSplat(
47413 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47414 const APInt *C = nullptr;
47415 if (!CNode) {
47416 if (VT.isVector())
47417 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47418 if (auto *SplatC = RawC->getSplatValue())
47419 C = &(SplatC->getUniqueInteger());
47420
47421 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47422 return SDValue();
47423 } else {
47424 C = &(CNode->getAPIntValue());
47425 }
47426
47427 if (isPowerOf2_64(C->getZExtValue()))
47428 return SDValue();
47429
47430 int64_t SignMulAmt = C->getSExtValue();
47431 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47432 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47433
47434 SDValue NewMul = SDValue();
47435 if (VT == MVT::i64 || VT == MVT::i32) {
47436 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47437 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47438 DAG.getConstant(AbsMulAmt, DL, VT));
47439 if (SignMulAmt < 0)
47440 NewMul = DAG.getNegative(NewMul, DL, VT);
47441
47442 return NewMul;
47443 }
47444
47445 uint64_t MulAmt1 = 0;
47446 uint64_t MulAmt2 = 0;
47447 if ((AbsMulAmt % 9) == 0) {
47448 MulAmt1 = 9;
47449 MulAmt2 = AbsMulAmt / 9;
47450 } else if ((AbsMulAmt % 5) == 0) {
47451 MulAmt1 = 5;
47452 MulAmt2 = AbsMulAmt / 5;
47453 } else if ((AbsMulAmt % 3) == 0) {
47454 MulAmt1 = 3;
47455 MulAmt2 = AbsMulAmt / 3;
47456 }
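// e.g. 45 = 9 * 5 becomes two LEA-style multiplies, while 40 = 5 * 8 becomes
// an LEA-style multiply plus a shift.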
47457
47458 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47459 if (MulAmt2 &&
47460 (isPowerOf2_64(MulAmt2) ||
47461 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47462
47463 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47464 N->use_begin()->getOpcode() == ISD::ADD))
47465 // If the second multiplier is pow2, issue it first. We want the multiply
47466 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47467 // use is an add. Only do this for positive multiply amounts since the
47468 // negate would prevent it from being used as an address mode anyway.
47469 std::swap(MulAmt1, MulAmt2);
47470
47471 if (isPowerOf2_64(MulAmt1))
47472 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47473 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47474 else
47475 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47476 DAG.getConstant(MulAmt1, DL, VT));
47477
47478 if (isPowerOf2_64(MulAmt2))
47479 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47480 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47481 else
47482 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47483 DAG.getConstant(MulAmt2, DL, VT));
47484
47485 // Negate the result.
47486 if (SignMulAmt < 0)
47487 NewMul = DAG.getNegative(NewMul, DL, VT);
47488 } else if (!Subtarget.slowLEA())
47489 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47490 }
47491 if (!NewMul) {
47492 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47493 assert(C->getZExtValue() != 0 &&
47494 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47495 "Both cases that could cause potential overflows should have "
47496 "already been handled.");
47497 if (isPowerOf2_64(AbsMulAmt - 1)) {
47498 // (mul x, 2^N + 1) => (add (shl x, N), x)
47499 NewMul = DAG.getNode(
47500 ISD::ADD, DL, VT, N->getOperand(0),
47501 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47502 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47503 if (SignMulAmt < 0)
47504 NewMul = DAG.getNegative(NewMul, DL, VT);
47505 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47506 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47507 NewMul =
47508 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47509 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47510 // To negate, reverse the operands of the subtract.
47511 if (SignMulAmt < 0)
47512 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47513 else
47514 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47515 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47516 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47517 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47518 NewMul =
47519 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47520 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47521 NewMul = DAG.getNode(
47522 ISD::ADD, DL, VT, NewMul,
47523 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47524 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47525 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47526 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47527 NewMul =
47528 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47529 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
47530 NewMul = DAG.getNode(
47531 ISD::SUB, DL, VT, NewMul,
47532 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47533 } else if (SignMulAmt >= 0 && VT.isVector() &&
47534 Subtarget.fastImmVectorShift()) {
47535 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
47536 uint64_t ShiftAmt1;
47537 std::optional<unsigned> Opc;
47538 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
47539 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
47540 Opc = ISD::ADD;
47541 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
47542 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
47543 Opc = ISD::SUB;
47544 }
47545
47546 if (Opc) {
47547 SDValue Shift1 =
47548 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47549 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
47550 SDValue Shift2 =
47551 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47552 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
47553 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
47554 }
47555 }
47556 }
47557
47558 return NewMul;
47559}
47560
47561// Try to form a MULHU or MULHS node by looking for
47562// (srl (mul ext, ext), 16)
47563// TODO: This is X86 specific because we want to be able to handle wide types
47564// before type legalization. But we can only do it if the vector will be
47565// legalized via widening/splitting. Type legalization can't handle promotion
47566// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47567// combiner.
47569 const X86Subtarget &Subtarget) {
47570 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47571 "SRL or SRA node is required here!");
47572 SDLoc DL(N);
47573
47574 if (!Subtarget.hasSSE2())
47575 return SDValue();
47576
47577 // The operation feeding into the shift must be a multiply.
47578 SDValue ShiftOperand = N->getOperand(0);
47579 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47580 return SDValue();
47581
47582 // Input type should be at least vXi32.
47583 EVT VT = N->getValueType(0);
47584 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47585 return SDValue();
47586
47587 // Need a shift by 16.
47588 APInt ShiftAmt;
47589 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47590 ShiftAmt != 16)
47591 return SDValue();
47592
47593 SDValue LHS = ShiftOperand.getOperand(0);
47594 SDValue RHS = ShiftOperand.getOperand(1);
47595
47596 unsigned ExtOpc = LHS.getOpcode();
47597 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47598 RHS.getOpcode() != ExtOpc)
47599 return SDValue();
47600
47601 // Peek through the extends.
47602 LHS = LHS.getOperand(0);
47603 RHS = RHS.getOperand(0);
47604
47605 // Ensure the input types match.
47606 EVT MulVT = LHS.getValueType();
47607 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47608 return SDValue();
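// The upper 16 bits of the widened product of two [zs]ext'd vXi16 values are
// exactly MULHU/MULHS of the original vXi16 inputs, so the multiply and shift
// collapse to a single high-multiply followed by an extend.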
47609
47610 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47611 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47612
47613 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47614 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47615}
47616
47617 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47618 SDValue N0 = N->getOperand(0);
47619 SDValue N1 = N->getOperand(1);
47620 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47621 EVT VT = N0.getValueType();
47622
47623 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47624 // since the result of setcc_c is all zero's or all ones.
47625 if (VT.isInteger() && !VT.isVector() &&
47626 N1C && N0.getOpcode() == ISD::AND &&
47627 N0.getOperand(1).getOpcode() == ISD::Constant) {
47628 SDValue N00 = N0.getOperand(0);
47629 APInt Mask = N0.getConstantOperandAPInt(1);
47630 Mask <<= N1C->getAPIntValue();
47631 bool MaskOK = false;
47632 // We can handle cases concerning bit-widening nodes containing setcc_c if
47633 // we carefully interrogate the mask to make sure we are preserving the
47634 // semantics.
47635 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47636 // of the underlying setcc_c operation if the setcc_c was zero extended.
47637 // Consider the following example:
47638 // zext(setcc_c) -> i32 0x0000FFFF
47639 // c1 -> i32 0x0000FFFF
47640 // c2 -> i32 0x00000001
47641 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47642 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47643 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47644 MaskOK = true;
47645 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47646 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47647 MaskOK = true;
47648 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47649 N00.getOpcode() == ISD::ANY_EXTEND) &&
47650 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47651 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47652 }
47653 if (MaskOK && Mask != 0) {
47654 SDLoc DL(N);
47655 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47656 }
47657 }
47658
47659 return SDValue();
47660}
47661
47662 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47663 const X86Subtarget &Subtarget) {
47664 SDValue N0 = N->getOperand(0);
47665 SDValue N1 = N->getOperand(1);
47666 EVT VT = N0.getValueType();
47667 unsigned Size = VT.getSizeInBits();
47668
47669 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47670 return V;
47671
47672 APInt ShiftAmt;
47673 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
47674 N1.getOpcode() == ISD::UMIN &&
47675 ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
47676 ShiftAmt == VT.getScalarSizeInBits() - 1) {
47677 SDValue ShrAmtVal = N1.getOperand(0);
47678 SDLoc DL(N);
47679 return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
47680 }
47681
47682 // fold (SRA (SHL X, ShlConst), SraConst)
47683 // into (SHL (sext_in_reg X), ShlConst - SraConst)
47684 // or (sext_in_reg X)
47685 // or (SRA (sext_in_reg X), SraConst - ShlConst)
47686 // depending on relation between SraConst and ShlConst.
47687 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
47688 // us to do the sext_in_reg from the corresponding bit.
47689
47690 // sexts in X86 are MOVs. The MOVs have the same code size
47691 // as the above SHIFTs (only a shift by 1 has smaller code size).
47692 // However the MOVs have 2 advantages over a SHIFT:
47693 // 1. MOVs can write to a register that differs from the source.
47694 // 2. MOVs accept memory operands.
47695
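// e.g. for i32: (sra (shl x, 24), 25) -> (sra (sext_in_reg x, i8), 1), and
// (sra (shl x, 24), 24) -> (sext_in_reg x, i8).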
47696 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47697 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47698 N0.getOperand(1).getOpcode() != ISD::Constant)
47699 return SDValue();
47700
47701 SDValue N00 = N0.getOperand(0);
47702 SDValue N01 = N0.getOperand(1);
47703 APInt ShlConst = N01->getAsAPIntVal();
47704 APInt SraConst = N1->getAsAPIntVal();
47705 EVT CVT = N1.getValueType();
47706
47707 if (CVT != N01.getValueType())
47708 return SDValue();
47709 if (SraConst.isNegative())
47710 return SDValue();
47711
47712 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47713 unsigned ShiftSize = SVT.getSizeInBits();
47714 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
47715 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47716 continue;
47717 SDLoc DL(N);
47718 SDValue NN =
47719 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47720 if (SraConst.eq(ShlConst))
47721 return NN;
47722 if (SraConst.ult(ShlConst))
47723 return DAG.getNode(ISD::SHL, DL, VT, NN,
47724 DAG.getConstant(ShlConst - SraConst, DL, CVT));
47725 return DAG.getNode(ISD::SRA, DL, VT, NN,
47726 DAG.getConstant(SraConst - ShlConst, DL, CVT));
47727 }
47728 return SDValue();
47729}
47730
47731 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47732 TargetLowering::DAGCombinerInfo &DCI,
47733 const X86Subtarget &Subtarget) {
47734 SDValue N0 = N->getOperand(0);
47735 SDValue N1 = N->getOperand(1);
47736 EVT VT = N0.getValueType();
47737
47738 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47739 return V;
47740
47741 // Only do this on the last DAG combine as it can interfere with other
47742 // combines.
47743 if (!DCI.isAfterLegalizeDAG())
47744 return SDValue();
47745
47746 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47747 // TODO: This is a generic DAG combine that became an x86-only combine to
47748 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47749 // and-not ('andn').
47750 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47751 return SDValue();
47752
47753 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47754 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47755 if (!ShiftC || !AndC)
47756 return SDValue();
47757
47758 // If we can shrink the constant mask below 8-bits or 32-bits, then this
47759 // transform should reduce code size. It may also enable secondary transforms
47760 // from improved known-bits analysis or instruction selection.
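// e.g. (srl (and X, 0xFF00), 8) -> (and (srl X, 8), 0xFF): the shifted mask
// fits in 8 bits and therefore encodes more compactly.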
47761 APInt MaskVal = AndC->getAPIntValue();
47762
47763 // If this can be matched by a zero extend, don't optimize.
47764 if (MaskVal.isMask()) {
47765 unsigned TO = MaskVal.countr_one();
47766 if (TO >= 8 && isPowerOf2_32(TO))
47767 return SDValue();
47768 }
47769
47770 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47771 unsigned OldMaskSize = MaskVal.getSignificantBits();
47772 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
47773 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47774 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47775 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47776 SDLoc DL(N);
47777 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47778 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47779 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47780 }
47781 return SDValue();
47782}
47783
47784 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47785 const X86Subtarget &Subtarget) {
47786 unsigned Opcode = N->getOpcode();
47787 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47788
47789 SDLoc DL(N);
47790 EVT VT = N->getValueType(0);
47791 SDValue N0 = N->getOperand(0);
47792 SDValue N1 = N->getOperand(1);
47793 EVT SrcVT = N0.getValueType();
47794
47795 SDValue BC0 =
47796 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47797 SDValue BC1 =
47798 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47799
47800 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47801 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
47802 // truncation trees that help us avoid lane crossing shuffles.
47803 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47804 // TODO: We don't handle vXf64 shuffles yet.
47805 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47806 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47807 SmallVector<SDValue> ShuffleOps;
47808 SmallVector<int> ShuffleMask, ScaledMask;
47809 SDValue Vec = peekThroughBitcasts(BCSrc);
47810 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47811 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47812 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47813 // shuffle to a v4X64 width - we can probably relax this in the future.
47814 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47815 ShuffleOps[0].getValueType().is256BitVector() &&
47816 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47817 SDValue Lo, Hi;
47818 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47819 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47820 Lo = DAG.getBitcast(SrcVT, Lo);
47821 Hi = DAG.getBitcast(SrcVT, Hi);
47822 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47823 Res = DAG.getBitcast(ShufVT, Res);
47824 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47825 return DAG.getBitcast(VT, Res);
47826 }
47827 }
47828 }
47829 }
47830
47831 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47832 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47833 // If either/both ops are a shuffle that can scale to v2x64,
47834 // then see if we can perform this as a v4x32 post shuffle.
47835 SmallVector<SDValue> Ops0, Ops1;
47836 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47837 bool IsShuf0 =
47838 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47839 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47840 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47841 bool IsShuf1 =
47842 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47843 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47844 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47845 if (IsShuf0 || IsShuf1) {
47846 if (!IsShuf0) {
47847 Ops0.assign({BC0});
47848 ScaledMask0.assign({0, 1});
47849 }
47850 if (!IsShuf1) {
47851 Ops1.assign({BC1});
47852 ScaledMask1.assign({0, 1});
47853 }
47854
47855 SDValue LHS, RHS;
47856 int PostShuffle[4] = {-1, -1, -1, -1};
47857 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47858 if (M < 0)
47859 return true;
47860 Idx = M % 2;
47861 SDValue Src = Ops[M / 2];
47862 if (!LHS || LHS == Src) {
47863 LHS = Src;
47864 return true;
47865 }
47866 if (!RHS || RHS == Src) {
47867 Idx += 2;
47868 RHS = Src;
47869 return true;
47870 }
47871 return false;
47872 };
47873 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47874 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47875 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47876 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47877 LHS = DAG.getBitcast(SrcVT, LHS);
47878 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47879 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47880 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47881 Res = DAG.getBitcast(ShufVT, Res);
47882 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47883 return DAG.getBitcast(VT, Res);
47884 }
47885 }
47886 }
47887
47888 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47889 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47890 SmallVector<int> Mask0, Mask1;
47891 SmallVector<SDValue> Ops0, Ops1;
47892 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47893 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47894 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47895 !Ops0.empty() && !Ops1.empty() &&
47896 all_of(Ops0,
47897 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47898 all_of(Ops1,
47899 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47900 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47901 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47902 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47903 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47904 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47905 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47906 if ((Op00 == Op11) && (Op01 == Op10)) {
47907 std::swap(Op10, Op11);
47908 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47909 }
47910 if ((Op00 == Op10) && (Op01 == Op11)) {
47911 const int Map[4] = {0, 2, 1, 3};
47912 SmallVector<int, 4> ShuffleMask(
47913 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47914 Map[ScaledMask1[1]]});
47915 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47916 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47917 DAG.getBitcast(SrcVT, Op01));
47918 Res = DAG.getBitcast(ShufVT, Res);
47919 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47920 return DAG.getBitcast(VT, Res);
47921 }
47922 }
47923 }
47924
47925 return SDValue();
47926}
47927
47930 const X86Subtarget &Subtarget) {
47931 unsigned Opcode = N->getOpcode();
47932 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47933 "Unexpected pack opcode");
47934
47935 EVT VT = N->getValueType(0);
47936 SDValue N0 = N->getOperand(0);
47937 SDValue N1 = N->getOperand(1);
47938 unsigned NumDstElts = VT.getVectorNumElements();
47939 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47940 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47941 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47942 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47943 "Unexpected PACKSS/PACKUS input type");
47944
47945 bool IsSigned = (X86ISD::PACKSS == Opcode);
47946
47947 // Constant Folding.
47948 APInt UndefElts0, UndefElts1;
47949 SmallVector<APInt, 32> EltBits0, EltBits1;
47950 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47951 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47952 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
47953 /*AllowWholeUndefs*/ true,
47954 /*AllowPartialUndefs*/ true) &&
47955 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
47956 /*AllowWholeUndefs*/ true,
47957 /*AllowPartialUndefs*/ true)) {
47958 unsigned NumLanes = VT.getSizeInBits() / 128;
47959 unsigned NumSrcElts = NumDstElts / 2;
47960 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47961 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47962
47963 APInt Undefs(NumDstElts, 0);
47964 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47965 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47966 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47967 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47968 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47969 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47970
47971 if (UndefElts[SrcIdx]) {
47972 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47973 continue;
47974 }
47975
47976 APInt &Val = EltBits[SrcIdx];
47977 if (IsSigned) {
47978 // PACKSS: Truncate signed value with signed saturation.
47979 // Source values less than dst minint are saturated to minint.
47980 // Source values greater than dst maxint are saturated to maxint.
47981 Val = Val.truncSSat(DstBitsPerElt);
47982 } else {
47983 // PACKUS: Truncate signed value with unsigned saturation.
47984 // Source values less than zero are saturated to zero.
47985 // Source values greater than dst maxuint are saturated to maxuint.
47986 // NOTE: This is different from APInt::truncUSat.
47987 if (Val.isIntN(DstBitsPerElt))
47988 Val = Val.trunc(DstBitsPerElt);
47989 else if (Val.isNegative())
47990 Val = APInt::getZero(DstBitsPerElt);
47991 else
47992 Val = APInt::getAllOnes(DstBitsPerElt);
47993 }
47994 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47995 }
47996 }
47997
47998 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47999 }
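// Worked example of the saturation rules above (illustrative i16 -> i8
// values): PACKSS: 291 (0x0123) -> 0x7F, -500 (0xFE0C) -> 0x80, -1 -> 0xFF.
// PACKUS: 291 (0x0123) -> 0xFF, -1 (0xFFFF) -> 0x00, 100 -> 0x64.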
48000
48001 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48002 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48003 return V;
48004
48005 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
48006 // Currently limit this to allsignbits cases only.
48007 if (IsSigned &&
48008 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
48009 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
48010 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
48011 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
48012 if (Not0 && Not1) {
48013 SDLoc DL(N);
48014 MVT SrcVT = N0.getSimpleValueType();
48015 SDValue Pack =
48016 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
48017 DAG.getBitcast(SrcVT, Not1));
48018 return DAG.getNOT(DL, Pack, VT);
48019 }
48020 }
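// Since the inputs are known all-signbits, every element is 0 or -1 and
// survives the signed-saturating truncation unchanged, so (illustratively)
// PACKSS(NOT(X),NOT(Y)) over 0/-1 elements equals NOT(PACKSS(X,Y)).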
48021
48022 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48023 // truncate to create a larger truncate.
48024 if (Subtarget.hasAVX512() &&
48025 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48026 N0.getOperand(0).getValueType() == MVT::v8i32) {
48027 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48028 (!IsSigned &&
48029 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48030 if (Subtarget.hasVLX())
48031 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48032
48033 // Widen input to v16i32 so we can truncate that.
48034 SDLoc dl(N);
48035 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48036 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48037 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48038 }
48039 }
48040
48041 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48042 if (VT.is128BitVector()) {
48043 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48044 SDValue Src0, Src1;
48045 if (N0.getOpcode() == ExtOpc &&
48046 N0.getOperand(0).getValueType().is64BitVector() &&
48047 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48048 Src0 = N0.getOperand(0);
48049 }
48050 if (N1.getOpcode() == ExtOpc &&
48051 N1.getOperand(0).getValueType().is64BitVector() &&
48052 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48053 Src1 = N1.getOperand(0);
48054 }
48055 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48056 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48057 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48058 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48059 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48060 }
48061
48062 // Try again with pack(*_extend_vector_inreg, undef).
48063 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48064 : ISD::ZERO_EXTEND_VECTOR_INREG;
48065 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48066 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48067 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48068 DAG);
48069 }
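// e.g. with VT == v16i8: PACKUS(zero_extend(v8i8 X), zero_extend(v8i8 Y))
// is exactly concat_vectors(X, Y), because every extended element already
// fits in the destination width and the saturation never fires; the signed
// PACKSS/SIGN_EXTEND case behaves the same way (illustrative).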
48070
48071 // Attempt to combine as shuffle.
48072 SDValue Op(N, 0);
48073 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48074 return Res;
48075
48076 return SDValue();
48077}
48078
48079static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48080 TargetLowering::DAGCombinerInfo &DCI,
48081 const X86Subtarget &Subtarget) {
48082 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48083 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48084 "Unexpected horizontal add/sub opcode");
48085
48086 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48087 MVT VT = N->getSimpleValueType(0);
48088 SDValue LHS = N->getOperand(0);
48089 SDValue RHS = N->getOperand(1);
48090
48091 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48092 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48093 LHS.getOpcode() == RHS.getOpcode() &&
48094 LHS.getValueType() == RHS.getValueType() &&
48095 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48096 SDValue LHS0 = LHS.getOperand(0);
48097 SDValue LHS1 = LHS.getOperand(1);
48098 SDValue RHS0 = RHS.getOperand(0);
48099 SDValue RHS1 = RHS.getOperand(1);
48100 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48101 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48102 SDLoc DL(N);
48103 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48104 LHS0.isUndef() ? LHS1 : LHS0,
48105 RHS0.isUndef() ? RHS1 : RHS0);
48106 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48107 Res = DAG.getBitcast(ShufVT, Res);
48108 SDValue NewLHS =
48109 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48110 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48111 SDValue NewRHS =
48112 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48113 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48114 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48115 DAG.getBitcast(VT, NewRHS));
48116 }
48117 }
48118 }
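// Worked v4i32 example of the rewrite above (illustrative):
//   LHS = HADD(X,X) = [x0+x1, x2+x3, x0+x1, x2+x3]
//   RHS = HADD(Y,Y) = [y0+y1, y2+y3, y0+y1, y2+y3]
//   HADD(LHS,RHS)   = [x0+x1+x2+x3, x0+x1+x2+x3, y0+y1+y2+y3, y0+y1+y2+y3]
// which matches HADD(PSHUFD<0,1,0,1>(R), PSHUFD<2,3,2,3>(R)) for
//   R   = HADD(X,Y) = [x0+x1, x2+x3, y0+y1, y2+y3].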
48119
48120 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48121 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48122 return V;
48123
48124 return SDValue();
48125}
48126
48127static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48128 TargetLowering::DAGCombinerInfo &DCI,
48129 const X86Subtarget &Subtarget) {
48130 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48131 X86ISD::VSRL == N->getOpcode()) &&
48132 "Unexpected shift opcode");
48133 EVT VT = N->getValueType(0);
48134 SDValue N0 = N->getOperand(0);
48135 SDValue N1 = N->getOperand(1);
48136
48137 // Shift zero -> zero.
48138 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48139 return DAG.getConstant(0, SDLoc(N), VT);
48140
48141 // Detect constant shift amounts.
48142 APInt UndefElts;
48143 SmallVector<APInt, 32> EltBits;
48144 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
48145 /*AllowWholeUndefs*/ true,
48146 /*AllowPartialUndefs*/ false)) {
48147 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48148 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48149 EltBits[0].getZExtValue(), DAG);
48150 }
48151
48152 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48153 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48154 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48155 return SDValue(N, 0);
48156
48157 return SDValue();
48158}
48159
48160static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48161 TargetLowering::DAGCombinerInfo &DCI,
48162 const X86Subtarget &Subtarget) {
48163 unsigned Opcode = N->getOpcode();
48164 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48165 X86ISD::VSRLI == Opcode) &&
48166 "Unexpected shift opcode");
48167 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48168 EVT VT = N->getValueType(0);
48169 SDValue N0 = N->getOperand(0);
48170 SDValue N1 = N->getOperand(1);
48171 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48172 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48173 "Unexpected value type");
48174 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
48175
48176 // (shift undef, X) -> 0
48177 if (N0.isUndef())
48178 return DAG.getConstant(0, SDLoc(N), VT);
48179
48180 // Out of range logical bit shifts are guaranteed to be zero.
48181 // Out of range arithmetic bit shifts splat the sign bit.
48182 unsigned ShiftVal = N->getConstantOperandVal(1);
48183 if (ShiftVal >= NumBitsPerElt) {
48184 if (LogicalShift)
48185 return DAG.getConstant(0, SDLoc(N), VT);
48186 ShiftVal = NumBitsPerElt - 1;
48187 }
48188
48189 // (shift X, 0) -> X
48190 if (!ShiftVal)
48191 return N0;
48192
48193 // (shift 0, C) -> 0
48194 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48195 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48196 // result are all zeros, not undef.
48197 return DAG.getConstant(0, SDLoc(N), VT);
48198
48199 // (VSRAI -1, C) -> -1
48200 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48201 // N0 is all ones or undef. We guarantee that the bits shifted into the
48202 // result are all ones, not undef.
48203 return DAG.getConstant(-1, SDLoc(N), VT);
48204
48205 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48206 unsigned NewShiftVal = Amt0 + Amt1;
48207 if (NewShiftVal >= NumBitsPerElt) {
48208 // Out of range logical bit shifts are guaranteed to be zero.
48209 // Out of range arithmetic bit shifts splat the sign bit.
48210 if (LogicalShift)
48211 return DAG.getConstant(0, SDLoc(N), VT);
48212 NewShiftVal = NumBitsPerElt - 1;
48213 }
48214 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48215 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48216 };
48217
48218 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48219 if (Opcode == N0.getOpcode())
48220 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48221
48222 // (shl (add X, X), C) -> (shl X, (C + 1))
48223 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48224 N0.getOperand(0) == N0.getOperand(1))
48225 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
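// e.g. VSRLI(VSRLI(X,3),2) folds to VSRLI(X,5), and for the add form
// VSHLI(ADD(X,X),2) == VSHLI(VSHLI(X,1),2) folds to VSHLI(X,3)
// (illustrative shift amounts).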
48226
48227 // We can decode 'whole byte' logical bit shifts as shuffles.
48228 if (LogicalShift && (ShiftVal % 8) == 0) {
48229 SDValue Op(N, 0);
48230 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48231 return Res;
48232 }
48233
48234 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48235 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48236 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48237 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48238 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48239 N0.getOpcode() == X86ISD::PSHUFD &&
48240 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48241 N0->hasOneUse()) {
48242 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48243 if (BC.getOpcode() == X86ISD::VSHLI &&
48244 BC.getScalarValueSizeInBits() == 64 &&
48245 BC.getConstantOperandVal(1) == 63) {
48246 SDLoc DL(N);
48247 SDValue Src = BC.getOperand(0);
48248 Src = DAG.getBitcast(VT, Src);
48249 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48250 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48251 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48252 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48253 return Src;
48254 }
48255 }
48256
48257 auto TryConstantFold = [&](SDValue V) {
48258 APInt UndefElts;
48259 SmallVector<APInt, 32> EltBits;
48260 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48261 /*AllowWholeUndefs*/ true,
48262 /*AllowPartialUndefs*/ true))
48263 return SDValue();
48264 assert(EltBits.size() == VT.getVectorNumElements() &&
48265 "Unexpected shift value type");
48266 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48267 // created an undef input due to no input bits being demanded, but user
48268 // still expects 0 in other bits.
48269 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48270 APInt &Elt = EltBits[i];
48271 if (UndefElts[i])
48272 Elt = 0;
48273 else if (X86ISD::VSHLI == Opcode)
48274 Elt <<= ShiftVal;
48275 else if (X86ISD::VSRAI == Opcode)
48276 Elt.ashrInPlace(ShiftVal);
48277 else
48278 Elt.lshrInPlace(ShiftVal);
48279 }
48280 // Reset undef elements since they were zeroed above.
48281 UndefElts = 0;
48282 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48283 };
48284
48285 // Constant Folding.
48286 if (N->isOnlyUserOf(N0.getNode())) {
48287 if (SDValue C = TryConstantFold(N0))
48288 return C;
48289
48290 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48291 // Don't break NOT patterns.
48292 SDValue BC = peekThroughOneUseBitcasts(N0);
48293 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48294 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48295 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48296 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48297 SDLoc DL(N);
48298 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48299 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48300 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48301 }
48302 }
48303 }
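// e.g. VSHLI(AND(X, splat(0x00FF)), 8) becomes AND(VSHLI(X, 8), splat(0xFF00))
// for v8i16 elements: the constant half of the logic op is shifted at compile
// time, leaving a single shift plus logic pair (illustrative).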
48304
48305 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48306 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48307 DCI))
48308 return SDValue(N, 0);
48309
48310 return SDValue();
48311}
48312
48313static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48314 TargetLowering::DAGCombinerInfo &DCI,
48315 const X86Subtarget &Subtarget) {
48316 EVT VT = N->getValueType(0);
48317 unsigned Opcode = N->getOpcode();
48318 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48319 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48320 Opcode == ISD::INSERT_VECTOR_ELT) &&
48321 "Unexpected vector insertion");
48322
48323 SDValue Vec = N->getOperand(0);
48324 SDValue Scl = N->getOperand(1);
48325 SDValue Idx = N->getOperand(2);
48326
48327 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48328 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48329 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48330
48331 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48332 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48333 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48334 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48335 APInt::getAllOnes(NumBitsPerElt), DCI))
48336 return SDValue(N, 0);
48337 }
48338
48339 // Attempt to combine insertion patterns to a shuffle.
48340 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48341 SDValue Op(N, 0);
48342 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48343 return Res;
48344 }
48345
48346 return SDValue();
48347}
48348
48349/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48350/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48351/// OR -> CMPNEQSS.
48352static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48353 TargetLowering::DAGCombinerInfo &DCI,
48354 const X86Subtarget &Subtarget) {
48355 unsigned opcode;
48356
48357 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48358 // we're requiring SSE2 for both.
48359 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48360 SDValue N0 = N->getOperand(0);
48361 SDValue N1 = N->getOperand(1);
48362 SDValue CMP0 = N0.getOperand(1);
48363 SDValue CMP1 = N1.getOperand(1);
48364 SDLoc DL(N);
48365
48366 // The SETCCs should both refer to the same CMP.
48367 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48368 return SDValue();
48369
48370 SDValue CMP00 = CMP0->getOperand(0);
48371 SDValue CMP01 = CMP0->getOperand(1);
48372 EVT VT = CMP00.getValueType();
48373
48374 if (VT == MVT::f32 || VT == MVT::f64 ||
48375 (VT == MVT::f16 && Subtarget.hasFP16())) {
48376 bool ExpectingFlags = false;
48377 // Check for any users that want flags:
48378 for (const SDNode *U : N->uses()) {
48379 if (ExpectingFlags)
48380 break;
48381
48382 switch (U->getOpcode()) {
48383 default:
48384 case ISD::BR_CC:
48385 case ISD::BRCOND:
48386 case ISD::SELECT:
48387 ExpectingFlags = true;
48388 break;
48389 case ISD::CopyToReg:
48390 case ISD::SIGN_EXTEND:
48391 case ISD::ZERO_EXTEND:
48392 case ISD::ANY_EXTEND:
48393 break;
48394 }
48395 }
48396
48397 if (!ExpectingFlags) {
48398 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48399 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48400
48401 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48402 X86::CondCode tmp = cc0;
48403 cc0 = cc1;
48404 cc1 = tmp;
48405 }
48406
48407 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48408 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48409 // FIXME: need symbolic constants for these magic numbers.
48410 // See X86ATTInstPrinter.cpp:printSSECC().
48411 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48412 if (Subtarget.hasAVX512()) {
48413 SDValue FSetCC =
48414 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48415 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48416 // Need to fill with zeros to ensure the bitcast will produce zeroes
48417 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48418 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48419 DAG.getConstant(0, DL, MVT::v16i1),
48420 FSetCC, DAG.getIntPtrConstant(0, DL));
48421 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48422 N->getSimpleValueType(0));
48423 }
48424 SDValue OnesOrZeroesF =
48425 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48426 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48427
48428 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48429 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48430
48431 if (is64BitFP && !Subtarget.is64Bit()) {
48432 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48433 // 64-bit integer, since that's not a legal type. Since
48434 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48435 // bits, but can do this little dance to extract the lowest 32 bits
48436 // and work with those going forward.
48437 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48438 OnesOrZeroesF);
48439 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48440 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48441 Vector32, DAG.getIntPtrConstant(0, DL));
48442 IntVT = MVT::i32;
48443 }
48444
48445 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48446 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48447 DAG.getConstant(1, DL, IntVT));
48448 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48449 ANDed);
48450 return OneBitOfTruth;
48451 }
48452 }
48453 }
48454 }
48455 return SDValue();
48456}
48457
48458/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48459static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48460 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48461
48462 MVT VT = N->getSimpleValueType(0);
48463 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48464 return SDValue();
48465
48466 SDValue X, Y;
48467 SDValue N0 = N->getOperand(0);
48468 SDValue N1 = N->getOperand(1);
48469
48470 if (SDValue Not = IsNOT(N0, DAG)) {
48471 X = Not;
48472 Y = N1;
48473 } else if (SDValue Not = IsNOT(N1, DAG)) {
48474 X = Not;
48475 Y = N0;
48476 } else
48477 return SDValue();
48478
48479 X = DAG.getBitcast(VT, X);
48480 Y = DAG.getBitcast(VT, Y);
48481 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48482}
48483
48484/// Try to fold:
48485/// and (vector_shuffle<Z,...,Z>
48486/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48487/// ->
48488/// andnp (vector_shuffle<Z,...,Z>
48489/// (insert_vector_elt undef, X, Z), undef), Y
48490static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48491 const X86Subtarget &Subtarget) {
48492 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48493
48494 EVT VT = N->getValueType(0);
48495 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48496 // value and require extra moves.
48497 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48498 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48499 return SDValue();
48500
48501 auto GetNot = [&DAG](SDValue V) {
48502 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48503 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48504 // end-users are ISD::AND including cases
48505 // (and(extract_vector_element(SVN), Y)).
48506 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48507 !SVN->getOperand(1).isUndef()) {
48508 return SDValue();
48509 }
48510 SDValue IVEN = SVN->getOperand(0);
48511 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48512 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48513 return SDValue();
48514 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48515 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48516 return SDValue();
48517 SDValue Src = IVEN.getOperand(1);
48518 if (SDValue Not = IsNOT(Src, DAG)) {
48519 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48520 SDValue NotIVEN =
48521 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48522 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48523 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48524 SVN->getOperand(1), SVN->getMask());
48525 }
48526 return SDValue();
48527 };
48528
48529 SDValue X, Y;
48530 SDValue N0 = N->getOperand(0);
48531 SDValue N1 = N->getOperand(1);
48532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48533
48534 if (SDValue Not = GetNot(N0)) {
48535 X = Not;
48536 Y = N1;
48537 } else if (SDValue Not = GetNot(N1)) {
48538 X = Not;
48539 Y = N0;
48540 } else
48541 return SDValue();
48542
48543 X = DAG.getBitcast(VT, X);
48544 Y = DAG.getBitcast(VT, Y);
48545 SDLoc DL(N);
48546
48547 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48548 // AVX2.
48549 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
48550 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
48551 SDValue LoX, HiX;
48552 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48553 SDValue LoY, HiY;
48554 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48555 EVT SplitVT = LoX.getValueType();
48556 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48557 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48558 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48559 }
48560
48561 if (TLI.isTypeLegal(VT))
48562 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48563
48564 return SDValue();
48565}
48566
48567// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48568// logical operations, like in the example below.
48569// or (and (truncate x), (truncate y)),
48570// (xor (truncate z), build_vector (constants))
48571// Given a target type \p VT, we generate
48572// or (and x, y), (xor z, zext(build_vector (constants)))
48573// given x, y and z are of type \p VT. We can do so, if operands are either
48574// truncates from VT types, the second operand is a vector of constants or can
48575// be recursively promoted.
48576static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
48577 SelectionDAG &DAG, unsigned Depth) {
48578 // Limit recursion to avoid excessive compile times.
48579 if (Depth >= SelectionDAG::MaxRecursionDepth)
48580 return SDValue();
48581
48582 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
48583 return SDValue();
48584
48585 SDValue N0 = N.getOperand(0);
48586 SDValue N1 = N.getOperand(1);
48587
48588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48589 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
48590 return SDValue();
48591
48592 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
48593 N0 = NN0;
48594 else {
48595 // The left side has to be a trunc.
48596 if (N0.getOpcode() != ISD::TRUNCATE)
48597 return SDValue();
48598
48599 // The type of the truncated inputs.
48600 if (N0.getOperand(0).getValueType() != VT)
48601 return SDValue();
48602
48603 N0 = N0.getOperand(0);
48604 }
48605
48606 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
48607 N1 = NN1;
48608 else {
48609 // The right side has to be a 'trunc' or a (foldable) constant.
48610 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48611 N1.getOperand(0).getValueType() == VT;
48612 if (RHSTrunc)
48613 N1 = N1.getOperand(0);
48614 else if (SDValue Cst =
48615 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
48616 N1 = Cst;
48617 else
48618 return SDValue();
48619 }
48620
48621 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
48622}
48623
48624// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48625// register. In most cases we actually compare or select YMM-sized registers
48626// and mixing the two types creates horrible code. This method optimizes
48627// some of the transition sequences.
48628// Even with AVX-512 this is still useful for removing casts around logical
48629// operations on vXi1 mask types.
48630static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
48631 SelectionDAG &DAG,
48632 const X86Subtarget &Subtarget) {
48633 EVT VT = N.getValueType();
48634 assert(VT.isVector() && "Expected vector type");
48635 assert((N.getOpcode() == ISD::ANY_EXTEND ||
48636 N.getOpcode() == ISD::ZERO_EXTEND ||
48637 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48638
48639 SDValue Narrow = N.getOperand(0);
48640 EVT NarrowVT = Narrow.getValueType();
48641
48642 // Generate the wide operation.
48643 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
48644 if (!Op)
48645 return SDValue();
48646 switch (N.getOpcode()) {
48647 default: llvm_unreachable("Unexpected opcode");
48648 case ISD::ANY_EXTEND:
48649 return Op;
48650 case ISD::ZERO_EXTEND:
48651 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48652 case ISD::SIGN_EXTEND:
48653 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48654 Op, DAG.getValueType(NarrowVT));
48655 }
48656}
48657
48658static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48659 unsigned FPOpcode;
48660 switch (Opcode) {
48661 // clang-format off
48662 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48663 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48664 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48665 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48666 // clang-format on
48667 }
48668 return FPOpcode;
48669}
48670
48671/// If both input operands of a logic op are being cast from floating-point
48672/// types or FP compares, try to convert this into a floating-point logic node
48673/// to avoid unnecessary moves from SSE to integer registers.
48674static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48675 TargetLowering::DAGCombinerInfo &DCI,
48676 const X86Subtarget &Subtarget) {
48677 EVT VT = N->getValueType(0);
48678 SDValue N0 = N->getOperand(0);
48679 SDValue N1 = N->getOperand(1);
48680 SDLoc DL(N);
48681
48682 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48683 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48684 return SDValue();
48685
48686 SDValue N00 = N0.getOperand(0);
48687 SDValue N10 = N1.getOperand(0);
48688 EVT N00Type = N00.getValueType();
48689 EVT N10Type = N10.getValueType();
48690
48691 // Ensure that both types are the same and are legal scalar fp types.
48692 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48693 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48694 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48695 return SDValue();
48696
48697 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48698 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48699 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48700 return DAG.getBitcast(VT, FPLogic);
48701 }
48702
48703 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48704 !N1.hasOneUse())
48705 return SDValue();
48706
48707 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48708 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48709
48710 // The vector ISA for FP predicates is incomplete before AVX, so converting
48711 // COMIS* to CMPS* may not be a win before AVX.
48712 if (!Subtarget.hasAVX() &&
48713 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48714 return SDValue();
48715
48716 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48717 // and vector logic:
48718 // logic (setcc N00, N01), (setcc N10, N11) -->
48719 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48720 unsigned NumElts = 128 / N00Type.getSizeInBits();
48721 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48722 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48723 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48724 SDValue N01 = N0.getOperand(1);
48725 SDValue N11 = N1.getOperand(1);
48726 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48727 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48728 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48729 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48730 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48731 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48732 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48733 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48734}
48735
48736// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48737// to reduce XMM->GPR traffic.
48738static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48739 unsigned Opc = N->getOpcode();
48740 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48741 "Unexpected bit opcode");
48742
48743 SDValue N0 = N->getOperand(0);
48744 SDValue N1 = N->getOperand(1);
48745
48746 // Both operands must be single use MOVMSK.
48747 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48748 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48749 return SDValue();
48750
48751 SDValue Vec0 = N0.getOperand(0);
48752 SDValue Vec1 = N1.getOperand(0);
48753 EVT VecVT0 = Vec0.getValueType();
48754 EVT VecVT1 = Vec1.getValueType();
48755
48756 // Both MOVMSK operands must be from vectors of the same size and same element
48757 // size, but it's OK for an fp/int difference.
48758 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48759 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48760 return SDValue();
48761
48762 SDLoc DL(N);
48763 unsigned VecOpc =
48764 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48765 SDValue Result =
48766 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48767 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48768}
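// e.g. OR(MOVMSK(X), MOVMSK(Y)) -> MOVMSK(OR(X, Y)): MOVMSK collects the
// per-element sign bits, and the sign bit of (X | Y) is the OR of the two
// sign bits, so the scalar bit op can instead be done once in the vector
// domain (the same reasoning holds for AND and XOR).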
48769
48770// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48771// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48772// handles in InstCombine.
48773static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48774 unsigned Opc = N->getOpcode();
48775 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48776 "Unexpected bit opcode");
48777
48778 SDValue N0 = N->getOperand(0);
48779 SDValue N1 = N->getOperand(1);
48780 EVT VT = N->getValueType(0);
48781
48782 // Both operands must be single use.
48783 if (!N0.hasOneUse() || !N1.hasOneUse())
48784 return SDValue();
48785
48786 // Search for matching shifts.
48787 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48788 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48789
48790 unsigned BCOpc = BC0.getOpcode();
48791 EVT BCVT = BC0.getValueType();
48792 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48793 return SDValue();
48794
48795 switch (BCOpc) {
48796 case X86ISD::VSHLI:
48797 case X86ISD::VSRLI:
48798 case X86ISD::VSRAI: {
48799 if (BC0.getOperand(1) != BC1.getOperand(1))
48800 return SDValue();
48801
48802 SDLoc DL(N);
48803 SDValue BitOp =
48804 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48805 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48806 return DAG.getBitcast(VT, Shift);
48807 }
48808 }
48809
48810 return SDValue();
48811}
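// e.g. AND(VSRLI(X,4), VSRLI(Y,4)) -> VSRLI(AND(X,Y),4): shifting both
// operands by the same immediate commutes with the bitwise op, so one of
// the two shifts can be dropped (illustrative).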
48812
48813// Attempt to fold:
48814// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
48815// TODO: Handle PACKUS as well.
48816static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
48817 unsigned Opc = N->getOpcode();
48818 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48819 "Unexpected bit opcode");
48820
48821 SDValue N0 = N->getOperand(0);
48822 SDValue N1 = N->getOperand(1);
48823 EVT VT = N->getValueType(0);
48824
48825 // Both operands must be single use.
48826 if (!N0.hasOneUse() || !N1.hasOneUse())
48827 return SDValue();
48828
48829 // Search for matching packs.
48830 N0 = peekThroughOneUseBitcasts(N0);
48831 N1 = peekThroughOneUseBitcasts(N1);
48832
48833 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
48834 return SDValue();
48835
48836 MVT DstVT = N0.getSimpleValueType();
48837 if (DstVT != N1.getSimpleValueType())
48838 return SDValue();
48839
48840 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
48841 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
48842
48843 // Limit to allsignbits packing.
48844 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
48845 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
48846 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
48847 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
48848 return SDValue();
48849
48850 SDLoc DL(N);
48851 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
48852 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
48853 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
48854}
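// With all-signbits inputs every element is 0 or -1, so PACKSS is just a
// 0/-1 truncation and e.g. OR(PACKSS(X,Z),PACKSS(Y,W)) equals
// PACKSS(OR(X,Y),OR(Z,W)); AND and XOR behave the same way (illustrative).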
48855
48856/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
48857/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48858/// with a shift-right to eliminate loading the vector constant mask value.
48859static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48860 const X86Subtarget &Subtarget) {
48861 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48862 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48863 EVT VT = Op0.getValueType();
48864 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48865 return SDValue();
48866
48867 // Try to convert an "is positive" signbit masking operation into arithmetic
48868 // shift and "andn". This saves a materialization of a -1 vector constant.
48869 // The "is negative" variant should be handled more generally because it only
48870 // requires "and" rather than "andn":
48871 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48872 //
48873 // This is limited to the original type to avoid producing even more bitcasts.
48874 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48875 // will be profitable.
48876 if (N->getValueType(0) == VT &&
48877 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48878 SDValue X, Y;
48879 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48880 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48881 X = Op1.getOperand(0);
48882 Y = Op0;
48883 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48884 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48885 X = Op0.getOperand(0);
48886 Y = Op1;
48887 }
48888 if (X && Y) {
48889 SDLoc DL(N);
48890 SDValue Sra =
48891 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48892 VT.getScalarSizeInBits() - 1, DAG);
48893 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48894 }
48895 }
48896
48897 APInt SplatVal;
48898 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48899 return SDValue();
48900
48901 // Don't prevent creation of ANDN.
48902 if (isBitwiseNot(Op0))
48903 return SDValue();
48904
48905 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48906 return SDValue();
48907
48908 unsigned EltBitWidth = VT.getScalarSizeInBits();
48909 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48910 return SDValue();
48911
48912 SDLoc DL(N);
48913 unsigned ShiftVal = SplatVal.countr_one();
48914 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48915 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48916 return DAG.getBitcast(N->getValueType(0), Shift);
48917}
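// e.g. for v4i32 where Op0 is known all-zeros/all-ones per element:
// AND(Op0, splat(0x000000FF)) -> VSRLI(Op0, 24), since -1 >> 24 == 0xFF and
// 0 >> 24 == 0, which avoids loading the constant mask (illustrative).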
48918
48919// Get the index node from the lowered DAG of a GEP IR instruction with one
48920// indexing dimension.
48921static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48922 if (Ld->isIndexed())
48923 return SDValue();
48924
48925 SDValue Base = Ld->getBasePtr();
48926
48927 if (Base.getOpcode() != ISD::ADD)
48928 return SDValue();
48929
48930 SDValue ShiftedIndex = Base.getOperand(0);
48931
48932 if (ShiftedIndex.getOpcode() != ISD::SHL)
48933 return SDValue();
48934
48935 return ShiftedIndex.getOperand(0);
48936
48937}
48938
48939static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48940 return Subtarget.hasBMI2() &&
48941 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
48942}
48943
48944// This function recognizes cases where the X86 BZHI instruction can replace
48945// an 'and-load' sequence.
48946// In case of loading integer value from an array of constants which is defined
48947// as follows:
48948//
48949// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48950//
48951// then applying a bitwise and on the result with another input.
48952// It's equivalent to performing bzhi (zero high bits) on the input, with the
48953// same index of the load.
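// e.g. for a 32-bit type: and(load(array[idx]), x) with
// array[i] == (1u << i) - 1 is rewritten below into
// x & (0xFFFFFFFF >> (32 - idx)) (illustrative: idx == 5 gives
// array[5] == 0x1F == 0xFFFFFFFF >> 27), which is then selected as one BZHI.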
48954static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48955 const X86Subtarget &Subtarget) {
48956 MVT VT = Node->getSimpleValueType(0);
48957 SDLoc dl(Node);
48958
48959 // Check if subtarget has BZHI instruction for the node's type
48960 if (!hasBZHI(Subtarget, VT))
48961 return SDValue();
48962
48963 // Try matching the pattern for both operands.
48964 for (unsigned i = 0; i < 2; i++) {
48965 SDValue N = Node->getOperand(i);
48966 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48967
48968 // continue if the operand is not a load instruction
48969 if (!Ld)
48970 return SDValue();
48971
48972 const Value *MemOp = Ld->getMemOperand()->getValue();
48973
48974 if (!MemOp)
48975 return SDValue();
48976
48977 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48978 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48979 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48980
48981 Constant *Init = GV->getInitializer();
48982 Type *Ty = Init->getType();
48983 if (!isa<ConstantDataArray>(Init) ||
48984 !Ty->getArrayElementType()->isIntegerTy() ||
48985 Ty->getArrayElementType()->getScalarSizeInBits() !=
48986 VT.getSizeInBits() ||
48987 Ty->getArrayNumElements() >
48988 Ty->getArrayElementType()->getScalarSizeInBits())
48989 continue;
48990
48991 // Check if the array's constant elements are suitable to our case.
48992 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48993 bool ConstantsMatch = true;
48994 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48995 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48996 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48997 ConstantsMatch = false;
48998 break;
48999 }
49000 }
49001 if (!ConstantsMatch)
49002 continue;
49003
49004 // Do the transformation (For 32-bit type):
49005 // -> (and (load arr[idx]), inp)
49006 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
49007 // that will be replaced with one bzhi instruction.
49008 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49009 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49010
49011 // Get the Node which indexes into the array.
49012 SDValue Index = getIndexFromUnindexedLoad(Ld);
49013 if (!Index)
49014 return SDValue();
49015 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49016
49017 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49018 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49019
49020 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49021 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49022
49023 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49024 }
49025 }
49026 }
49027 }
49028 return SDValue();
49029}
49030
49031// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49032// Where C is a mask containing the same number of bits as the setcc and
49033// where the setcc will freely 0 upper bits of k-register. We can replace the
49034// undef in the concat with 0s and remove the AND. This mainly helps with
49035// v2i1/v4i1 setcc being casted to scalar.
49036static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49037 const X86Subtarget &Subtarget) {
49038 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49039
49040 EVT VT = N->getValueType(0);
49041
49042 // Make sure this is an AND with constant. We will check the value of the
49043 // constant later.
49044 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49045 if (!C1)
49046 return SDValue();
49047
49048 // This is implied by the ConstantSDNode.
49049 assert(!VT.isVector() && "Expected scalar VT!");
49050
49051 SDValue Src = N->getOperand(0);
49052 if (!Src.hasOneUse())
49053 return SDValue();
49054
49055 // (Optionally) peek through any_extend().
49056 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49057 if (!Src.getOperand(0).hasOneUse())
49058 return SDValue();
49059 Src = Src.getOperand(0);
49060 }
49061
49062 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49063 return SDValue();
49064
49065 Src = Src.getOperand(0);
49066 EVT SrcVT = Src.getValueType();
49067
49068 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49069 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49070 !TLI.isTypeLegal(SrcVT))
49071 return SDValue();
49072
49073 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49074 return SDValue();
49075
49076 // We only care about the first subvector of the concat; we expect the
49077 // other subvectors to be ignored due to the AND if we make the change.
49078 SDValue SubVec = Src.getOperand(0);
49079 EVT SubVecVT = SubVec.getValueType();
49080
49081 // The RHS of the AND should be a mask with as many bits as SubVec.
49082 if (!TLI.isTypeLegal(SubVecVT) ||
49083 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49084 return SDValue();
49085
49086 // First subvector should be a setcc with a legal result type or a
49087 // AND containing at least one setcc with a legal result type.
49088 auto IsLegalSetCC = [&](SDValue V) {
49089 if (V.getOpcode() != ISD::SETCC)
49090 return false;
49091 EVT SetccVT = V.getOperand(0).getValueType();
49092 if (!TLI.isTypeLegal(SetccVT) ||
49093 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49094 return false;
49095 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49096 return false;
49097 return true;
49098 };
49099 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49100 (IsLegalSetCC(SubVec.getOperand(0)) ||
49101 IsLegalSetCC(SubVec.getOperand(1))))))
49102 return SDValue();
49103
49104 // We passed all the checks. Rebuild the concat_vectors with zeroes
49105 // and cast it back to VT.
49106 SDLoc dl(N);
49107 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49108 DAG.getConstant(0, dl, SubVecVT));
49109 Ops[0] = SubVec;
49110 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49111 Ops);
49112 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49113 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49114}
49115
49116static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49117 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49118 // We don't want to go crazy with the recursion here. This isn't a super
49119 // important optimization.
49120 static constexpr unsigned kMaxDepth = 2;
49121
49122 // Only do this re-ordering if op has one use.
49123 if (!Op.hasOneUse())
49124 return SDValue();
49125
49126 SDLoc DL(Op);
49127 // If we hit another associative op, recurse further.
49128 if (Op.getOpcode() == Opc) {
49129 // Done recursing.
49130 if (Depth++ >= kMaxDepth)
49131 return SDValue();
49132
49133 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49134 if (SDValue R =
49135 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49136 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49137 Op.getOperand(1 - OpIdx));
49138
49139 } else if (Op.getOpcode() == ISD::SUB) {
49140 if (Opc == ISD::AND) {
49141 // BLSI: (and x, (sub 0, x))
49142 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49143 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49144 }
49145 // Opc must be ISD::AND or ISD::XOR
49146 // BLSR: (and x, (sub x, 1))
49147 // BLSMSK: (xor x, (sub x, 1))
49148 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49149 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49150
49151 } else if (Op.getOpcode() == ISD::ADD) {
49152 // Opc must be ISD::AND or ISD::XOR
49153 // BLSR: (and x, (add x, -1))
49154 // BLSMSK: (xor x, (add x, -1))
49155 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49156 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49157 }
49158 return SDValue();
49159}
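// Worked example with x == 0b10110100 (illustrative):
//   BLSI:   x & -x      == 0b00000100   (isolate lowest set bit)
//   BLSR:   x & (x - 1) == 0b10110000   (clear lowest set bit)
//   BLSMSK: x ^ (x - 1) == 0b00000111   (mask up to lowest set bit)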
49160
49161static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49162 const X86Subtarget &Subtarget) {
49163 EVT VT = N->getValueType(0);
49164 // Make sure this node is a candidate for BMI instructions.
49165 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49166 (VT != MVT::i32 && VT != MVT::i64))
49167 return SDValue();
49168
49169 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49170
49171 // Try and match LHS and RHS.
49172 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49173 if (SDValue OpMatch =
49174 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49175 N->getOperand(1 - OpIdx), 0))
49176 return OpMatch;
49177 return SDValue();
49178}
49179
49180static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49181 TargetLowering::DAGCombinerInfo &DCI,
49182 const X86Subtarget &Subtarget) {
49183 SDValue N0 = N->getOperand(0);
49184 SDValue N1 = N->getOperand(1);
49185 EVT VT = N->getValueType(0);
49186 SDLoc dl(N);
49187 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49188
49189 // If this is SSE1 only convert to FAND to avoid scalarization.
49190 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49191 return DAG.getBitcast(MVT::v4i32,
49192 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49193 DAG.getBitcast(MVT::v4f32, N0),
49194 DAG.getBitcast(MVT::v4f32, N1)));
49195 }
49196
49197 // Use a 32-bit and+zext if upper bits known zero.
49198 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49199 APInt HiMask = APInt::getHighBitsSet(64, 32);
49200 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49201 DAG.MaskedValueIsZero(N0, HiMask)) {
49202 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49203 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49204 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49205 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49206 }
49207 }
49208
49209 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49210 // TODO: Support multiple SrcOps.
49211 if (VT == MVT::i1) {
49212 SmallVector<SDValue, 2> SrcOps;
49213 SmallVector<APInt, 2> SrcPartials;
49214 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49215 SrcOps.size() == 1) {
49216 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49217 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49218 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49219 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49220 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49221 if (Mask) {
49222 assert(SrcPartials[0].getBitWidth() == NumElts &&
49223 "Unexpected partial reduction mask");
49224 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49225 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49226 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49227 }
49228 }
49229 }
49230
49231 // InstCombine converts:
49232 // `(-x << C0) & C1`
49233 // to
49234 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49235 // This saves an IR instruction but on x86 the neg/shift version is preferable
49236 // so undo the transform.
49237
49238 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49239 // TODO: We don't actually need a splat for this, we just need the checks to
49240 // hold for each element.
49241 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49242 /*AllowTruncation*/ false);
49243 ConstantSDNode *N01C =
49244 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49245 /*AllowTruncation*/ false);
49246 if (N1C && N01C) {
49247 const APInt &MulC = N01C->getAPIntValue();
49248 const APInt &AndC = N1C->getAPIntValue();
49249 APInt MulCLowBit = MulC & (-MulC);
49250 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49251 (MulCLowBit + MulC).isPowerOf2()) {
49252 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
49253 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49254 assert(MulCLowBitLog != -1 &&
49255 "Isolated lowbit is somehow not a power of 2!");
49256 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49257 DAG.getConstant(MulCLowBitLog, dl, VT));
49258 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49259 }
49260 }
49261 }
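// e.g. with C0 == 2 and C1 == 0xF0, InstCombine produces (x * 252) & 0xF0
// (Pow2_Ceil(0xF0) - (1 << 2) == 252); the code above recovers
// ((0 - x) << 2) & 0xF0, which avoids the multiply (illustrative values).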
49262
49263 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49264 return V;
49265
49266 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49267 return R;
49268
49269 if (SDValue R = combineBitOpWithShift(N, DAG))
49270 return R;
49271
49272 if (SDValue R = combineBitOpWithPACK(N, DAG))
49273 return R;
49274
49275 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49276 return FPLogic;
49277
49278 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49279 return R;
49280
49281 if (DCI.isBeforeLegalizeOps())
49282 return SDValue();
49283
49284 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49285 return R;
49286
49287 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49288 return R;
49289
49290 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49291 return ShiftRight;
49292
49293 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49294 return R;
49295
49296 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49297 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49298 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49299 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49300 unsigned Opc0 = N0.getOpcode();
49301 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49302 getTargetConstantFromNode(N0.getOperand(1)) &&
49303 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49304 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49305 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49306 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49307 }
49308 }
49309
49310 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49311 // avoids slow variable shift (moving shift amount to ECX etc.)
49312 if (isOneConstant(N1) && N0->hasOneUse()) {
49313 SDValue Src = N0;
49314 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49315 Src.getOpcode() == ISD::TRUNCATE) &&
49316 Src.getOperand(0)->hasOneUse())
49317 Src = Src.getOperand(0);
49318 bool ContainsNOT = false;
49319 X86::CondCode X86CC = X86::COND_B;
49320 // Peek through AND(NOT(SRL(X,Y)),1).
49321 if (isBitwiseNot(Src)) {
49322 Src = Src.getOperand(0);
49323 X86CC = X86::COND_AE;
49324 ContainsNOT = true;
49325 }
49326 if (Src.getOpcode() == ISD::SRL &&
49327 !isa<ConstantSDNode>(Src.getOperand(1))) {
49328 SDValue BitNo = Src.getOperand(1);
49329 Src = Src.getOperand(0);
49330 // Peek through AND(SRL(NOT(X),Y),1).
49331 if (isBitwiseNot(Src)) {
49332 Src = Src.getOperand(0);
49333 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49334 ContainsNOT = true;
49335 }
49336 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49337 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49338 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49339 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49340 }
49341 }
49342
49343 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49344 // Attempt to recursively combine a bitmask AND with shuffles.
49345 SDValue Op(N, 0);
49346 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49347 return Res;
49348
49349 // If either operand is a constant mask, then only the elements that aren't
49350 // zero are actually demanded by the other operand.
49351 auto GetDemandedMasks = [&](SDValue Op) {
49352 APInt UndefElts;
49353 SmallVector<APInt> EltBits;
49354 int NumElts = VT.getVectorNumElements();
49355 int EltSizeInBits = VT.getScalarSizeInBits();
49356 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49357 APInt DemandedElts = APInt::getAllOnes(NumElts);
49358 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49359 EltBits)) {
49360 DemandedBits.clearAllBits();
49361 DemandedElts.clearAllBits();
49362 for (int I = 0; I != NumElts; ++I) {
49363 if (UndefElts[I]) {
49364 // We can't assume an undef src element gives an undef dst - the
49365 // other src might be zero.
49366 DemandedBits.setAllBits();
49367 DemandedElts.setBit(I);
49368 } else if (!EltBits[I].isZero()) {
49369 DemandedBits |= EltBits[I];
49370 DemandedElts.setBit(I);
49371 }
49372 }
49373 }
49374 return std::make_pair(DemandedBits, DemandedElts);
49375 };
49376 APInt Bits0, Elts0;
49377 APInt Bits1, Elts1;
49378 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49379 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49380
49381 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49382 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49383 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49384 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49385 if (N->getOpcode() != ISD::DELETED_NODE)
49386 DCI.AddToWorklist(N);
49387 return SDValue(N, 0);
49388 }
49389
49390 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49391 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49392 if (NewN0 || NewN1)
49393 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49394 NewN1 ? NewN1 : N1);
49395 }
49396
49397 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49398 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49399 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49400 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
49401 SDValue BitMask = N1;
49402 SDValue SrcVec = N0.getOperand(0);
49403 EVT SrcVecVT = SrcVec.getValueType();
49404
49405 // Check that the constant bitmask masks whole bytes.
49406 APInt UndefElts;
49407 SmallVector<APInt, 64> EltBits;
49408 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49409 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49410 llvm::all_of(EltBits, [](const APInt &M) {
49411 return M.isZero() || M.isAllOnes();
49412 })) {
49413 unsigned NumElts = SrcVecVT.getVectorNumElements();
49414 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49415 unsigned Idx = N0.getConstantOperandVal(1);
49416
49417 // Create a root shuffle mask from the byte mask and the extracted index.
49418 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49419 for (unsigned i = 0; i != Scale; ++i) {
49420 if (UndefElts[i])
49421 continue;
49422 int VecIdx = Scale * Idx + i;
49423 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49424 }
49425
49426 if (SDValue Shuffle = combineX86ShufflesRecursively(
49427 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49428 X86::MaxShuffleCombineDepth,
49429 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49430 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49431 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49432 N0.getOperand(1));
49433 }
49434 }
49435
49436 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49437 return R;
49438
49439 return SDValue();
49440}
49441
49442// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49443static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49444 const X86Subtarget &Subtarget) {
49445 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49446
49447 MVT VT = N->getSimpleValueType(0);
49448 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49449 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49450 return SDValue();
49451
49452 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49453 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49454 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49455 return SDValue();
49456
49457 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49458 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49459 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49460 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49461 return SDValue();
49462
49463 // Attempt to extract constant byte masks.
49464 APInt UndefElts0, UndefElts1;
49465 SmallVector<APInt, 32> EltBits0, EltBits1;
49466 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49467 /*AllowWholeUndefs*/ false,
49468 /*AllowPartialUndefs*/ false))
49469 return SDValue();
49470 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49471 /*AllowWholeUndefs*/ false,
49472 /*AllowPartialUndefs*/ false))
49473 return SDValue();
49474
49475 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49476 // TODO - add UNDEF elts support.
49477 if (UndefElts0[i] || UndefElts1[i])
49478 return SDValue();
49479 if (EltBits0[i] != ~EltBits1[i])
49480 return SDValue();
49481 }
49482
49483 SDLoc DL(N);
49484
49485 if (useVPTERNLOG(Subtarget, VT)) {
49486 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49487 // VPTERNLOG is only available as vXi32/64-bit types.
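  // (The 8-bit immediate is the truth table indexed by the three source bits:
  //  the result bit is Imm[(A<<2)|(B<<1)|C], so 0xCA == 0b11001010 picks B when
  //  A=1 and C when A=0, i.e. a bitwise select.)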
49488 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
49489 MVT OpVT =
49490 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49491 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49492 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49493 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49494 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49495 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49496 DAG, Subtarget);
49497 return DAG.getBitcast(VT, Res);
49498 }
49499
49500 SDValue X = N->getOperand(0);
49501 SDValue Y =
49502 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49503 DAG.getBitcast(VT, N1.getOperand(0)));
49504 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49505}
49506
49507// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49508static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49509 if (N->getOpcode() != ISD::OR)
49510 return false;
49511
49512 SDValue N0 = N->getOperand(0);
49513 SDValue N1 = N->getOperand(1);
49514
49515 // Canonicalize AND to LHS.
49516 if (N1.getOpcode() == ISD::AND)
49517 std::swap(N0, N1);
49518
49519 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49520 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49521 return false;
49522
49523 Mask = N1.getOperand(0);
49524 X = N1.getOperand(1);
49525
49526 // Check to see if the mask appeared in both the AND and ANDNP.
49527 if (N0.getOperand(0) == Mask)
49528 Y = N0.getOperand(1);
49529 else if (N0.getOperand(1) == Mask)
49530 Y = N0.getOperand(0);
49531 else
49532 return false;
49533
49534 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
49535 // ANDNP combine allows other combines to happen that prevent matching.
49536 return true;
49537}
49538
49539// Try to fold:
49540// (or (and (m, y), (pandn m, x)))
49541// into:
49542 // (vselect m, y, x)
49543// As a special case, try to fold:
49544// (or (and (m, (sub 0, x)), (pandn m, x)))
49545// into:
49546// (sub (xor X, M), M)
49547 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49548 const X86Subtarget &Subtarget) {
49549 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49550
49551 EVT VT = N->getValueType(0);
49552 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49553 (VT.is256BitVector() && Subtarget.hasInt256())))
49554 return SDValue();
49555
49556 SDValue X, Y, Mask;
49557 if (!matchLogicBlend(N, X, Y, Mask))
49558 return SDValue();
49559
49560 // Validate that X, Y, and Mask are bitcasts, and see through them.
49561 Mask = peekThroughBitcasts(Mask);
49562 X = peekThroughBitcasts(X);
49563 Y = peekThroughBitcasts(Y);
49564
49565 EVT MaskVT = Mask.getValueType();
49566 unsigned EltBits = MaskVT.getScalarSizeInBits();
49567
49568 // TODO: Attempt to handle floating point cases as well?
49569 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49570 return SDValue();
49571
49572 SDLoc DL(N);
49573
49574 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49575 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49576 DAG, Subtarget))
49577 return Res;
49578
49579 // PBLENDVB is only available on SSE 4.1.
49580 if (!Subtarget.hasSSE41())
49581 return SDValue();
49582
49583 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49584 if (Subtarget.hasVLX())
49585 return SDValue();
49586
49587 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49588
49589 X = DAG.getBitcast(BlendVT, X);
49590 Y = DAG.getBitcast(BlendVT, Y);
49591 Mask = DAG.getBitcast(BlendVT, Mask);
49592 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49593 return DAG.getBitcast(VT, Mask);
49594}
49595
49596// Helper function for combineOrCmpEqZeroToCtlzSrl
49597// Transforms:
49598// seteq(cmp x, 0)
49599// into:
49600// srl(ctlz x), log2(bitsize(x))
49601// Input pattern is checked by caller.
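// For example, for i32 x: ctlz(x) lies in [0, 32] and only the value 32 has
// bit 5 set, so (ctlz(x) >> 5) is exactly the zero-extended value of (x == 0).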
49602 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49603 SDValue Cmp = Op.getOperand(1);
49604 EVT VT = Cmp.getOperand(0).getValueType();
49605 unsigned Log2b = Log2_32(VT.getSizeInBits());
49606 SDLoc dl(Op);
49607 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49608 // The result of the shift is true or false, and on X86, the 32-bit
49609 // encoding of shr and lzcnt is more desirable.
49610 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49611 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49612 DAG.getConstant(Log2b, dl, MVT::i8));
49613 return Scc;
49614}
49615
49616// Try to transform:
49617// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49618// into:
49619// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
49620// Will also attempt to match more generic cases, eg:
49621// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49622// Only applies if the target supports the FastLZCNT feature.
49623 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49624 TargetLowering::DAGCombinerInfo &DCI,
49625 const X86Subtarget &Subtarget) {
49626 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49627 return SDValue();
49628
49629 auto isORCandidate = [](SDValue N) {
49630 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49631 };
49632
49633 // Check that the zero extend is extending to 32 bits or more. The code generated by
49634 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49635 // instructions to clear the upper bits.
49636 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49637 !isORCandidate(N->getOperand(0)))
49638 return SDValue();
49639
49640 // Check the node matches: setcc(eq, cmp 0)
49641 auto isSetCCCandidate = [](SDValue N) {
49642 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49643 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49644 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49645 isNullConstant(N->getOperand(1).getOperand(1)) &&
49646 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49647 };
49648
49649 SDNode *OR = N->getOperand(0).getNode();
49650 SDValue LHS = OR->getOperand(0);
49651 SDValue RHS = OR->getOperand(1);
49652
49653 // Save nodes matching or(or, setcc(eq, cmp 0)).
49654 SmallVector<SDNode *> ORNodes;
49655 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49656 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49657 ORNodes.push_back(OR);
49658 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49659 LHS = OR->getOperand(0);
49660 RHS = OR->getOperand(1);
49661 }
49662
49663 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49664 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49665 !isORCandidate(SDValue(OR, 0)))
49666 return SDValue();
49667
49668 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49669 // to
49670 // or(srl(ctlz),srl(ctlz)).
49671 // The dag combiner can then fold it into:
49672 // srl(or(ctlz, ctlz)).
49673 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49674 SDValue Ret, NewRHS;
49675 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49676 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49677
49678 if (!Ret)
49679 return SDValue();
49680
49681 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49682 while (!ORNodes.empty()) {
49683 OR = ORNodes.pop_back_val();
49684 LHS = OR->getOperand(0);
49685 RHS = OR->getOperand(1);
49686 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49687 if (RHS->getOpcode() == ISD::OR)
49688 std::swap(LHS, RHS);
49689 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49690 if (!NewRHS)
49691 return SDValue();
49692 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49693 }
49694
49695 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49696}
49697
49698 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49699 SDValue And1_L, SDValue And1_R,
49700 const SDLoc &DL, SelectionDAG &DAG) {
49701 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49702 return SDValue();
49703 SDValue NotOp = And0_L->getOperand(0);
49704 if (NotOp == And1_R)
49705 std::swap(And1_R, And1_L);
49706 if (NotOp != And1_L)
49707 return SDValue();
49708
49709 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49710 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49711 EVT VT = And1_L->getValueType(0);
49712 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49713 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49714 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49715 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49716 return Xor1;
49717}
49718
49719/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49720/// equivalent `((x ^ y) & m) ^ y)` pattern.
49721/// This is typically a better representation for targets without a fused
49722/// "and-not" operation. This function is intended to be called from a
49723/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
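/// As a quick sanity check of the rewrite: with m == ~0 the result is
/// ((x ^ y) & ~0) ^ y == x, and with m == 0 it is 0 ^ y == y, matching the
/// original (m & x) | (~m & y).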
49724 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49725 // Note that masked-merge variants using XOR or ADD expressions are
49726 // normalized to OR by InstCombine so we only check for OR.
49727 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49728 SDValue N0 = Node->getOperand(0);
49729 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49730 return SDValue();
49731 SDValue N1 = Node->getOperand(1);
49732 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49733 return SDValue();
49734
49735 SDLoc DL(Node);
49736 SDValue N00 = N0->getOperand(0);
49737 SDValue N01 = N0->getOperand(1);
49738 SDValue N10 = N1->getOperand(0);
49739 SDValue N11 = N1->getOperand(1);
49740 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49741 return Result;
49742 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49743 return Result;
49744 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49745 return Result;
49746 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49747 return Result;
49748 return SDValue();
49749}
49750
49751/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49752/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49753/// with CMP+{ADC, SBB}.
49754/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
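/// For example (illustrative), 'x + (a u< b)' becomes 'cmp a, b; adc x, 0' and
/// 'x - (a u< b)' becomes 'cmp a, b; sbb x, 0', consuming the carry flag
/// instead of materializing the setcc result in a register.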
49755static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
49756 SDValue X, SDValue Y,
49757 SelectionDAG &DAG,
49758 bool ZeroSecondOpOnly = false) {
49759 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
49760 return SDValue();
49761
49762 // Look through a one-use zext.
49763 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
49764 Y = Y.getOperand(0);
49765
49766 X86::CondCode CC = X86::COND_INVALID;
49767 SDValue EFLAGS;
49768 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
49769 CC = (X86::CondCode)Y.getConstantOperandVal(0);
49770 EFLAGS = Y.getOperand(1);
49771 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
49772 Y.hasOneUse()) {
49773 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
49774 }
49775
49776 if (!EFLAGS)
49777 return SDValue();
49778
49779 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49780 // the general case below.
49781 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49782 if (ConstantX && !ZeroSecondOpOnly) {
49783 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
49784 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
49785 // This is a complicated way to get -1 or 0 from the carry flag:
49786 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49787 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49788 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49789 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49790 EFLAGS);
49791 }
49792
49793 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
49794 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
49795 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49796 EFLAGS.getValueType().isInteger() &&
49797 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49798 // Swap the operands of a SUB, and we have the same pattern as above.
49799 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49800 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49801 SDValue NewSub = DAG.getNode(
49802 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49803 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49804 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49805 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49806 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49807 NewEFLAGS);
49808 }
49809 }
49810 }
49811
49812 if (CC == X86::COND_B) {
49813 // X + SETB Z --> adc X, 0
49814 // X - SETB Z --> sbb X, 0
49815 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49816 DAG.getVTList(VT, MVT::i32), X,
49817 DAG.getConstant(0, DL, VT), EFLAGS);
49818 }
49819
49820 if (ZeroSecondOpOnly)
49821 return SDValue();
49822
49823 if (CC == X86::COND_A) {
49824 // Try to convert COND_A into COND_B in an attempt to facilitate
49825 // materializing "setb reg".
49826 //
49827 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
49828 // cannot take an immediate as its first operand.
49829 //
49830 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49831 EFLAGS.getValueType().isInteger() &&
49832 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49833 SDValue NewSub =
49834 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49835 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49836 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49837 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49838 DAG.getVTList(VT, MVT::i32), X,
49839 DAG.getConstant(0, DL, VT), NewEFLAGS);
49840 }
49841 }
49842
49843 if (CC == X86::COND_AE) {
49844 // X + SETAE --> sbb X, -1
49845 // X - SETAE --> adc X, -1
49846 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49847 DAG.getVTList(VT, MVT::i32), X,
49848 DAG.getConstant(-1, DL, VT), EFLAGS);
49849 }
49850
49851 if (CC == X86::COND_BE) {
49852 // X + SETBE --> sbb X, -1
49853 // X - SETBE --> adc X, -1
49854 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49855 // materializing "setae reg".
49856 //
49857 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
49858 // cannot take an immediate as its first operand.
49859 //
49860 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49861 EFLAGS.getValueType().isInteger() &&
49862 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49863 SDValue NewSub =
49864 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49865 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49866 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49867 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49868 DAG.getVTList(VT, MVT::i32), X,
49869 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49870 }
49871 }
49872
49873 if (CC != X86::COND_E && CC != X86::COND_NE)
49874 return SDValue();
49875
49876 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49877 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49878 !EFLAGS.getOperand(0).getValueType().isInteger())
49879 return SDValue();
49880
49881 SDValue Z = EFLAGS.getOperand(0);
49882 EVT ZVT = Z.getValueType();
49883
49884 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49885 // the general case below.
49886 if (ConstantX) {
49887 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49888 // fake operands:
49889 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49890 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49891 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49892 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49893 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49894 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49895 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49896 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49897 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49898 SDValue(Neg.getNode(), 1));
49899 }
49900
49901 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49902 // with fake operands:
49903 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49904 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49905 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49906 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49907 SDValue One = DAG.getConstant(1, DL, ZVT);
49908 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49909 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49910 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49911 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49912 Cmp1.getValue(1));
49913 }
49914 }
49915
49916 // (cmp Z, 1) sets the carry flag if Z is 0.
49917 SDValue One = DAG.getConstant(1, DL, ZVT);
49918 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49919 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49920
49921 // Add the flags type for ADC/SBB nodes.
49922 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49923
49924 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49925 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49926 if (CC == X86::COND_NE)
49927 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49928 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49929
49930 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49931 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49932 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49933 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49934}
49935
49936/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49937/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49938/// with CMP+{ADC, SBB}.
49939 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49940 bool IsSub = N->getOpcode() == ISD::SUB;
49941 SDValue X = N->getOperand(0);
49942 SDValue Y = N->getOperand(1);
49943 EVT VT = N->getValueType(0);
49944 SDLoc DL(N);
49945
49946 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49947 return ADCOrSBB;
49948
49949 // Commute and try again (negate the result for subtracts).
49950 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49951 if (IsSub)
49952 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
49953 return ADCOrSBB;
49954 }
49955
49956 return SDValue();
49957}
49958
49959 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49960 SelectionDAG &DAG) {
49961 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49962 "Unexpected opcode");
49963
49964 // Delegate to combineAddOrSubToADCOrSBB if we have:
49965 //
49966 // (xor/or (zero_extend (setcc)) imm)
49967 //
49968 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49969 // equivalent to a SUB/ADD, respectively.
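  // e.g. since the zero-extended setcc is 0 or 1, (xor (zext setcc), 5) equals
  // (sub 5, (zext setcc)), and (or (zext setcc), 4) equals
  // (add 4, (zext setcc)).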
49970 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49971 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49972 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49973 bool IsSub = N->getOpcode() == ISD::XOR;
49974 bool N1COdd = N1C->getZExtValue() & 1;
49975 if (IsSub ? N1COdd : !N1COdd) {
49976 SDLoc DL(N);
49977 EVT VT = N->getValueType(0);
49978 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49979 return R;
49980 }
49981 }
49982 }
49983
49984 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
49985 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
49986 N0.getOperand(0).getOpcode() == ISD::AND &&
49987 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
49988 ISD::isBuildVectorAllOnes(N1.getNode())) {
49989 MVT VT = N->getSimpleValueType(0);
49990 APInt UndefElts;
49991 SmallVector<APInt> EltBits;
49992 if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1),
49993 VT.getScalarSizeInBits(), UndefElts,
49994 EltBits)) {
49995 bool IsPow2OrUndef = true;
49996 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
49997 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
49998
49999 if (IsPow2OrUndef)
50000 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
50001 N0.getOperand(0).getOperand(1));
50002 }
50003 }
50004
50005 return SDValue();
50006}
50007
50008 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50009 TargetLowering::DAGCombinerInfo &DCI,
50010 const X86Subtarget &Subtarget) {
50011 SDValue N0 = N->getOperand(0);
50012 SDValue N1 = N->getOperand(1);
50013 EVT VT = N->getValueType(0);
50014 SDLoc dl(N);
50015 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50016
50017 // If this is SSE1 only convert to FOR to avoid scalarization.
50018 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50019 return DAG.getBitcast(MVT::v4i32,
50020 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50021 DAG.getBitcast(MVT::v4f32, N0),
50022 DAG.getBitcast(MVT::v4f32, N1)));
50023 }
50024
50025 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50026 // TODO: Support multiple SrcOps.
50027 if (VT == MVT::i1) {
50028 SmallVector<SDValue, 2> SrcOps;
50029 SmallVector<APInt, 2> SrcPartials;
50030 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50031 SrcOps.size() == 1) {
50032 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50033 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50034 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50035 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50036 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50037 if (Mask) {
50038 assert(SrcPartials[0].getBitWidth() == NumElts &&
50039 "Unexpected partial reduction mask");
50040 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50041 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50042 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50043 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50044 }
50045 }
50046 }
50047
50048 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50049 return R;
50050
50051 if (SDValue R = combineBitOpWithShift(N, DAG))
50052 return R;
50053
50054 if (SDValue R = combineBitOpWithPACK(N, DAG))
50055 return R;
50056
50057 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50058 return FPLogic;
50059
50060 if (DCI.isBeforeLegalizeOps())
50061 return SDValue();
50062
50063 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50064 return R;
50065
50066 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50067 return R;
50068
50069 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50070 return R;
50071
50072 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
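  // e.g. with C == 3: the OR yields -1 when SetCC is true and 3 otherwise,
  // and (zext (not SetCC)) * 4 - 1 produces the same two values while the
  // multiply by 4 can fold into an LEA.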
50073 if ((VT == MVT::i32 || VT == MVT::i64) &&
50074 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50075 isNullConstant(N0.getOperand(0))) {
50076 SDValue Cond = N0.getOperand(1);
50077 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50078 Cond = Cond.getOperand(0);
50079
50080 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50081 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50082 uint64_t Val = CN->getZExtValue();
50083 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50084 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50085 CCode = X86::GetOppositeBranchCondition(CCode);
50086 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50087
50088 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50089 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50090 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50091 return R;
50092 }
50093 }
50094 }
50095 }
50096
50097 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50098 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50099 // iff the upper elements of the non-shifted arg are zero.
50100 // KUNPCK requires 16+ bool vector elements.
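  // e.g. with v32i1 operands, or(X, kshiftl(Y, 16)) where the upper 16
  // elements of X are known zero becomes concat_vectors(lo16(X), lo16(Y)),
  // which can lower to a single KUNPCK of the two mask registers.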
50101 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50102 unsigned NumElts = VT.getVectorNumElements();
50103 unsigned HalfElts = NumElts / 2;
50104 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50105 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50106 N1.getConstantOperandAPInt(1) == HalfElts &&
50107 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50108 return DAG.getNode(
50109 ISD::CONCAT_VECTORS, dl, VT,
50110 extractSubVector(N0, 0, DAG, dl, HalfElts),
50111 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50112 }
50113 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50114 N0.getConstantOperandAPInt(1) == HalfElts &&
50115 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50116 return DAG.getNode(
50117 ISD::CONCAT_VECTORS, dl, VT,
50118 extractSubVector(N1, 0, DAG, dl, HalfElts),
50119 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50120 }
50121 }
50122
50123 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50124 // Attempt to recursively combine an OR of shuffles.
50125 SDValue Op(N, 0);
50126 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50127 return Res;
50128
50129 // If either operand is a constant mask, then only the elements that aren't
50130 // allones are actually demanded by the other operand.
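    // e.g. for or(X, <-1, 0, -1, 0>) only elements 1 and 3 of X affect the
    // result, so the other lanes of X need not be computed.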
50131 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50132 APInt UndefElts;
50133 SmallVector<APInt> EltBits;
50134 int NumElts = VT.getVectorNumElements();
50135 int EltSizeInBits = VT.getScalarSizeInBits();
50136 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50137 return false;
50138
50139 APInt DemandedElts = APInt::getZero(NumElts);
50140 for (int I = 0; I != NumElts; ++I)
50141 if (!EltBits[I].isAllOnes())
50142 DemandedElts.setBit(I);
50143
50144 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50145 };
50146 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50147 if (N->getOpcode() != ISD::DELETED_NODE)
50148 DCI.AddToWorklist(N);
50149 return SDValue(N, 0);
50150 }
50151 }
50152
50153 // We should fold "masked merge" patterns when `andn` is not available.
50154 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50155 if (SDValue R = foldMaskedMerge(N, DAG))
50156 return R;
50157
50158 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50159 return R;
50160
50161 return SDValue();
50162}
50163
50164/// Try to turn tests against the signbit in the form of:
50165/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50166/// into:
50167/// SETGT(X, -1)
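/// e.g. for i32 x, (srl x, 31) is the sign bit, so xor'ing the truncated
/// result with 1 yields 1 exactly when x is non-negative, which is the same
/// as setgt(x, -1).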
50168 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50169 // This is only worth doing if the output type is i8 or i1.
50170 EVT ResultType = N->getValueType(0);
50171 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50172 return SDValue();
50173
50174 SDValue N0 = N->getOperand(0);
50175 SDValue N1 = N->getOperand(1);
50176
50177 // We should be performing an xor against a truncated shift.
50178 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50179 return SDValue();
50180
50181 // Make sure we are performing an xor against one.
50182 if (!isOneConstant(N1))
50183 return SDValue();
50184
50185 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50186 SDValue Shift = N0.getOperand(0);
50187 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50188 return SDValue();
50189
50190 // Make sure we are truncating from one of i16, i32 or i64.
50191 EVT ShiftTy = Shift.getValueType();
50192 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50193 return SDValue();
50194
50195 // Make sure the shift amount extracts the sign bit.
50196 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50197 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50198 return SDValue();
50199
50200 // Create a greater-than comparison against -1.
50201 // N.B. Using SETGE against 0 works but we want a canonical-looking
50202 // comparison; using SETGT matches up with what TranslateX86CC expects.
50203 SDLoc DL(N);
50204 SDValue ShiftOp = Shift.getOperand(0);
50205 EVT ShiftOpTy = ShiftOp.getValueType();
50206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50207 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50208 *DAG.getContext(), ResultType);
50209 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50210 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50211 if (SetCCResultType != ResultType)
50212 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50213 return Cond;
50214}
50215
50216/// Turn vector tests of the signbit in the form of:
50217/// xor (sra X, elt_size(X)-1), -1
50218/// into:
50219/// pcmpgt X, -1
50220///
50221/// This should be called before type legalization because the pattern may not
50222/// persist after that.
50223 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50224 const X86Subtarget &Subtarget) {
50225 EVT VT = N->getValueType(0);
50226 if (!VT.isSimple())
50227 return SDValue();
50228
50229 switch (VT.getSimpleVT().SimpleTy) {
50230 // clang-format off
50231 default: return SDValue();
50232 case MVT::v16i8:
50233 case MVT::v8i16:
50234 case MVT::v4i32:
50235 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50236 case MVT::v32i8:
50237 case MVT::v16i16:
50238 case MVT::v8i32:
50239 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50240 // clang-format on
50241 }
50242
50243 // There must be a shift right algebraic before the xor, and the xor must be a
50244 // 'not' operation.
50245 SDValue Shift = N->getOperand(0);
50246 SDValue Ones = N->getOperand(1);
50247 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50248 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50249 return SDValue();
50250
50251 // The shift should be smearing the sign bit across each vector element.
50252 auto *ShiftAmt =
50253 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50254 if (!ShiftAmt ||
50255 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50256 return SDValue();
50257
50258 // Create a greater-than comparison against -1. We don't use the more obvious
50259 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50260 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50261}
50262
50263/// Detect patterns of truncation with unsigned saturation:
50264///
50265/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50266/// Return the source value x to be truncated or SDValue() if the pattern was
50267/// not matched.
50268///
50269/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50270/// where C1 >= 0 and C2 is unsigned max of destination type.
50271///
50272/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50273/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50274///
50275/// These two patterns are equivalent to:
50276/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50277/// So return the smax(x, C1) value to be truncated or SDValue() if the
50278/// pattern was not matched.
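/// For example (illustrative), for an i32 -> i16 truncation the unsigned max
/// of the destination type is 65535, so (trunc (umin x, 65535)) is already
/// saturating and the source x is returned.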
50279 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50280 const SDLoc &DL) {
50281 EVT InVT = In.getValueType();
50282
50283 // Saturation with truncation. We truncate from InVT to VT.
50285 "Unexpected types for truncate operation");
50286
50287 // Match min/max and return limit value as a parameter.
50288 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50289 if (V.getOpcode() == Opcode &&
50290 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50291 return V.getOperand(0);
50292 return SDValue();
50293 };
50294
50295 APInt C1, C2;
50296 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50297 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
50298 // the element size of the destination type.
50299 if (C2.isMask(VT.getScalarSizeInBits()))
50300 return UMin;
50301
50302 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50303 if (MatchMinMax(SMin, ISD::SMAX, C1))
50304 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50305 return SMin;
50306
50307 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50308 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50309 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50310 C2.uge(C1)) {
50311 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50312 }
50313
50314 return SDValue();
50315}
50316
50317/// Detect patterns of truncation with signed saturation:
50318/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50319/// signed_max_of_dest_type)) to dest_type)
50320/// or:
50321/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50322/// signed_min_of_dest_type)) to dest_type).
50323/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50324/// Return the source value to be truncated or SDValue() if the pattern was not
50325/// matched.
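/// For example (illustrative), for an i16 destination the clamp range is
/// [-32768, 32767], or [0, 65535] when MatchPackUS is set.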
50326static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50327 unsigned NumDstBits = VT.getScalarSizeInBits();
50328 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50329 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50330
50331 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50332 const APInt &Limit) -> SDValue {
50333 APInt C;
50334 if (V.getOpcode() == Opcode &&
50335 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50336 return V.getOperand(0);
50337 return SDValue();
50338 };
50339
50340 APInt SignedMax, SignedMin;
50341 if (MatchPackUS) {
50342 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50343 SignedMin = APInt(NumSrcBits, 0);
50344 } else {
50345 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50346 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50347 }
50348
50349 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50350 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50351 return SMax;
50352
50353 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50354 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50355 return SMin;
50356
50357 return SDValue();
50358}
50359
50360 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
50361 SelectionDAG &DAG,
50362 const X86Subtarget &Subtarget) {
50363 if (!Subtarget.hasSSE2() || !VT.isVector())
50364 return SDValue();
50365
50366 EVT SVT = VT.getVectorElementType();
50367 EVT InVT = In.getValueType();
50368 EVT InSVT = InVT.getVectorElementType();
50369
50370 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50371 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50372 // and concatenate at the same time. Then we can use a final vpmovuswb to
50373 // clip to 0-255.
50374 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50375 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50376 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50377 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50378 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50379 DL, DAG, Subtarget);
50380 assert(Mid && "Failed to pack!");
50381 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50382 }
50383 }
50384
50385 // vXi32 truncate instructions are available with AVX512F.
50386 // vXi16 truncate instructions are only available with AVX512BW.
50387 // For 256-bit or smaller vectors, we require VLX.
50388 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50389 // If the result type is 256 bits or larger and we have disabled 512-bit
50390 // registers, we should go ahead and use the pack instructions if possible.
50391 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50392 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50393 (InVT.getSizeInBits() > 128) &&
50394 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50395 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50396
50397 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
50398 isPowerOf2_32(VT.getVectorNumElements()) &&
50399 (SVT == MVT::i8 || SVT == MVT::i16) &&
50400 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50401 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50402 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50403 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50404 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50405 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50406 DAG, Subtarget);
50407 assert(Mid && "Failed to pack!");
50408 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50409 Subtarget);
50410 assert(V && "Failed to pack!");
50411 return V;
50412 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50413 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50414 Subtarget);
50415 }
50416 if (SDValue SSatVal = detectSSatPattern(In, VT))
50417 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50418 Subtarget);
50419 }
50420
50421 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50422 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50423 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50424 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50425 unsigned TruncOpc = 0;
50426 SDValue SatVal;
50427 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50428 SatVal = SSatVal;
50429 TruncOpc = X86ISD::VTRUNCS;
50430 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50431 SatVal = USatVal;
50432 TruncOpc = X86ISD::VTRUNCUS;
50433 }
50434 if (SatVal) {
50435 unsigned ResElts = VT.getVectorNumElements();
50436 // If the input type is less than 512 bits and we don't have VLX, we need
50437 // to widen to 512 bits.
50438 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50439 unsigned NumConcats = 512 / InVT.getSizeInBits();
50440 ResElts *= NumConcats;
50441 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50442 ConcatOps[0] = SatVal;
50443 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50444 NumConcats * InVT.getVectorNumElements());
50445 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50446 }
50447 // Widen the result if its narrower than 128 bits.
50448 if (ResElts * SVT.getSizeInBits() < 128)
50449 ResElts = 128 / SVT.getSizeInBits();
50450 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50451 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50452 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50453 DAG.getIntPtrConstant(0, DL));
50454 }
50455 }
50456
50457 return SDValue();
50458}
50459
50460/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50461 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50462/// ISD::AVGCEILU (AVG) instruction.
50463 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50464 const X86Subtarget &Subtarget,
50465 const SDLoc &DL) {
50466 if (!VT.isVector())
50467 return SDValue();
50468 EVT InVT = In.getValueType();
50469 unsigned NumElems = VT.getVectorNumElements();
50470
50471 EVT ScalarVT = VT.getVectorElementType();
50472 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50473 return SDValue();
50474
50475 // InScalarVT is the intermediate type in AVG pattern and it should be greater
50476 // than the original input type (i8/i16).
50477 EVT InScalarVT = InVT.getVectorElementType();
50478 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50479 return SDValue();
50480
50481 if (!Subtarget.hasSSE2())
50482 return SDValue();
50483
50484 // Detect the following pattern:
50485 //
50486 // %1 = zext <N x i8> %a to <N x i32>
50487 // %2 = zext <N x i8> %b to <N x i32>
50488 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50489 // %4 = add nuw nsw <N x i32> %3, %2
50490 // %5 = lshr <N x i32> %4, <i32 1 x N>
50491 // %6 = trunc <N x i32> %5 to <N x i8>
50492 //
50493 // In AVX512, the last instruction can also be a trunc store.
50494 if (In.getOpcode() != ISD::SRL)
50495 return SDValue();
50496
50497 // A lambda checking the given SDValue is a constant vector and each element
50498 // is in the range [Min, Max].
50499 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50500 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50501 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50502 });
50503 };
50504
50505 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50506 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50507 return MaxActiveBits <= ScalarVT.getSizeInBits();
50508 };
50509
50510 // Check if each element of the vector is right-shifted by one.
50511 SDValue LHS = In.getOperand(0);
50512 SDValue RHS = In.getOperand(1);
50513 if (!IsConstVectorInRange(RHS, 1, 1))
50514 return SDValue();
50515 if (LHS.getOpcode() != ISD::ADD)
50516 return SDValue();
50517
50518 // Detect a pattern of a + b + 1 where the order doesn't matter.
50519 SDValue Operands[3];
50520 Operands[0] = LHS.getOperand(0);
50521 Operands[1] = LHS.getOperand(1);
50522
50523 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50524 ArrayRef<SDValue> Ops) {
50525 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50526 };
50527
50528 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50529 for (SDValue &Op : Ops)
50530 if (Op.getValueType() != VT)
50531 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50532 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50533 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50534 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50535 if (NumElemsPow2 != NumElems) {
50536 for (SDValue &Op : Ops) {
50537 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50538 for (unsigned i = 0; i != NumElems; ++i) {
50539 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50540 EltsOfOp[i] =
50541 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50542 }
50543 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50544 }
50545 }
50546 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50547 if (NumElemsPow2 == NumElems)
50548 return Res;
50549 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50550 DAG.getIntPtrConstant(0, DL));
50551 };
50552
50553 // Take care of the case when one of the operands is a constant vector whose
50554 // element is in the range [1, 256].
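  // (Since avgceilu(a, b) == (a + b + 1) >> 1, the expression (a + C) >> 1 with
  // 1 <= C <= 256 is avgceilu(a, C - 1), which is why one is subtracted from
  // the constant below.)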
50555 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50556 IsZExtLike(Operands[0])) {
50557 // The pattern is detected. Subtract one from the constant vector, then
50558 // demote it and emit X86ISD::AVG instruction.
50559 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50560 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50561 return AVGSplitter({Operands[0], Operands[1]});
50562 }
50563
50564 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50565 // Match the or case only if its 'add-like' - can be replaced by an add.
50566 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50567 if (ISD::ADD == V.getOpcode()) {
50568 Op0 = V.getOperand(0);
50569 Op1 = V.getOperand(1);
50570 return true;
50571 }
50572 if (ISD::ZERO_EXTEND != V.getOpcode())
50573 return false;
50574 V = V.getOperand(0);
50575 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50576 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50577 return false;
50578 Op0 = V.getOperand(0);
50579 Op1 = V.getOperand(1);
50580 return true;
50581 };
50582
50583 SDValue Op0, Op1;
50584 if (FindAddLike(Operands[0], Op0, Op1))
50585 std::swap(Operands[0], Operands[1]);
50586 else if (!FindAddLike(Operands[1], Op0, Op1))
50587 return SDValue();
50588 Operands[2] = Op0;
50589 Operands[1] = Op1;
50590
50591 // Now we have three operands of two additions. Check that one of them is a
50592 // constant vector with ones, and the other two can be promoted from i8/i16.
50593 for (SDValue &Op : Operands) {
50594 if (!IsConstVectorInRange(Op, 1, 1))
50595 continue;
50596 std::swap(Op, Operands[2]);
50597
50598 // Check if Operands[0] and Operands[1] are results of type promotion.
50599 for (int j = 0; j < 2; ++j)
50600 if (Operands[j].getValueType() != VT)
50601 if (!IsZExtLike(Operands[j]))
50602 return SDValue();
50603
50604 // The pattern is detected, emit X86ISD::AVG instruction(s).
50605 return AVGSplitter({Operands[0], Operands[1]});
50606 }
50607
50608 return SDValue();
50609}
50610
50611 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50612 TargetLowering::DAGCombinerInfo &DCI,
50613 const X86Subtarget &Subtarget) {
50614 LoadSDNode *Ld = cast<LoadSDNode>(N);
50615 EVT RegVT = Ld->getValueType(0);
50616 EVT MemVT = Ld->getMemoryVT();
50617 SDLoc dl(Ld);
50618 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50619
50620 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50621 // into two 16-byte operations. Also split non-temporal aligned loads on
50622 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50623 ISD::LoadExtType Ext = Ld->getExtensionType();
50624 unsigned Fast;
50625 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50626 Ext == ISD::NON_EXTLOAD &&
50627 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50628 Ld->getAlign() >= Align(16)) ||
50629 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50630 *Ld->getMemOperand(), &Fast) &&
50631 !Fast))) {
50632 unsigned NumElems = RegVT.getVectorNumElements();
50633 if (NumElems < 2)
50634 return SDValue();
50635
50636 unsigned HalfOffset = 16;
50637 SDValue Ptr1 = Ld->getBasePtr();
50638 SDValue Ptr2 =
50639 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
50640 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50641 NumElems / 2);
50642 SDValue Load1 =
50643 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50644 Ld->getOriginalAlign(),
50645 Ld->getMemOperand()->getFlags());
50646 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50647 Ld->getPointerInfo().getWithOffset(HalfOffset),
50648 Ld->getOriginalAlign(),
50649 Ld->getMemOperand()->getFlags());
50650 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50651 Load1.getValue(1), Load2.getValue(1));
50652
50653 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50654 return DCI.CombineTo(N, NewVec, TF, true);
50655 }
50656
50657 // Bool vector load - attempt to cast to an integer, as we have good
50658 // (vXiY *ext(vXi1 bitcast(iX))) handling.
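  // e.g. a plain v16i1 load can be turned into an i16 scalar load followed by
  // a bitcast back to v16i1.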
50659 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50660 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50661 unsigned NumElts = RegVT.getVectorNumElements();
50662 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50663 if (TLI.isTypeLegal(IntVT)) {
50664 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50665 Ld->getPointerInfo(),
50666 Ld->getOriginalAlign(),
50667 Ld->getMemOperand()->getFlags());
50668 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50669 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50670 }
50671 }
50672
50673 // If we also load/broadcast this to a wider type, then just extract the
50674 // lowest subvector.
50675 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50676 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50677 SDValue Ptr = Ld->getBasePtr();
50678 SDValue Chain = Ld->getChain();
50679 for (SDNode *User : Chain->uses()) {
50680 auto *UserLd = dyn_cast<MemSDNode>(User);
50681 if (User != N && UserLd &&
50682 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
50683 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50684 ISD::isNormalLoad(User)) &&
50685 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
50686 User->getValueSizeInBits(0).getFixedValue() >
50687 RegVT.getFixedSizeInBits()) {
50688 if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50689 UserLd->getBasePtr() == Ptr &&
50690 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
50691 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50692 RegVT.getSizeInBits());
50693 Extract = DAG.getBitcast(RegVT, Extract);
50694 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50695 }
50696 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
50697 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
50698 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
50699 if (Undefs[I])
50700 continue;
50701 if (UserUndefs[I] || Bits[I] != UserBits[I])
50702 return false;
50703 }
50704 return true;
50705 };
50706 // See if we are loading a constant that matches in the lower
50707 // bits of a longer constant (but from a different constant pool ptr).
50708 EVT UserVT = User->getValueType(0);
50709 SDValue UserPtr = UserLd->getBasePtr();
50710 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
50711 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
50712 if (LdC && UserC && UserPtr != Ptr) {
50713 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
50714 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
50715 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
50716 APInt Undefs, UserUndefs;
50717 SmallVector<APInt> Bits, UserBits;
50718 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
50719 UserVT.getScalarSizeInBits());
50720 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
50721 Bits) &&
50722 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
50723 UserUndefs, UserBits)) {
50724 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
50725 SDValue Extract = extractSubVector(
50726 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
50727 Extract = DAG.getBitcast(RegVT, Extract);
50728 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50729 }
50730 }
50731 }
50732 }
50733 }
50734 }
50735 }
50736
50737 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50738 unsigned AddrSpace = Ld->getAddressSpace();
50739 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50740 AddrSpace == X86AS::PTR32_UPTR) {
50741 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50742 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50743 SDValue Cast =
50744 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50745 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
50746 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50747 Ld->getMemOperand()->getFlags());
50748 }
50749 }
50750
50751 return SDValue();
50752}
50753
50754/// If V is a build vector of boolean constants and exactly one of those
50755/// constants is true, return the operand index of that true element.
50756/// Otherwise, return -1.
50757static int getOneTrueElt(SDValue V) {
50758 // This needs to be a build vector of booleans.
50759 // TODO: Checking for the i1 type matches the IR definition for the mask,
50760 // but the mask check could be loosened to i8 or other types. That might
50761 // also require checking more than 'allOnesValue'; eg, the x86 HW
50762 // instructions only require that the MSB is set for each mask element.
50763 // The ISD::MSTORE comments/definition do not specify how the mask operand
50764 // is formatted.
50765 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50766 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50767 return -1;
50768
50769 int TrueIndex = -1;
50770 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50771 for (unsigned i = 0; i < NumElts; ++i) {
50772 const SDValue &Op = BV->getOperand(i);
50773 if (Op.isUndef())
50774 continue;
50775 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50776 if (!ConstNode)
50777 return -1;
50778 if (ConstNode->getAPIntValue().countr_one() >= 1) {
50779 // If we already found a one, this is too many.
50780 if (TrueIndex >= 0)
50781 return -1;
50782 TrueIndex = i;
50783 }
50784 }
50785 return TrueIndex;
50786}
50787
50788/// Given a masked memory load/store operation, return true if it has one mask
50789/// bit set. If it has one mask bit set, then also return the memory address of
50790/// the scalar element to load/store, the vector index to insert/extract that
50791/// scalar element, and the alignment for the scalar memory access.
50792 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50793 SelectionDAG &DAG, SDValue &Addr,
50794 SDValue &Index, Align &Alignment,
50795 unsigned &Offset) {
50796 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50797 if (TrueMaskElt < 0)
50798 return false;
50799
50800 // Get the address of the one scalar element that is specified by the mask
50801 // using the appropriate offset from the base pointer.
50802 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50803 Offset = 0;
50804 Addr = MaskedOp->getBasePtr();
50805 if (TrueMaskElt != 0) {
50806 Offset = TrueMaskElt * EltVT.getStoreSize();
50807 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
50808 SDLoc(MaskedOp));
50809 }
50810
50811 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50812 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50813 EltVT.getStoreSize());
50814 return true;
50815}
50816
50817/// If exactly one element of the mask is set for a non-extending masked load,
50818/// it is a scalar load and vector insert.
50819/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50820/// mask have already been optimized in IR, so we don't bother with those here.
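/// For example (illustrative), a masked load of v4i32 with mask <0,0,1,0>
/// becomes a scalar i32 load from base+8 inserted into the pass-through
/// vector at index 2.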
50821static SDValue
50822 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50823 TargetLowering::DAGCombinerInfo &DCI,
50824 const X86Subtarget &Subtarget) {
50825 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50826 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50827 // However, some target hooks may need to be added to know when the transform
50828 // is profitable. Endianness would also have to be considered.
50829
50830 SDValue Addr, VecIndex;
50831 Align Alignment;
50832 unsigned Offset;
50833 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50834 return SDValue();
50835
50836 // Load the one scalar element that is specified by the mask using the
50837 // appropriate offset from the base pointer.
50838 SDLoc DL(ML);
50839 EVT VT = ML->getValueType(0);
50840 EVT EltVT = VT.getVectorElementType();
50841
50842 EVT CastVT = VT;
50843 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50844 EltVT = MVT::f64;
50845 CastVT = VT.changeVectorElementType(EltVT);
50846 }
50847
50848 SDValue Load =
50849 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50850 ML->getPointerInfo().getWithOffset(Offset),
50851 Alignment, ML->getMemOperand()->getFlags());
50852
50853 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50854
50855 // Insert the loaded element into the appropriate place in the vector.
50856 SDValue Insert =
50857 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50858 Insert = DAG.getBitcast(VT, Insert);
50859 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50860}
50861
50862static SDValue
50863 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50864 TargetLowering::DAGCombinerInfo &DCI) {
50865 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50866 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50867 return SDValue();
50868
50869 SDLoc DL(ML);
50870 EVT VT = ML->getValueType(0);
50871
50872 // If we are loading the first and last elements of a vector, it is safe and
50873 // always faster to load the whole vector. Replace the masked load with a
50874 // vector load and select.
50875 unsigned NumElts = VT.getVectorNumElements();
50876 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50877 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50878 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50879 if (LoadFirstElt && LoadLastElt) {
50880 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50881 ML->getMemOperand());
50882 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50883 ML->getPassThru());
50884 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50885 }
50886
50887 // Convert a masked load with a constant mask into a masked load and a select.
50888 // This allows the select operation to use a faster kind of select instruction
50889 // (for example, vblendvps -> vblendps).
50890
50891 // Don't try this if the pass-through operand is already undefined. That would
50892 // cause an infinite loop because that's what we're about to create.
50893 if (ML->getPassThru().isUndef())
50894 return SDValue();
50895
50896 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50897 return SDValue();
50898
50899 // The new masked load has an undef pass-through operand. The select uses the
50900 // original pass-through operand.
50901 SDValue NewML = DAG.getMaskedLoad(
50902 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50903 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50904 ML->getAddressingMode(), ML->getExtensionType());
50905 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50906 ML->getPassThru());
50907
50908 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50909}
50910
50911 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50912 TargetLowering::DAGCombinerInfo &DCI,
50913 const X86Subtarget &Subtarget) {
50914 auto *Mld = cast<MaskedLoadSDNode>(N);
50915
50916 // TODO: Expanding load with constant mask may be optimized as well.
50917 if (Mld->isExpandingLoad())
50918 return SDValue();
50919
50920 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50921 if (SDValue ScalarLoad =
50922 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50923 return ScalarLoad;
50924
50925 // TODO: Do some AVX512 subsets benefit from this transform?
50926 if (!Subtarget.hasAVX512())
50927 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50928 return Blend;
50929 }
50930
50931 // If the mask value has been legalized to a non-boolean vector, try to
50932 // simplify ops leading up to it. We only demand the MSB of each lane.
50933 SDValue Mask = Mld->getMask();
50934 if (Mask.getScalarValueSizeInBits() != 1) {
50935 EVT VT = Mld->getValueType(0);
50936 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50937 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50938 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50939 if (N->getOpcode() != ISD::DELETED_NODE)
50940 DCI.AddToWorklist(N);
50941 return SDValue(N, 0);
50942 }
50943 if (SDValue NewMask =
50944 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50945 return DAG.getMaskedLoad(
50946 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50947 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50948 Mld->getAddressingMode(), Mld->getExtensionType());
50949 }
50950
50951 return SDValue();
50952}
50953
50954/// If exactly one element of the mask is set for a non-truncating masked store,
50955/// it is a vector extract and scalar store.
50956/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50957/// mask have already been optimized in IR, so we don't bother with those here.
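/// e.g. a v4i32 masked store whose mask is <0,0,-1,0> becomes a plain i32 store
/// of element 2 at (base pointer + 8).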
50958static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50959 SelectionDAG &DAG,
50960 const X86Subtarget &Subtarget) {
50961 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50962 // However, some target hooks may need to be added to know when the transform
50963 // is profitable. Endianness would also have to be considered.
50964
50965 SDValue Addr, VecIndex;
50966 Align Alignment;
50967 unsigned Offset;
50968 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50969 return SDValue();
50970
50971 // Extract the one scalar element that is actually being stored.
50972 SDLoc DL(MS);
50973 SDValue Value = MS->getValue();
50974 EVT VT = Value.getValueType();
50975 EVT EltVT = VT.getVectorElementType();
50976 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50977 EltVT = MVT::f64;
50978 EVT CastVT = VT.changeVectorElementType(EltVT);
50979 Value = DAG.getBitcast(CastVT, Value);
50980 }
50981 SDValue Extract =
50982 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50983
50984 // Store that element at the appropriate offset from the base pointer.
50985 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50986 MS->getPointerInfo().getWithOffset(Offset),
50987 Alignment, MS->getMemOperand()->getFlags());
50988}
50989
50990static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50991 TargetLowering::DAGCombinerInfo &DCI,
50992 const X86Subtarget &Subtarget) {
50993 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50994 if (Mst->isCompressingStore())
50995 return SDValue();
50996
50997 EVT VT = Mst->getValue().getValueType();
50998 SDLoc dl(Mst);
50999 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51000
51001 if (Mst->isTruncatingStore())
51002 return SDValue();
51003
51004 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51005 return ScalarStore;
51006
51007 // If the mask value has been legalized to a non-boolean vector, try to
51008 // simplify ops leading up to it. We only demand the MSB of each lane.
51009 SDValue Mask = Mst->getMask();
51010 if (Mask.getScalarValueSizeInBits() != 1) {
51011 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51012 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51013 if (N->getOpcode() != ISD::DELETED_NODE)
51014 DCI.AddToWorklist(N);
51015 return SDValue(N, 0);
51016 }
51017 if (SDValue NewMask =
51018 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51019 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51020 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51021 Mst->getMemoryVT(), Mst->getMemOperand(),
51022 Mst->getAddressingMode());
51023 }
51024
51025 SDValue Value = Mst->getValue();
51026 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51027 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51028 Mst->getMemoryVT())) {
51029 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51030 Mst->getBasePtr(), Mst->getOffset(), Mask,
51031 Mst->getMemoryVT(), Mst->getMemOperand(),
51032 Mst->getAddressingMode(), true);
51033 }
51034
51035 return SDValue();
51036}
51037
51038static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51039 TargetLowering::DAGCombinerInfo &DCI,
51040 const X86Subtarget &Subtarget) {
51041 StoreSDNode *St = cast<StoreSDNode>(N);
51042 EVT StVT = St->getMemoryVT();
51043 SDLoc dl(St);
51044 SDValue StoredVal = St->getValue();
51045 EVT VT = StoredVal.getValueType();
51046 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51047
51048 // Convert a store of vXi1 into a store of iX and a bitcast.
51049 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51050 VT.getVectorElementType() == MVT::i1) {
51051
51052 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51053 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51054
51055 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51056 St->getPointerInfo(), St->getOriginalAlign(),
51057 St->getMemOperand()->getFlags());
51058 }
51059
51060 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51061 // This will avoid a copy to k-register.
51062 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51063 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51064 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51065 SDValue Val = StoredVal.getOperand(0);
51066 // We must store zeros to the unused bits.
51067 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51068 return DAG.getStore(St->getChain(), dl, Val,
51069 St->getBasePtr(), St->getPointerInfo(),
51070 St->getOriginalAlign(),
51071 St->getMemOperand()->getFlags());
51072 }
51073
51074 // Widen v1i1/v2i1/v4i1 stores to v8i1.
51075 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51076 Subtarget.hasAVX512()) {
51077 unsigned NumConcats = 8 / VT.getVectorNumElements();
51078 // We must store zeros to the unused bits.
51079 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51080 Ops[0] = StoredVal;
51081 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51082 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51083 St->getPointerInfo(), St->getOriginalAlign(),
51084 St->getMemOperand()->getFlags());
51085 }
51086
51087 // Turn vXi1 stores of constants into a scalar store.
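  // e.g. a store of a constant v16i1 value becomes a single i16 store of the
  // equivalent bit pattern (element 0 in bit 0).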
51088 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51089 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51090 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51091 // If it's a v64i1 store without 64-bit support, we need two stores.
51092 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51093 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51094 StoredVal->ops().slice(0, 32));
51095 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51096 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51097 StoredVal->ops().slice(32, 32));
51098 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51099
51100 SDValue Ptr0 = St->getBasePtr();
51101 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
51102
51103 SDValue Ch0 =
51104 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51105 St->getOriginalAlign(),
51106 St->getMemOperand()->getFlags());
51107 SDValue Ch1 =
51108 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51109 St->getPointerInfo().getWithOffset(4),
51110 St->getOriginalAlign(),
51111 St->getMemOperand()->getFlags());
51112 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51113 }
51114
51115 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51116 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51117 St->getPointerInfo(), St->getOriginalAlign(),
51118 St->getMemOperand()->getFlags());
51119 }
51120
51121 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51122 // Sandy Bridge, perform two 16-byte stores.
51123 unsigned Fast;
51124 if (VT.is256BitVector() && StVT == VT &&
51125 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51126 *St->getMemOperand(), &Fast) &&
51127 !Fast) {
51128 unsigned NumElems = VT.getVectorNumElements();
51129 if (NumElems < 2)
51130 return SDValue();
51131
51132 return splitVectorStore(St, DAG);
51133 }
51134
51135 // Split under-aligned vector non-temporal stores.
51136 if (St->isNonTemporal() && StVT == VT &&
51137 St->getAlign().value() < VT.getStoreSize()) {
51138 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51139 // vectors or the legalizer can scalarize it to use MOVNTI.
51140 if (VT.is256BitVector() || VT.is512BitVector()) {
51141 unsigned NumElems = VT.getVectorNumElements();
51142 if (NumElems < 2)
51143 return SDValue();
51144 return splitVectorStore(St, DAG);
51145 }
51146
51147 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51148 // to use MOVNTI.
51149 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51150 MVT NTVT = Subtarget.hasSSE4A()
51151 ? MVT::v2f64
51152 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51153 return scalarizeVectorStore(St, NTVT, DAG);
51154 }
51155 }
51156
51157 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51158 // supported but AVX512F is, by extending to v16i32 and then truncating.
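  // i.e. store (v16i8 (trunc X:v16i16)) -> truncstore v16i8 of (any_extend X to v16i32)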
51159 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51160 St->getValue().getOpcode() == ISD::TRUNCATE &&
51161 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51162 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51163 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51164 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51165 St->getValue().getOperand(0));
51166 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51167 MVT::v16i8, St->getMemOperand());
51168 }
51169
51170 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51171 if (!St->isTruncatingStore() &&
51172 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51173 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51174 StoredVal.hasOneUse() &&
51175 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51176 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51177 return EmitTruncSStore(IsSigned, St->getChain(),
51178 dl, StoredVal.getOperand(0), St->getBasePtr(),
51179 VT, St->getMemOperand(), DAG);
51180 }
51181
51182 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51183 if (!St->isTruncatingStore()) {
51184 auto IsExtractedElement = [](SDValue V) {
51185 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51186 V = V.getOperand(0);
51187 unsigned Opc = V.getOpcode();
51188 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51189 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51190 V.getOperand(0).hasOneUse())
51191 return V.getOperand(0);
51192 return SDValue();
51193 };
51194 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51195 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51196 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51197 SDValue Src = Trunc.getOperand(0);
51198 MVT DstVT = Trunc.getSimpleValueType();
51199 MVT SrcVT = Src.getSimpleValueType();
51200 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51201 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51202 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51203 if (NumTruncBits == VT.getSizeInBits() &&
51204 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51205 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51206 TruncVT, St->getMemOperand());
51207 }
51208 }
51209 }
51210 }
51211
51212 // Optimize trunc store (of multiple scalars) to shuffle and store.
51213 // First, pack all of the elements in one place. Next, store to memory
51214 // in fewer chunks.
51215 if (St->isTruncatingStore() && VT.isVector()) {
51216 // Check if we can detect an AVG pattern from the truncation. If yes,
51217 // replace the trunc store with a normal store of the result of the
51218 // X86ISD::AVG instruction.
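  // e.g. a truncating store of ((zext(X) + zext(Y) + 1) >> 1) becomes a regular
  // store of (X86ISD::AVG X, Y), which lowers to PAVGB/PAVGW.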
51219 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51220 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51221 Subtarget, dl))
51222 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51223 St->getPointerInfo(), St->getOriginalAlign(),
51224 St->getMemOperand()->getFlags());
51225
51226 if (TLI.isTruncStoreLegal(VT, StVT)) {
51227 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51228 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51229 dl, Val, St->getBasePtr(),
51230 St->getMemoryVT(), St->getMemOperand(), DAG);
51231 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51232 DAG, dl))
51233 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51234 dl, Val, St->getBasePtr(),
51235 St->getMemoryVT(), St->getMemOperand(), DAG);
51236 }
51237
51238 return SDValue();
51239 }
51240
51241 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51242 unsigned AddrSpace = St->getAddressSpace();
51243 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51244 AddrSpace == X86AS::PTR32_UPTR) {
51245 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51246 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51247 SDValue Cast =
51248 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51249 return DAG.getTruncStore(
51250 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51251 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51252 St->getAAInfo());
51253 }
51254 }
51255
51256 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51257 // the FP state in cases where an emms may be missing.
51258 // A preferable solution to the general problem is to figure out the right
51259 // places to insert EMMS. This qualifies as a quick hack.
51260
51261 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51262 if (VT.getSizeInBits() != 64)
51263 return SDValue();
51264
51265 const Function &F = DAG.getMachineFunction().getFunction();
51266 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51267 bool F64IsLegal =
51268 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51269
51270 if (!F64IsLegal || Subtarget.is64Bit())
51271 return SDValue();
51272
51273 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51274 cast<LoadSDNode>(St->getValue())->isSimple() &&
51275 St->getChain().hasOneUse() && St->isSimple()) {
51276 auto *Ld = cast<LoadSDNode>(St->getValue());
51277
51278 if (!ISD::isNormalLoad(Ld))
51279 return SDValue();
51280
51281 // Avoid the transformation if there are multiple uses of the loaded value.
51282 if (!Ld->hasNUsesOfValue(1, 0))
51283 return SDValue();
51284
51285 SDLoc LdDL(Ld);
51286 SDLoc StDL(N);
51287 // Lower to a single movq load/store pair.
51288 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51289 Ld->getBasePtr(), Ld->getMemOperand());
51290
51291 // Make sure new load is placed in same chain order.
51292 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51293 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51294 St->getMemOperand());
51295 }
51296
51297 // This is similar to the above case, but here we handle a scalar 64-bit
51298 // integer store that is extracted from a vector on a 32-bit target.
51299 // If we have SSE2, then we can treat it like a floating-point double
51300 // to get past legalization. The execution dependencies fixup pass will
51301 // choose the optimal machine instruction for the store if this really is
51302 // an integer or v2f32 rather than an f64.
51303 if (VT == MVT::i64 &&
51304 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51305 SDValue OldExtract = St->getOperand(1);
51306 SDValue ExtOp0 = OldExtract.getOperand(0);
51307 unsigned VecSize = ExtOp0.getValueSizeInBits();
51308 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51309 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51310 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51311 BitCast, OldExtract.getOperand(1));
51312 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51313 St->getPointerInfo(), St->getOriginalAlign(),
51314 St->getMemOperand()->getFlags());
51315 }
51316
51317 return SDValue();
51318}
51319
51320static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51321 TargetLowering::DAGCombinerInfo &DCI,
51322 const X86Subtarget &Subtarget) {
51323 auto *St = cast<MemIntrinsicSDNode>(N);
51324
51325 SDValue StoredVal = N->getOperand(1);
51326 MVT VT = StoredVal.getSimpleValueType();
51327 EVT MemVT = St->getMemoryVT();
51328
51329 // Figure out which elements we demand.
51330 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51331 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51332
51333 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51334 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51335 if (N->getOpcode() != ISD::DELETED_NODE)
51336 DCI.AddToWorklist(N);
51337 return SDValue(N, 0);
51338 }
51339
51340 return SDValue();
51341}
51342
51343/// Return 'true' if this vector operation is "horizontal"
51344/// and return the operands for the horizontal operation in LHS and RHS. A
51345/// horizontal operation performs the binary operation on successive elements
51346/// of its first operand, then on successive elements of its second operand,
51347/// returning the resulting values in a vector. For example, if
51348/// A = < float a0, float a1, float a2, float a3 >
51349/// and
51350/// B = < float b0, float b1, float b2, float b3 >
51351/// then the result of doing a horizontal operation on A and B is
51352/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51353/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51354/// A horizontal-op B, for some already available A and B, and if so then LHS is
51355/// set to A, RHS to B, and the routine returns 'true'.
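/// e.g. for a v4f32 FADD, LHS = VECTOR_SHUFFLE A, B, <0,2,4,6> and
/// RHS = VECTOR_SHUFFLE A, B, <1,3,5,7> match, and the caller can then form
/// (X86ISD::FHADD A, B), i.e. HADDPS.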
51356static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51357 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51358 bool IsCommutative,
51359 SmallVectorImpl<int> &PostShuffleMask) {
51360 // If either operand is undef, bail out. The binop should be simplified.
51361 if (LHS.isUndef() || RHS.isUndef())
51362 return false;
51363
51364 // Look for the following pattern:
51365 // A = < float a0, float a1, float a2, float a3 >
51366 // B = < float b0, float b1, float b2, float b3 >
51367 // and
51368 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51369 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51370 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51371 // which is A horizontal-op B.
51372
51373 MVT VT = LHS.getSimpleValueType();
51374 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51375 "Unsupported vector type for horizontal add/sub");
51376 unsigned NumElts = VT.getVectorNumElements();
51377
51378 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51379 SmallVectorImpl<int> &ShuffleMask) {
51380 bool UseSubVector = false;
51381 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51382 Op.getOperand(0).getValueType().is256BitVector() &&
51383 llvm::isNullConstant(Op.getOperand(1))) {
51384 Op = Op.getOperand(0);
51385 UseSubVector = true;
51386 }
51387 SmallVector<SDValue, 2> SrcOps;
51388 SmallVector<int, 16> SrcMask, ScaledMask;
51389 SDValue BC = peekThroughBitcasts(Op);
51390 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51391 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51392 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51393 })) {
51394 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51395 if (!UseSubVector && SrcOps.size() <= 2 &&
51396 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51397 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51398 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51399 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51400 }
51401 if (UseSubVector && SrcOps.size() == 1 &&
51402 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51403 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51404 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51405 ShuffleMask.assign(Mask.begin(), Mask.end());
51406 }
51407 }
51408 };
51409
51410 // View LHS in the form
51411 // LHS = VECTOR_SHUFFLE A, B, LMask
51412 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51413 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51414 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51415 SDValue A, B;
51416 SmallVector<int, 16> LMask;
51417 GetShuffle(LHS, A, B, LMask);
51418
51419 // Likewise, view RHS in the form
51420 // RHS = VECTOR_SHUFFLE C, D, RMask
51421 SDValue C, D;
51422 SmallVector<int, 16> RMask;
51423 GetShuffle(RHS, C, D, RMask);
51424
51425 // At least one of the operands should be a vector shuffle.
51426 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51427 if (NumShuffles == 0)
51428 return false;
51429
51430 if (LMask.empty()) {
51431 A = LHS;
51432 for (unsigned i = 0; i != NumElts; ++i)
51433 LMask.push_back(i);
51434 }
51435
51436 if (RMask.empty()) {
51437 C = RHS;
51438 for (unsigned i = 0; i != NumElts; ++i)
51439 RMask.push_back(i);
51440 }
51441
51442 // If we have a unary mask, ensure the other op is set to null.
51443 if (isUndefOrInRange(LMask, 0, NumElts))
51444 B = SDValue();
51445 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51446 A = SDValue();
51447
51448 if (isUndefOrInRange(RMask, 0, NumElts))
51449 D = SDValue();
51450 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51451 C = SDValue();
51452
51453 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51454 // RHS operands and shuffle mask.
51455 if (A != C) {
51456 std::swap(C, D);
51457 ShuffleVectorSDNode::commuteMask(RMask);
51458 }
51459 // Check that the shuffles are both shuffling the same vectors.
51460 if (!(A == C && B == D))
51461 return false;
51462
51463 PostShuffleMask.clear();
51464 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51465
51466 // LHS and RHS are now:
51467 // LHS = shuffle A, B, LMask
51468 // RHS = shuffle A, B, RMask
51469 // Check that the masks correspond to performing a horizontal operation.
51470 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51471 // so we just repeat the inner loop if this is a 256-bit op.
51472 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51473 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51474 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51475 assert((NumEltsPer128BitChunk % 2 == 0) &&
51476 "Vector type should have an even number of elements in each lane");
51477 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51478 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51479 // Ignore undefined components.
51480 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51481 if (LIdx < 0 || RIdx < 0 ||
51482 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51483 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51484 continue;
51485
51486 // Check that successive odd/even elements are being operated on. If not,
51487 // this is not a horizontal operation.
51488 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51489 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51490 return false;
51491
51492 // Compute the post-shuffle mask index based on where the element
51493 // is stored in the HOP result, and where it needs to be moved to.
51494 int Base = LIdx & ~1u;
51495 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51496 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51497
51498 // The low half of the 128-bit result must choose from A.
51499 // The high half of the 128-bit result must choose from B,
51500 // unless B is undef. In that case, we are always choosing from A.
51501 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51502 Index += NumEltsPer64BitChunk;
51503 PostShuffleMask[i + j] = Index;
51504 }
51505 }
51506
51507 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51508 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51509
51510 bool IsIdentityPostShuffle =
51511 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51512 if (IsIdentityPostShuffle)
51513 PostShuffleMask.clear();
51514
51515 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51516 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51517 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51518 return false;
51519
51520 // If the source nodes are already used in HorizOps then always accept this.
51521 // Shuffle folding should merge these back together.
51522 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51523 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51524 });
51525 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51526 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51527 });
51528 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51529
51530 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51531 // shuffle the result.
51532 if (!ForceHorizOp &&
51533 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51534 (NumShuffles < 2 || !IsIdentityPostShuffle),
51535 DAG, Subtarget))
51536 return false;
51537
51538 LHS = DAG.getBitcast(VT, NewLHS);
51539 RHS = DAG.getBitcast(VT, NewRHS);
51540 return true;
51541}
51542
51543// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51544static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51545 const X86Subtarget &Subtarget) {
51546 EVT VT = N->getValueType(0);
51547 unsigned Opcode = N->getOpcode();
51548 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51549 SmallVector<int, 8> PostShuffleMask;
51550
51551 switch (Opcode) {
51552 case ISD::FADD:
51553 case ISD::FSUB:
51554 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51555 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51556 SDValue LHS = N->getOperand(0);
51557 SDValue RHS = N->getOperand(1);
51558 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51559 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51560 PostShuffleMask)) {
51561 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51562 if (!PostShuffleMask.empty())
51563 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51564 DAG.getUNDEF(VT), PostShuffleMask);
51565 return HorizBinOp;
51566 }
51567 }
51568 break;
51569 case ISD::ADD:
51570 case ISD::SUB:
51571 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51572 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51573 SDValue LHS = N->getOperand(0);
51574 SDValue RHS = N->getOperand(1);
51575 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51576 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51577 PostShuffleMask)) {
51578 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51579 ArrayRef<SDValue> Ops) {
51580 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51581 };
51582 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51583 {LHS, RHS}, HOpBuilder);
51584 if (!PostShuffleMask.empty())
51585 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51586 DAG.getUNDEF(VT), PostShuffleMask);
51587 return HorizBinOp;
51588 }
51589 }
51590 break;
51591 }
51592
51593 return SDValue();
51594}
51595
51596// Try to combine the following nodes
51597// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51598// <i32 -2147483648[float -0.000000e+00]> 0
51599// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51600// <(load 4 from constant-pool)> t0, t29
51601// [t30: v16i32 = bitcast t27]
51602// t6: v16i32 = xor t7, t27[t30]
51603// t11: v16f32 = bitcast t6
51604// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51605// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51606// t22: v16f32 = bitcast t7
51607// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51608// t24: v32f16 = bitcast t23
51609static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51610 const X86Subtarget &Subtarget) {
51611 EVT VT = N->getValueType(0);
51612 SDValue LHS = N->getOperand(0);
51613 SDValue RHS = N->getOperand(1);
51614 int CombineOpcode =
51615 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51616 auto combineConjugation = [&](SDValue &r) {
51617 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51618 SDValue XOR = LHS.getOperand(0);
51619 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51620 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
51621 if (XORRHS.isConstant()) {
51622 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51623 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51624 if ((XORRHS.getBitWidth() == 32 &&
51625 XORRHS.getConstant() == ConjugationInt32) ||
51626 (XORRHS.getBitWidth() == 64 &&
51627 XORRHS.getConstant() == ConjugationInt64)) {
51628 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51629 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51630 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51631 r = DAG.getBitcast(VT, FCMulC);
51632 return true;
51633 }
51634 }
51635 }
51636 }
51637 return false;
51638 };
51639 SDValue Res;
51640 if (combineConjugation(Res))
51641 return Res;
51642 std::swap(LHS, RHS);
51643 if (combineConjugation(Res))
51644 return Res;
51645 return Res;
51646}
51647
51648// Try to combine the following nodes:
51649// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51650static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51651 const X86Subtarget &Subtarget) {
51652 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51653 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51654 Flags.hasAllowContract();
51655 };
51656
51657 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51658 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51659 Flags.hasNoSignedZeros();
51660 };
51661 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
51662 APInt AI = APInt(32, 0x80008000, true);
51663 KnownBits Bits = DAG.computeKnownBits(Op);
51664 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
51665 Bits.getConstant() == AI;
51666 };
51667
51668 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51669 !AllowContract(N->getFlags()))
51670 return SDValue();
51671
51672 EVT VT = N->getValueType(0);
51673 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51674 return SDValue();
51675
51676 SDValue LHS = N->getOperand(0);
51677 SDValue RHS = N->getOperand(1);
51678 bool IsConj;
51679 SDValue FAddOp1, MulOp0, MulOp1;
51680 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51681 &IsVectorAllNegativeZero,
51682 &HasNoSignedZero](SDValue N) -> bool {
51683 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51684 return false;
51685 SDValue Op0 = N.getOperand(0);
51686 unsigned Opcode = Op0.getOpcode();
51687 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51688 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51689 MulOp0 = Op0.getOperand(0);
51690 MulOp1 = Op0.getOperand(1);
51691 IsConj = Opcode == X86ISD::VFCMULC;
51692 return true;
51693 }
51694 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51695 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51696 HasNoSignedZero(Op0->getFlags())) ||
51697 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
51698 MulOp0 = Op0.getOperand(0);
51699 MulOp1 = Op0.getOperand(1);
51700 IsConj = Opcode == X86ISD::VFCMADDC;
51701 return true;
51702 }
51703 }
51704 return false;
51705 };
51706
51707 if (GetCFmulFrom(LHS))
51708 FAddOp1 = RHS;
51709 else if (GetCFmulFrom(RHS))
51710 FAddOp1 = LHS;
51711 else
51712 return SDValue();
51713
51714 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51715 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51716 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51717 // FIXME: How do we handle when fast math flags of FADD are different from
51718 // CFMUL's?
51719 SDValue CFmul =
51720 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51721 return DAG.getBitcast(VT, CFmul);
51722}
51723
51724/// Do target-specific dag combines on floating-point adds/subs.
51725static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51726 const X86Subtarget &Subtarget) {
51727 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51728 return HOp;
51729
51730 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51731 return COp;
51732
51733 return SDValue();
51734}
51735
51736static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
51737 const X86Subtarget &Subtarget) {
51738 EVT VT = N->getValueType(0);
51739 SDValue Src = N->getOperand(0);
51740 EVT SrcVT = Src.getValueType();
51741 SDLoc DL(N);
51742
51743 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
51744 SrcVT != MVT::v2f32)
51745 return SDValue();
51746
51747 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
51748 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
51749 DAG.getUNDEF(SrcVT)));
51750}
51751
51752/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51753/// the codegen.
51754/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51755/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51756/// anything that is guaranteed to be transformed by DAGCombiner.
51757static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51758 const X86Subtarget &Subtarget,
51759 const SDLoc &DL) {
51760 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51761 SDValue Src = N->getOperand(0);
51762 unsigned SrcOpcode = Src.getOpcode();
51763 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51764
51765 EVT VT = N->getValueType(0);
51766 EVT SrcVT = Src.getValueType();
51767
51768 auto IsFreeTruncation = [VT](SDValue Op) {
51769 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51770
51771 // See if this has been extended from a smaller/equal size to
51772 // the truncation size, allowing a truncation to combine with the extend.
51773 unsigned Opcode = Op.getOpcode();
51774 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51775 Opcode == ISD::ZERO_EXTEND) &&
51776 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51777 return true;
51778
51779 // See if this is a single-use constant which can be constant folded.
51780 // NOTE: We don't peek through bitcasts here because there is currently
51781 // no support for constant folding truncate+bitcast+vector_of_constants. So
51782 // we'll just end up with a truncate on both operands, which will
51783 // get turned back into (truncate (binop)), causing an infinite loop.
51784 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51785 };
51786
51787 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51788 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51789 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51790 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51791 };
51792
51793 // Don't combine if the operation has other uses.
51794 if (!Src.hasOneUse())
51795 return SDValue();
51796
51797 // Only support vector truncation for now.
51798 // TODO: i64 scalar math would benefit as well.
51799 if (!VT.isVector())
51800 return SDValue();
51801
51802 // In most cases it's only worth pre-truncating if we're only facing the cost
51803 // of one truncation.
51804 // i.e. if one of the inputs will constant fold or the input is repeated.
51805 switch (SrcOpcode) {
51806 case ISD::MUL:
51807 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51808 // better to truncate if we have the chance.
51809 if (SrcVT.getScalarType() == MVT::i64 &&
51810 TLI.isOperationLegal(SrcOpcode, VT) &&
51811 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51812 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51813 [[fallthrough]];
51814 case ISD::AND:
51815 case ISD::XOR:
51816 case ISD::OR:
51817 case ISD::ADD:
51818 case ISD::SUB: {
51819 SDValue Op0 = Src.getOperand(0);
51820 SDValue Op1 = Src.getOperand(1);
51821 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51822 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51823 return TruncateArithmetic(Op0, Op1);
51824 break;
51825 }
51826 }
51827
51828 return SDValue();
51829}
51830
51831// Try to form a MULHU or MULHS node by looking for
51832// (trunc (srl (mul ext, ext), 16))
51833// TODO: This is X86 specific because we want to be able to handle wide types
51834// before type legalization. But we can only do it if the vector will be
51835// legalized via widening/splitting. Type legalization can't handle promotion
51836// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51837// combiner.
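// e.g. (v8i16 (trunc (srl (mul (zext v8i16:X to v8i32),
//                               (zext v8i16:Y to v8i32)), 16)))
//        -> (v8i16 (mulhu X, Y))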
51838static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51839 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51840 // First instruction should be a right shift of a multiply.
51841 if (Src.getOpcode() != ISD::SRL ||
51842 Src.getOperand(0).getOpcode() != ISD::MUL)
51843 return SDValue();
51844
51845 if (!Subtarget.hasSSE2())
51846 return SDValue();
51847
51848 // Only handle vXi16 types that are at least 128-bits unless they will be
51849 // widened.
51850 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51851 return SDValue();
51852
51853 // Input type should be at least vXi32.
51854 EVT InVT = Src.getValueType();
51855 if (InVT.getVectorElementType().getSizeInBits() < 32)
51856 return SDValue();
51857
51858 // Need a shift by 16.
51859 APInt ShiftAmt;
51860 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51861 ShiftAmt != 16)
51862 return SDValue();
51863
51864 SDValue LHS = Src.getOperand(0).getOperand(0);
51865 SDValue RHS = Src.getOperand(0).getOperand(1);
51866
51867 // Count leading sign/zero bits on both inputs - if there are enough then
51868 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51869 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51870 // truncations may actually be free by peeking through to the ext source.
51871 auto IsSext = [&DAG](SDValue V) {
51872 return DAG.ComputeMaxSignificantBits(V) <= 16;
51873 };
51874 auto IsZext = [&DAG](SDValue V) {
51875 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51876 };
51877
51878 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51879 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51880 if (!IsSigned && !IsUnsigned)
51881 return SDValue();
51882
51883 // Check if both inputs are extensions, which will be removed by truncation.
51884 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51885 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51886 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51887 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51888 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51889 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51890
51891 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51892 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51893 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51894 // will have to split anyway.
51895 unsigned InSizeInBits = InVT.getSizeInBits();
51896 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51897 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51898 (InSizeInBits % 16) == 0) {
51899 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51900 InVT.getSizeInBits() / 16);
51901 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51902 DAG.getBitcast(BCVT, RHS));
51903 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51904 }
51905
51906 // Truncate back to source type.
51907 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51908 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51909
51910 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51911 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51912}
51913
51914// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51915// from one vector with signed bytes from another vector, adds together
51916// adjacent pairs of 16-bit products, and saturates the result before
51917// truncating to 16-bits.
51918//
51919// Which looks something like this:
51920// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51921// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
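// i.e. for each output element i:
//   Result[i] = ssat.i16(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// which matches the semantics of the PMADDUBSW instruction.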
51922static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51923 const X86Subtarget &Subtarget,
51924 const SDLoc &DL) {
51925 if (!VT.isVector() || !Subtarget.hasSSSE3())
51926 return SDValue();
51927
51928 unsigned NumElems = VT.getVectorNumElements();
51929 EVT ScalarVT = VT.getVectorElementType();
51930 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51931 return SDValue();
51932
51933 SDValue SSatVal = detectSSatPattern(In, VT);
51934 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51935 return SDValue();
51936
51937 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51938 // of multiplies from even/odd elements.
51939 SDValue N0 = SSatVal.getOperand(0);
51940 SDValue N1 = SSatVal.getOperand(1);
51941
51942 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51943 return SDValue();
51944
51945 SDValue N00 = N0.getOperand(0);
51946 SDValue N01 = N0.getOperand(1);
51947 SDValue N10 = N1.getOperand(0);
51948 SDValue N11 = N1.getOperand(1);
51949
51950 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51951 // Canonicalize zero_extend to LHS.
51952 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51953 std::swap(N00, N01);
51954 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51955 std::swap(N10, N11);
51956
51957 // Ensure we have a zero_extend and a sign_extend.
51958 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51959 N01.getOpcode() != ISD::SIGN_EXTEND ||
51960 N10.getOpcode() != ISD::ZERO_EXTEND ||
51961 N11.getOpcode() != ISD::SIGN_EXTEND)
51962 return SDValue();
51963
51964 // Peek through the extends.
51965 N00 = N00.getOperand(0);
51966 N01 = N01.getOperand(0);
51967 N10 = N10.getOperand(0);
51968 N11 = N11.getOperand(0);
51969
51970 // Ensure the extend is from vXi8.
51971 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51972 N01.getValueType().getVectorElementType() != MVT::i8 ||
51973 N10.getValueType().getVectorElementType() != MVT::i8 ||
51974 N11.getValueType().getVectorElementType() != MVT::i8)
51975 return SDValue();
51976
51977 // All inputs should be build_vectors.
51978 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51979 N01.getOpcode() != ISD::BUILD_VECTOR ||
51980 N10.getOpcode() != ISD::BUILD_VECTOR ||
51981 N11.getOpcode() != ISD::BUILD_VECTOR)
51982 return SDValue();
51983
51984 // N00/N10 are zero extended. N01/N11 are sign extended.
51985
51986 // For each element, we need to ensure we have an odd element from one vector
51987 // multiplied by the odd element of another vector and the even element from
51988 // one of the same vectors being multiplied by the even element from the
51989 // other vector. So we need to make sure for each element i, this operator
51990 // is being performed:
51991 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51992 SDValue ZExtIn, SExtIn;
51993 for (unsigned i = 0; i != NumElems; ++i) {
51994 SDValue N00Elt = N00.getOperand(i);
51995 SDValue N01Elt = N01.getOperand(i);
51996 SDValue N10Elt = N10.getOperand(i);
51997 SDValue N11Elt = N11.getOperand(i);
51998 // TODO: Be more tolerant to undefs.
51999 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52000 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52001 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52002 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52003 return SDValue();
52004 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52005 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52006 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52007 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52008 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52009 return SDValue();
52010 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52011 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52012 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52013 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52014 // Add is commutative so indices can be reordered.
52015 if (IdxN00 > IdxN10) {
52016 std::swap(IdxN00, IdxN10);
52017 std::swap(IdxN01, IdxN11);
52018 }
52019 // N0 indices must be the even elements. N1 indices must be the next odd elements.
52020 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52021 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52022 return SDValue();
52023 SDValue N00In = N00Elt.getOperand(0);
52024 SDValue N01In = N01Elt.getOperand(0);
52025 SDValue N10In = N10Elt.getOperand(0);
52026 SDValue N11In = N11Elt.getOperand(0);
52027 // First time we find an input capture it.
52028 if (!ZExtIn) {
52029 ZExtIn = N00In;
52030 SExtIn = N01In;
52031 }
52032 if (ZExtIn != N00In || SExtIn != N01In ||
52033 ZExtIn != N10In || SExtIn != N11In)
52034 return SDValue();
52035 }
52036
52037 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
52038 EVT ExtVT = Ext.getValueType();
52039 if (ExtVT.getVectorNumElements() != NumElems * 2) {
52040 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
52041 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
52042 DAG.getIntPtrConstant(0, DL));
52043 }
52044 };
52045 ExtractVec(ZExtIn);
52046 ExtractVec(SExtIn);
52047
52048 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52049 ArrayRef<SDValue> Ops) {
52050 // Shrink by adding truncate nodes and let DAGCombine fold with the
52051 // sources.
52052 EVT InVT = Ops[0].getValueType();
52053 assert(InVT.getScalarType() == MVT::i8 &&
52054 "Unexpected scalar element type");
52055 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52056 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52057 InVT.getVectorNumElements() / 2);
52058 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52059 };
52060 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52061 PMADDBuilder);
52062}
52063
52064static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52065 const X86Subtarget &Subtarget) {
52066 EVT VT = N->getValueType(0);
52067 SDValue Src = N->getOperand(0);
52068 SDLoc DL(N);
52069
52070 // Attempt to pre-truncate inputs to arithmetic ops instead.
52071 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52072 return V;
52073
52074 // Try to detect AVG pattern first.
52075 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52076 return Avg;
52077
52078 // Try to detect PMADD
52079 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52080 return PMAdd;
52081
52082 // Try to combine truncation with signed/unsigned saturation.
52083 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52084 return Val;
52085
52086 // Try to combine PMULHUW/PMULHW for vXi16.
52087 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52088 return V;
52089
52090 // The bitcast source is a direct mmx result.
52091 // Detect bitcasts from x86mmx to i32.
52092 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52093 SDValue BCSrc = Src.getOperand(0);
52094 if (BCSrc.getValueType() == MVT::x86mmx)
52095 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52096 }
52097
52098 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
52099 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
52100 Src.hasOneUse())
52101 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
52102
52103 return SDValue();
52104}
52105
52106static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52107 TargetLowering::DAGCombinerInfo &DCI) {
52108 EVT VT = N->getValueType(0);
52109 SDValue In = N->getOperand(0);
52110 SDLoc DL(N);
52111
52112 if (SDValue SSatVal = detectSSatPattern(In, VT))
52113 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52114 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52115 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52116
52117 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52118 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52119 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52120 return SDValue(N, 0);
52121
52122 return SDValue();
52123}
52124
52125/// Returns the negated value if the node \p N flips sign of FP value.
52126///
52127/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
52128/// or FSUB(0, x)
52129/// AVX512F does not have FXOR, so FNEG is lowered as
52130/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
52131/// In this case we go through all bitcasts.
52132/// This also recognizes splat of a negated value and returns the splat of that
52133/// value.
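/// e.g. (v4f32 (bitcast (xor (bitcast X), <0x80000000 x 4>))) is recognized as
/// a negation and X is returned.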
52134static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
52135 if (N->getOpcode() == ISD::FNEG)
52136 return N->getOperand(0);
52137
52138 // Don't recurse exponentially.
52139 if (Depth > SelectionDAG::MaxRecursionDepth)
52140 return SDValue();
52141
52142 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
52143
52144 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
52145 EVT VT = Op->getValueType(0);
52146
52147 // Make sure the element size doesn't change.
52148 if (VT.getScalarSizeInBits() != ScalarSize)
52149 return SDValue();
52150
52151 unsigned Opc = Op.getOpcode();
52152 switch (Opc) {
52153 case ISD::VECTOR_SHUFFLE: {
52154 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
52155 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
52156 if (!Op.getOperand(1).isUndef())
52157 return SDValue();
52158 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
52159 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
52160 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
52161 cast<ShuffleVectorSDNode>(Op)->getMask());
52162 break;
52163 }
52164 case ISD::INSERT_VECTOR_ELT: {
52165 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52166 // -V, INDEX).
52167 SDValue InsVector = Op.getOperand(0);
52168 SDValue InsVal = Op.getOperand(1);
52169 if (!InsVector.isUndef())
52170 return SDValue();
52171 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52172 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52173 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52174 NegInsVal, Op.getOperand(2));
52175 break;
52176 }
52177 case ISD::FSUB:
52178 case ISD::XOR:
52179 case X86ISD::FXOR: {
52180 SDValue Op1 = Op.getOperand(1);
52181 SDValue Op0 = Op.getOperand(0);
52182
52183 // For XOR and FXOR, we want to check if constant
52184 // bits of Op1 are sign bit masks. For FSUB, we
52185 // have to check if constant bits of Op0 are sign
52186 // bit masks and hence we swap the operands.
52187 if (Opc == ISD::FSUB)
52188 std::swap(Op0, Op1);
52189
52190 APInt UndefElts;
52191 SmallVector<APInt, 16> EltBits;
52192 // Extract constant bits and see if they are all
52193 // sign bit masks. Ignore the undef elements.
52194 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52195 /* AllowWholeUndefs */ true,
52196 /* AllowPartialUndefs */ false)) {
52197 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52198 if (!UndefElts[I] && !EltBits[I].isSignMask())
52199 return SDValue();
52200
52201 // Only allow bitcast from correctly-sized constant.
52202 Op0 = peekThroughBitcasts(Op0);
52203 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52204 return Op0;
52205 }
52206 break;
52207 } // case
52208 } // switch
52209
52210 return SDValue();
52211}
52212
52213static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52214 bool NegRes) {
52215 if (NegMul) {
52216 switch (Opcode) {
52217 // clang-format off
52218 default: llvm_unreachable("Unexpected opcode");
52219 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52220 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52221 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52222 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52223 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52224 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52225 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52226 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52227 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52228 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52229 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52230 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52231 // clang-format on
52232 }
52233 }
52234
52235 if (NegAcc) {
52236 switch (Opcode) {
52237 // clang-format off
52238 default: llvm_unreachable("Unexpected opcode");
52239 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52240 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52241 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52242 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52243 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52244 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52245 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52246 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52247 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52248 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52249 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52250 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52251 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52252 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52253 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52254 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52255 // clang-format on
52256 }
52257 }
52258
52259 if (NegRes) {
52260 switch (Opcode) {
52261 // For accuracy reasons, we never combine fneg and fma under strict FP.
52262 // clang-format off
52263 default: llvm_unreachable("Unexpected opcode");
52264 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52265 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52266 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52267 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52268 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52269 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52270 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52271 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52272 // clang-format on
52273 }
52274 }
52275
52276 return Opcode;
52277}
52278
52279/// Do target-specific dag combines on floating point negations.
52280static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52281 TargetLowering::DAGCombinerInfo &DCI,
52282 const X86Subtarget &Subtarget) {
52283 EVT OrigVT = N->getValueType(0);
52284 SDValue Arg = isFNEG(DAG, N);
52285 if (!Arg)
52286 return SDValue();
52287
52288 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52289 EVT VT = Arg.getValueType();
52290 EVT SVT = VT.getScalarType();
52291 SDLoc DL(N);
52292
52293 // Let legalize expand this if it isn't a legal type yet.
52294 if (!TLI.isTypeLegal(VT))
52295 return SDValue();
52296
52297 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52298 // use of a constant by performing (-0 - A*B) instead.
52299 // FIXME: Check rounding control flags as well once it becomes available.
52300 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52301 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52302 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52303 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52304 Arg.getOperand(1), Zero);
52305 return DAG.getBitcast(OrigVT, NewNode);
52306 }
52307
52308 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52309 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52310 if (SDValue NegArg =
52311 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52312 return DAG.getBitcast(OrigVT, NegArg);
52313
52314 return SDValue();
52315}
52316
52317SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52318 bool LegalOperations,
52319 bool ForCodeSize,
52320 NegatibleCost &Cost,
52321 unsigned Depth) const {
52322 // fneg patterns are removable even if they have multiple uses.
52323 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52324 Cost = NegatibleCost::Cheaper;
52325 return DAG.getBitcast(Op.getValueType(), Arg);
52326 }
52327
52328 EVT VT = Op.getValueType();
52329 EVT SVT = VT.getScalarType();
52330 unsigned Opc = Op.getOpcode();
52331 SDNodeFlags Flags = Op.getNode()->getFlags();
52332 switch (Opc) {
52333 case ISD::FMA:
52334 case X86ISD::FMSUB:
52335 case X86ISD::FNMADD:
52336 case X86ISD::FNMSUB:
52337 case X86ISD::FMADD_RND:
52338 case X86ISD::FMSUB_RND:
52339 case X86ISD::FNMADD_RND:
52340 case X86ISD::FNMSUB_RND: {
52341 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52342 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52343 !isOperationLegal(ISD::FMA, VT))
52344 break;
52345
52346 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52347 // if it may have signed zeros.
52348 if (!Flags.hasNoSignedZeros())
52349 break;
52350
52351 // This is always negatible for free but we might be able to remove some
52352 // extra operand negations as well.
52353 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52354 for (int i = 0; i != 3; ++i)
52355 NewOps[i] = getCheaperNegatedExpression(
52356 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52357
52358 bool NegA = !!NewOps[0];
52359 bool NegB = !!NewOps[1];
52360 bool NegC = !!NewOps[2];
52361 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52362
52363 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52364 : NegatibleCost::Neutral;
52365
52366 // Fill in the non-negated ops with the original values.
52367 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52368 if (!NewOps[i])
52369 NewOps[i] = Op.getOperand(i);
52370 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52371 }
52372 case X86ISD::FRCP:
52373 if (SDValue NegOp0 =
52374 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52375 ForCodeSize, Cost, Depth + 1))
52376 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52377 break;
52378 }
52379
52380 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52381 ForCodeSize, Cost, Depth);
52382}
52383
52384static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52385 const X86Subtarget &Subtarget) {
52386 MVT VT = N->getSimpleValueType(0);
52387 // If we have integer vector types available, use the integer opcodes.
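 // E.g. (X86ISD::FXOR v4f32 A, B) becomes
 // (bitcast (xor v4i32 (bitcast A), (bitcast B))), which lets the generic
 // integer logic combines apply to it.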
52388 if (!VT.isVector() || !Subtarget.hasSSE2())
52389 return SDValue();
52390
52391 SDLoc dl(N);
52392
52393 unsigned IntBits = VT.getScalarSizeInBits();
52394 MVT IntSVT = MVT::getIntegerVT(IntBits);
52395 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52396
52397 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52398 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52399 unsigned IntOpcode;
52400 switch (N->getOpcode()) {
52401 // clang-format off
52402 default: llvm_unreachable("Unexpected FP logic op");
52403 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52404 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52405 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52406 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52407 // clang-format on
52408 }
52409 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52410 return DAG.getBitcast(VT, IntOp);
52411}
52412
52413
52414/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52415static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52416 if (N->getOpcode() != ISD::XOR)
52417 return SDValue();
52418
52419 SDValue LHS = N->getOperand(0);
52420 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52421 return SDValue();
52422
52423 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52424 X86::CondCode(LHS->getConstantOperandVal(0)));
52425 SDLoc DL(N);
52426 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52427}
52428
52429static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
52430 const X86Subtarget &Subtarget) {
52431 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52432 "Invalid opcode for combining with CTLZ");
52433 if (Subtarget.hasFastLZCNT())
52434 return SDValue();
52435
52436 EVT VT = N->getValueType(0);
52437 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52438 (VT != MVT::i64 || !Subtarget.is64Bit()))
52439 return SDValue();
52440
52441 SDValue N0 = N->getOperand(0);
52442 SDValue N1 = N->getOperand(1);
52443
52444 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52445 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52446 return SDValue();
52447
52448 SDValue OpCTLZ;
52449 SDValue OpSizeTM1;
52450
52451 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52452 OpCTLZ = N1;
52453 OpSizeTM1 = N0;
52454 } else if (N->getOpcode() == ISD::SUB) {
52455 return SDValue();
52456 } else {
52457 OpCTLZ = N0;
52458 OpSizeTM1 = N1;
52459 }
52460
52461 if (!OpCTLZ.hasOneUse())
52462 return SDValue();
52463 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52464 if (!C)
52465 return SDValue();
52466
52467 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52468 return SDValue();
52469 SDLoc DL(N);
52470 EVT OpVT = VT;
52471 SDValue Op = OpCTLZ.getOperand(0);
52472 if (VT == MVT::i8) {
52473 // Zero extend to i32 since there is no i8 BSR.
52474 OpVT = MVT::i32;
52475 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52476 }
52477
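 // For example, for i32: (31 - ctlz_zero_undef X), equivalently
 // (31 ^ ctlz_zero_undef X), is the index of the highest set bit of X,
 // which is exactly what BSR computes for a non-zero input.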
52478 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52479 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52480 if (VT == MVT::i8)
52481 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52482
52483 return Op;
52484}
52485
52486static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52487 TargetLowering::DAGCombinerInfo &DCI,
52488 const X86Subtarget &Subtarget) {
52489 SDValue N0 = N->getOperand(0);
52490 SDValue N1 = N->getOperand(1);
52491 EVT VT = N->getValueType(0);
52492
52493 // If this is SSE1 only convert to FXOR to avoid scalarization.
52494 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52495 return DAG.getBitcast(MVT::v4i32,
52496 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52497 DAG.getBitcast(MVT::v4f32, N0),
52498 DAG.getBitcast(MVT::v4f32, N1)));
52499 }
52500
52501 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52502 return Cmp;
52503
52504 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52505 return R;
52506
52507 if (SDValue R = combineBitOpWithShift(N, DAG))
52508 return R;
52509
52510 if (SDValue R = combineBitOpWithPACK(N, DAG))
52511 return R;
52512
52513 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52514 return FPLogic;
52515
52516 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
52517 return R;
52518
52519 if (DCI.isBeforeLegalizeOps())
52520 return SDValue();
52521
52522 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52523 return SetCC;
52524
52525 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52526 return R;
52527
52528 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52529 return RV;
52530
52531 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52532 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52533 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52534 N0.getOperand(0).getValueType().isVector() &&
52535 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52536 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52537 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52538 N0.getOperand(0).getValueType()));
52539 }
52540
52541 // Handle AVX512 mask widening.
52542 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52543 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52544 VT.getVectorElementType() == MVT::i1 &&
52545 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52546 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52547 return DAG.getNode(
52548 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52549 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52550 N0.getOperand(2));
52551 }
52552
52553 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52554 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52555 // TODO: Under what circumstances could this be performed in DAGCombine?
52556 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52557 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52558 SDValue TruncExtSrc = N0.getOperand(0);
52559 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52560 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52561 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52562 SDLoc DL(N);
52563 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52564 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52565 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52566 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52567 }
52568 }
52569
52570 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52571 return R;
52572
52573 return combineFneg(N, DAG, DCI, Subtarget);
52574}
52575
52576static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
52577 TargetLowering::DAGCombinerInfo &DCI,
52578 const X86Subtarget &Subtarget) {
52579 SDValue N0 = N->getOperand(0);
52580 EVT VT = N->getValueType(0);
52581
52582 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
52583 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
52584 SDValue Src = N0.getOperand(0);
52585 EVT SrcVT = Src.getValueType();
52586 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
52587 (DCI.isBeforeLegalize() ||
52588 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
52589 Subtarget.hasSSSE3()) {
52590 unsigned NumElts = SrcVT.getVectorNumElements();
52591 SmallVector<int, 32> ReverseMask(NumElts);
52592 for (unsigned I = 0; I != NumElts; ++I)
52593 ReverseMask[I] = (NumElts - 1) - I;
52594 SDValue Rev =
52595 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
52596 return DAG.getBitcast(VT, Rev);
52597 }
52598 }
52599
52600 return SDValue();
52601}
52602
52605 const X86Subtarget &Subtarget) {
52606 EVT VT = N->getValueType(0);
52607 unsigned NumBits = VT.getSizeInBits();
52608
52609 // TODO - Constant Folding.
52610
52611 // Simplify the inputs.
52612 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52613 APInt DemandedMask(APInt::getAllOnes(NumBits));
52614 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52615 return SDValue(N, 0);
52616
52617 return SDValue();
52618}
52619
52620static bool isNullFPScalarOrVectorConst(SDValue V) {
52621 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52622}
52623
52624/// If a value is a scalar FP zero or a vector FP zero (potentially including
52625/// undefined elements), return a zero constant that may be used to fold away
52626/// that value. In the case of a vector, the returned constant will not contain
52627/// undefined elements even if the input parameter does. This makes it suitable
52628/// to be used as a replacement operand with operations (eg, bitwise-and) where
52629/// an undef should not propagate.
52630static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52631 const X86Subtarget &Subtarget) {
52632 if (!isNullFPScalarOrVectorConst(V))
52633 return SDValue();
52634
52635 if (V.getValueType().isVector())
52636 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52637
52638 return V;
52639}
52640
52641static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52642 const X86Subtarget &Subtarget) {
52643 SDValue N0 = N->getOperand(0);
52644 SDValue N1 = N->getOperand(1);
52645 EVT VT = N->getValueType(0);
52646 SDLoc DL(N);
52647
52648 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52649 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52650 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52651 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52652 return SDValue();
52653
52654 auto isAllOnesConstantFP = [](SDValue V) {
52655 if (V.getSimpleValueType().isVector())
52656 return ISD::isBuildVectorAllOnes(V.getNode());
52657 auto *C = dyn_cast<ConstantFPSDNode>(V);
52658 return C && C->getConstantFPValue()->isAllOnesValue();
52659 };
52660
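 // X86ISD::FANDN(X, Y) computes (~X) & Y (ANDNPS semantics), which is why
 // the operands are swapped in the second fold below.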
52661 // fand (fxor X, -1), Y --> fandn X, Y
52662 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52663 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52664
52665 // fand X, (fxor Y, -1) --> fandn Y, X
52666 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52667 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52668
52669 return SDValue();
52670}
52671
52672/// Do target-specific dag combines on X86ISD::FAND nodes.
52673static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52674 const X86Subtarget &Subtarget) {
52675 // FAND(0.0, x) -> 0.0
52676 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52677 return V;
52678
52679 // FAND(x, 0.0) -> 0.0
52680 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52681 return V;
52682
52683 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52684 return V;
52685
52686 return lowerX86FPLogicOp(N, DAG, Subtarget);
52687}
52688
52689/// Do target-specific dag combines on X86ISD::FANDN nodes.
52690static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52691 const X86Subtarget &Subtarget) {
52692 // FANDN(0.0, x) -> x
52693 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52694 return N->getOperand(1);
52695
52696 // FANDN(x, 0.0) -> 0.0
52697 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52698 return V;
52699
52700 return lowerX86FPLogicOp(N, DAG, Subtarget);
52701}
52702
52703/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52704static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52705 TargetLowering::DAGCombinerInfo &DCI,
52706 const X86Subtarget &Subtarget) {
52707 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52708
52709 // F[X]OR(0.0, x) -> x
52710 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52711 return N->getOperand(1);
52712
52713 // F[X]OR(x, 0.0) -> x
52714 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52715 return N->getOperand(0);
52716
52717 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52718 return NewVal;
52719
52720 return lowerX86FPLogicOp(N, DAG, Subtarget);
52721}
52722
52723/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52724static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52725 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52726
52727 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52728 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52729 !DAG.getTarget().Options.NoSignedZerosFPMath)
52730 return SDValue();
52731
52732 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
52733 // into FMINC and FMAXC, which are commutative operations.
52734 unsigned NewOp = 0;
52735 switch (N->getOpcode()) {
52736 default: llvm_unreachable("unknown opcode");
52737 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52738 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52739 }
52740
52741 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52742 N->getOperand(0), N->getOperand(1));
52743}
52744
52745static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52746 const X86Subtarget &Subtarget) {
52747 EVT VT = N->getValueType(0);
52748 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
52749 return SDValue();
52750
52751 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52752
52753 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52754 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52755 (Subtarget.hasFP16() && VT == MVT::f16) ||
52756 (VT.isVector() && TLI.isTypeLegal(VT))))
52757 return SDValue();
52758
52759 SDValue Op0 = N->getOperand(0);
52760 SDValue Op1 = N->getOperand(1);
52761 SDLoc DL(N);
52762 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52763
52764 // If we don't have to respect NaN inputs, this is a direct translation to x86
52765 // min/max instructions.
52766 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52767 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52768
52769 // If one of the operands is known non-NaN use the native min/max instructions
52770 // with the non-NaN input as second operand.
52771 if (DAG.isKnownNeverNaN(Op1))
52772 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52773 if (DAG.isKnownNeverNaN(Op0))
52774 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52775
52776 // If we have to respect NaN inputs, this takes at least 3 instructions.
52777 // Favor a library call when operating on a scalar and minimizing code size.
52778 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52779 return SDValue();
52780
52781 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52782 VT);
52783
52784 // There are 4 possibilities involving NaN inputs, and these are the required
52785 // outputs:
52786 // Op1
52787 // Num NaN
52788 // ----------------
52789 // Num | Max | Op0 |
52790 // Op0 ----------------
52791 // NaN | Op1 | NaN |
52792 // ----------------
52793 //
52794 // The SSE FP max/min instructions were not designed for this case, but rather
52795 // to implement:
52796 // Min = Op1 < Op0 ? Op1 : Op0
52797 // Max = Op1 > Op0 ? Op1 : Op0
52798 //
52799 // So they always return Op0 if either input is a NaN. However, we can still
52800 // use those instructions for fmaxnum by selecting away a NaN input.
52801
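 // In other words, for fmaxnum(Op0, Op1) the code below builds
 // select(isnan(Op0), Op1, FMAX(Op1, Op0)): the min/max node passes Op0
 // through whenever an input is NaN, and the select fixes up the case where
 // Op0 itself is the NaN.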
52802 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52803 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52804 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52805
52806 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52807 // are NaN, the NaN value of Op1 is the result.
52808 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52809}
52810
52811static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52812 TargetLowering::DAGCombinerInfo &DCI) {
52813 EVT VT = N->getValueType(0);
52814 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52815
52816 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52817 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52818 return SDValue(N, 0);
52819
52820 // Convert a full vector load into vzload when not all bits are needed.
52821 SDValue In = N->getOperand(0);
52822 MVT InVT = In.getSimpleValueType();
52823 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52824 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52825 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52826 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52827 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52828 MVT MemVT = MVT::getIntegerVT(NumBits);
52829 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52830 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52831 SDLoc dl(N);
52832 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52833 DAG.getBitcast(InVT, VZLoad));
52834 DCI.CombineTo(N, Convert);
52835 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52836 DCI.recursivelyDeleteUnusedNodes(LN);
52837 return SDValue(N, 0);
52838 }
52839 }
52840
52841 return SDValue();
52842}
52843
52844static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52845 TargetLowering::DAGCombinerInfo &DCI) {
52846 bool IsStrict = N->isTargetStrictFPOpcode();
52847 EVT VT = N->getValueType(0);
52848
52849 // Convert a full vector load into vzload when not all bits are needed.
52850 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52851 MVT InVT = In.getSimpleValueType();
52852 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52853 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52854 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52855 LoadSDNode *LN = cast<LoadSDNode>(In);
52856 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52857 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52858 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52859 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52860 SDLoc dl(N);
52861 if (IsStrict) {
52862 SDValue Convert =
52863 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52864 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52865 DCI.CombineTo(N, Convert, Convert.getValue(1));
52866 } else {
52867 SDValue Convert =
52868 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52869 DCI.CombineTo(N, Convert);
52870 }
52871 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52872 DCI.recursivelyDeleteUnusedNodes(LN);
52873 return SDValue(N, 0);
52874 }
52875 }
52876
52877 return SDValue();
52878}
52879
52880/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52883 const X86Subtarget &Subtarget) {
52884 SDValue N0 = N->getOperand(0);
52885 SDValue N1 = N->getOperand(1);
52886 MVT VT = N->getSimpleValueType(0);
52887 int NumElts = VT.getVectorNumElements();
52888 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52889 SDLoc DL(N);
52890
52891 // ANDNP(undef, x) -> 0
52892 // ANDNP(x, undef) -> 0
52893 if (N0.isUndef() || N1.isUndef())
52894 return DAG.getConstant(0, DL, VT);
52895
52896 // ANDNP(0, x) -> x
52897 if (ISD::isBuildVectorAllZeros(N0.getNode()))
52898 return N1;
52899
52900 // ANDNP(x, 0) -> 0
52901 if (ISD::isBuildVectorAllZeros(N1.getNode()))
52902 return DAG.getConstant(0, DL, VT);
52903
52904 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
52905 if (ISD::isBuildVectorAllOnes(N1.getNode()))
52906 return DAG.getNOT(DL, N0, VT);
52907
52908 // Turn ANDNP back to AND if input is inverted.
52909 if (SDValue Not = IsNOT(N0, DAG))
52910 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
52911
52912 // Fold for better commutativity:
52913 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
52914 if (N1->hasOneUse())
52915 if (SDValue Not = IsNOT(N1, DAG))
52916 return DAG.getNOT(
52917 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
52918
52919 // Constant Folding
52920 APInt Undefs0, Undefs1;
52921 SmallVector<APInt> EltBits0, EltBits1;
52922 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
52923 /*AllowWholeUndefs*/ true,
52924 /*AllowPartialUndefs*/ true)) {
52925 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
52926 /*AllowWholeUndefs*/ true,
52927 /*AllowPartialUndefs*/ true)) {
52928 SmallVector<APInt> ResultBits;
52929 for (int I = 0; I != NumElts; ++I)
52930 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52931 return getConstVector(ResultBits, VT, DAG, DL);
52932 }
52933
52934 // Constant fold NOT(N0) to allow us to use AND.
52935 // Ensure this is only performed if we can confirm that the bitcasted source
52936 // has one use to prevent an infinite loop with canonicalizeBitSelect.
52937 if (N0->hasOneUse()) {
52938 SDValue BC0 = peekThroughOneUseBitcasts(N0);
52939 if (BC0.getOpcode() != ISD::BITCAST) {
52940 for (APInt &Elt : EltBits0)
52941 Elt = ~Elt;
52942 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
52943 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52944 }
52945 }
52946 }
52947
52948 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52949 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52950 SDValue Op(N, 0);
52951 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52952 return Res;
52953
52954 // If either operand is a constant mask, then only the elements that aren't
52955 // zero are actually demanded by the other operand.
52956 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52957 APInt UndefElts;
52958 SmallVector<APInt> EltBits;
52959 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52960 APInt DemandedElts = APInt::getAllOnes(NumElts);
52961 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52962 EltBits)) {
52963 DemandedBits.clearAllBits();
52964 DemandedElts.clearAllBits();
52965 for (int I = 0; I != NumElts; ++I) {
52966 if (UndefElts[I]) {
52967 // We can't assume an undef src element gives an undef dst - the
52968 // other src might be zero.
52969 DemandedBits.setAllBits();
52970 DemandedElts.setBit(I);
52971 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52972 (!Invert && !EltBits[I].isZero())) {
52973 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52974 DemandedElts.setBit(I);
52975 }
52976 }
52977 }
52978 return std::make_pair(DemandedBits, DemandedElts);
52979 };
52980 APInt Bits0, Elts0;
52981 APInt Bits1, Elts1;
52982 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52983 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52984
52985 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52986 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52987 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52988 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52989 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52990 if (N->getOpcode() != ISD::DELETED_NODE)
52991 DCI.AddToWorklist(N);
52992 return SDValue(N, 0);
52993 }
52994 }
52995
52996 return SDValue();
52997}
52998
52999static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53000 TargetLowering::DAGCombinerInfo &DCI) {
53001 SDValue N1 = N->getOperand(1);
53002
53003 // BT ignores high bits in the bit index operand.
53004 unsigned BitWidth = N1.getValueSizeInBits();
53005 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
53006 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53007 if (N->getOpcode() != ISD::DELETED_NODE)
53008 DCI.AddToWorklist(N);
53009 return SDValue(N, 0);
53010 }
53011
53012 return SDValue();
53013}
53014
53015static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53016 TargetLowering::DAGCombinerInfo &DCI) {
53017 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53018 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53019
53020 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53022 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53023 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53024 if (N->getOpcode() != ISD::DELETED_NODE)
53025 DCI.AddToWorklist(N);
53026 return SDValue(N, 0);
53027 }
53028
53029 // Convert a full vector load into vzload when not all bits are needed.
53030 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53031 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53032 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53033 SDLoc dl(N);
53034 if (IsStrict) {
53035 SDValue Convert = DAG.getNode(
53036 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53037 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53038 DCI.CombineTo(N, Convert, Convert.getValue(1));
53039 } else {
53040 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53041 DAG.getBitcast(MVT::v8i16, VZLoad));
53042 DCI.CombineTo(N, Convert);
53043 }
53044
53045 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53046 DCI.recursivelyDeleteUnusedNodes(LN);
53047 return SDValue(N, 0);
53048 }
53049 }
53050 }
53051
53052 return SDValue();
53053}
53054
53055// Try to combine sext_in_reg of a cmov of constants by extending the constants.
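// For example, (sext_in_reg i8 (cmov 200, 15)) becomes (cmov -56, 15); the
// sign extensions of the constant operands fold away at compile time.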
53056static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53057 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53058
53059 EVT DstVT = N->getValueType(0);
53060
53061 SDValue N0 = N->getOperand(0);
53062 SDValue N1 = N->getOperand(1);
53063 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53064
53065 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53066 return SDValue();
53067
53068 // Look through single use any_extends / truncs.
53069 SDValue IntermediateBitwidthOp;
53070 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53071 N0.hasOneUse()) {
53072 IntermediateBitwidthOp = N0;
53073 N0 = N0.getOperand(0);
53074 }
53075
53076 // See if we have a single use cmov.
53077 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53078 return SDValue();
53079
53080 SDValue CMovOp0 = N0.getOperand(0);
53081 SDValue CMovOp1 = N0.getOperand(1);
53082
53083 // Make sure both operands are constants.
53084 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53085 !isa<ConstantSDNode>(CMovOp1.getNode()))
53086 return SDValue();
53087
53088 SDLoc DL(N);
53089
53090 // If we looked through an any_extend/trunc above, add one to the constants.
53091 if (IntermediateBitwidthOp) {
53092 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53093 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53094 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53095 }
53096
53097 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53098 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53099
53100 EVT CMovVT = DstVT;
53101 // We do not want i16 CMOVs. Promote to i32 and truncate afterwards.
53102 if (DstVT == MVT::i16) {
53103 CMovVT = MVT::i32;
53104 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53105 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53106 }
53107
53108 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53109 N0.getOperand(2), N0.getOperand(3));
53110
53111 if (CMovVT != DstVT)
53112 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53113
53114 return CMov;
53115}
53116
53117static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53118 const X86Subtarget &Subtarget) {
53119 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53120
53121 if (SDValue V = combineSextInRegCmov(N, DAG))
53122 return V;
53123
53124 EVT VT = N->getValueType(0);
53125 SDValue N0 = N->getOperand(0);
53126 SDValue N1 = N->getOperand(1);
53127 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53128 SDLoc dl(N);
53129
53130 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
53131 // SSE and AVX2 since there is no sign-extended shift right
53132 // operation on a vector with 64-bit elements.
53133 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53134 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53135 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53136 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53137 SDValue N00 = N0.getOperand(0);
53138
53139 // EXTLOAD has a better solution on AVX2,
53140 // it may be replaced with X86ISD::VSEXT node.
53141 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53142 if (!ISD::isNormalLoad(N00.getNode()))
53143 return SDValue();
53144
53145 // Attempt to promote any comparison mask ops before moving the
53146 // SIGN_EXTEND_INREG in the way.
53147 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
53148 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53149
53150 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53151 SDValue Tmp =
53152 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53153 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53154 }
53155 }
53156 return SDValue();
53157}
53158
53159/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53160/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53161/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53162/// opportunities to combine math ops, use an LEA, or use a complex addressing
53163/// mode. This can eliminate extend, add, and shift instructions.
53164static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53165 const X86Subtarget &Subtarget) {
53166 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53167 Ext->getOpcode() != ISD::ZERO_EXTEND)
53168 return SDValue();
53169
53170 // TODO: This should be valid for other integer types.
53171 EVT VT = Ext->getValueType(0);
53172 if (VT != MVT::i64)
53173 return SDValue();
53174
53175 SDValue Add = Ext->getOperand(0);
53176 if (Add.getOpcode() != ISD::ADD)
53177 return SDValue();
53178
53179 SDValue AddOp0 = Add.getOperand(0);
53180 SDValue AddOp1 = Add.getOperand(1);
53181 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53182 bool NSW = Add->getFlags().hasNoSignedWrap();
53183 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53184 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
53185 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
53186
53187 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53188 // into the 'zext'
53189 if ((Sext && !NSW) || (!Sext && !NUW))
53190 return SDValue();
53191
53192 // Having a constant operand to the 'add' ensures that we are not increasing
53193 // the instruction count because the constant is extended for free below.
53194 // A constant operand can also become the displacement field of an LEA.
53195 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
53196 if (!AddOp1C)
53197 return SDValue();
53198
53199 // Don't make the 'add' bigger if there's no hope of combining it with some
53200 // other 'add' or 'shl' instruction.
53201 // TODO: It may be profitable to generate simpler LEA instructions in place
53202 // of single 'add' instructions, but the cost model for selecting an LEA
53203 // currently has a high threshold.
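 // (E.g. the widened add can later merge with a following shl or add into a
 // single LEA of the form base + index*scale + displacement.)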
53204 bool HasLEAPotential = false;
53205 for (auto *User : Ext->uses()) {
53206 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53207 HasLEAPotential = true;
53208 break;
53209 }
53210 }
53211 if (!HasLEAPotential)
53212 return SDValue();
53213
53214 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53215 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53216 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53217 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53218
53219 // The wider add is guaranteed to not wrap because both operands are
53220 // sign-extended.
53221 SDNodeFlags Flags;
53222 Flags.setNoSignedWrap(NSW);
53223 Flags.setNoUnsignedWrap(NUW);
53224 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53225}
53226
53227// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53228// operands and the result of CMOV is not used anywhere else - promote CMOV
53229// itself instead of promoting its result. This could be beneficial, because:
53230// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53231// (or more) pseudo-CMOVs only when they go one-after-another and
53232// getting rid of result extension code after CMOV will help that.
53233// 2) Promotion of constant CMOV arguments is free, hence the
53234// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53235// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
53236// promotion is also good in terms of code size.
53237// (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
53238// promotion).
53239static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53240 SDValue CMovN = Extend->getOperand(0);
53241 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53242 return SDValue();
53243
53244 EVT TargetVT = Extend->getValueType(0);
53245 unsigned ExtendOpcode = Extend->getOpcode();
53246 SDLoc DL(Extend);
53247
53248 EVT VT = CMovN.getValueType();
53249 SDValue CMovOp0 = CMovN.getOperand(0);
53250 SDValue CMovOp1 = CMovN.getOperand(1);
53251
53252 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53253 !isa<ConstantSDNode>(CMovOp1.getNode()))
53254 return SDValue();
53255
53256 // Only extend to i32 or i64.
53257 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53258 return SDValue();
53259
53260 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53261 // are free.
53262 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53263 return SDValue();
53264
53265 // If this is a zero extend to i64, we should only extend to i32 and use a free
53266 // zero extend to finish.
53267 EVT ExtendVT = TargetVT;
53268 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53269 ExtendVT = MVT::i32;
53270
53271 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53272 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53273
53274 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53275 CMovN.getOperand(2), CMovN.getOperand(3));
53276
53277 // Finish extending if needed.
53278 if (ExtendVT != TargetVT)
53279 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53280
53281 return Res;
53282}
53283
53284// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53285// result type.
53286static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53287 const X86Subtarget &Subtarget) {
53288 SDValue N0 = N->getOperand(0);
53289 EVT VT = N->getValueType(0);
53290 SDLoc dl(N);
53291
53292 // Only do this combine with AVX512 for vector extends.
53293 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53294 return SDValue();
53295
53296 // Only combine legal element types.
53297 EVT SVT = VT.getVectorElementType();
53298 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53299 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53300 return SDValue();
53301
53302 // We don't have a CMPP instruction for vXf16.
53303 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53304 return SDValue();
53305 // We can only do this if the vector size is 256 bits or less.
53306 unsigned Size = VT.getSizeInBits();
53307 if (Size > 256 && Subtarget.useAVX512Regs())
53308 return SDValue();
53309
53310 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53311 // those are the only integer compares we have.
53312 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53313 if (ISD::isUnsignedIntSetCC(CC))
53314 return SDValue();
53315
53316 // Only do this combine if the extension will be fully consumed by the setcc.
53317 EVT N00VT = N0.getOperand(0).getValueType();
53318 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53319 if (Size != MatchingVecType.getSizeInBits())
53320 return SDValue();
53321
53322 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53323
53324 if (N->getOpcode() == ISD::ZERO_EXTEND)
53325 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53326
53327 return Res;
53328}
53329
53330static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53331 TargetLowering::DAGCombinerInfo &DCI,
53332 const X86Subtarget &Subtarget) {
53333 SDValue N0 = N->getOperand(0);
53334 EVT VT = N->getValueType(0);
53335 SDLoc DL(N);
53336
53337 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53338 if (!DCI.isBeforeLegalizeOps() &&
53339 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53340 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53341 N0->getOperand(1));
53342 bool ReplaceOtherUses = !N0.hasOneUse();
53343 DCI.CombineTo(N, Setcc);
53344 // Replace other uses with a truncate of the widened setcc_carry.
53345 if (ReplaceOtherUses) {
53346 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53347 N0.getValueType(), Setcc);
53348 DCI.CombineTo(N0.getNode(), Trunc);
53349 }
53350
53351 return SDValue(N, 0);
53352 }
53353
53354 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53355 return NewCMov;
53356
53357 if (!DCI.isBeforeLegalizeOps())
53358 return SDValue();
53359
53360 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53361 return V;
53362
53363 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53364 DAG, DCI, Subtarget))
53365 return V;
53366
53367 if (VT.isVector()) {
53368 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53369 return R;
53370
53371 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
53372 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53373 }
53374
53375 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53376 return NewAdd;
53377
53378 return SDValue();
53379}
53380
53381// Inverting a constant vector is profitable if it can be eliminated and the
53382// inverted vector is already present in DAG. Otherwise, it will be loaded
53383// anyway.
53384//
53385// We determine which of the values can be completely eliminated and invert it.
53386// If both are eliminable, select a vector with the first negative element.
53387static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
53388 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
53389 "ConstantFP build vector expected");
53390 // Check if we can eliminate V. We assume that if a value is only used in
53391 // FMAs, we can eliminate it, since this function is invoked for each FMA
53392 // with this vector.
53393 auto IsNotFMA = [](SDNode *Use) {
53394 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53395 };
53396 if (llvm::any_of(V->uses(), IsNotFMA))
53397 return SDValue();
53398
53399 SmallVector<SDValue, 8> Ops;
53400 EVT VT = V.getValueType();
53401 EVT EltVT = VT.getVectorElementType();
53402 for (const SDValue &Op : V->op_values()) {
53403 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53404 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53405 } else {
53406 assert(Op.isUndef());
53407 Ops.push_back(DAG.getUNDEF(EltVT));
53408 }
53409 }
53410
53411 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53412 if (!NV)
53413 return SDValue();
53414
53415 // If an inverted version cannot be eliminated, choose it instead of the
53416 // original version.
53417 if (llvm::any_of(NV->uses(), IsNotFMA))
53418 return SDValue(NV, 0);
53419
53420 // If the inverted version also can be eliminated, we have to consistently
53421 // prefer one of the values. We prefer a constant with a negative value on
53422 // the first place.
53423 // N.B. We need to skip undefs that may precede a value.
53424 for (const SDValue &Op : V->op_values()) {
53425 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53426 if (Cst->isNegative())
53427 return SDValue();
53428 break;
53429 }
53430 }
53431 return SDValue(NV, 0);
53432}
53433
53434static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53435 TargetLowering::DAGCombinerInfo &DCI,
53436 const X86Subtarget &Subtarget) {
53437 SDLoc dl(N);
53438 EVT VT = N->getValueType(0);
53439 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53440
53441 // Let legalize expand this if it isn't a legal type yet.
53442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53443 if (!TLI.isTypeLegal(VT))
53444 return SDValue();
53445
53446 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53447 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53448 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53449
53450 // If the operation allows fast-math and the target does not support FMA,
53451 // split this into mul+add to avoid libcall(s).
53452 SDNodeFlags Flags = N->getFlags();
53453 if (!IsStrict && Flags.hasAllowReassociation() &&
53454 TLI.isOperationExpand(ISD::FMA, VT)) {
53455 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53456 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53457 }
53458
53459 EVT ScalarVT = VT.getScalarType();
53460 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53461 !Subtarget.hasAnyFMA()) &&
53462 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53463 return SDValue();
53464
53465 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53466 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53467 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53468 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53469 CodeSize)) {
53470 V = NegV;
53471 return true;
53472 }
53473 // Look through extract_vector_elts. If it comes from an FNEG, create a
53474 // new extract from the FNEG input.
53475 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53476 isNullConstant(V.getOperand(1))) {
53477 SDValue Vec = V.getOperand(0);
53478 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53479 Vec, DAG, LegalOperations, CodeSize)) {
53480 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53481 NegV, V.getOperand(1));
53482 return true;
53483 }
53484 }
53485 // Lookup if there is an inverted version of constant vector V in DAG.
53486 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
53487 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
53488 V = NegV;
53489 return true;
53490 }
53491 }
53492 return false;
53493 };
53494
53495 // Do not convert the passthru input of scalar intrinsics.
53496 // FIXME: We could allow negations of the lower element only.
53497 bool NegA = invertIfNegative(A);
53498 bool NegB = invertIfNegative(B);
53499 bool NegC = invertIfNegative(C);
53500
53501 if (!NegA && !NegB && !NegC)
53502 return SDValue();
53503
53504 unsigned NewOpcode =
53505 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
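 // E.g. fma(fneg(A), B, C) becomes X86ISD::FNMADD(A, B, C) == -(A*B) + C,
 // while fma(fneg(A), fneg(B), C) keeps its opcode because the two
 // multiplicand negations cancel (NegA != NegB is false).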
53506
53507 // Propagate fast-math-flags to new FMA node.
53508 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53509 if (IsStrict) {
53510 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53511 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53512 {N->getOperand(0), A, B, C});
53513 } else {
53514 if (N->getNumOperands() == 4)
53515 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53516 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53517 }
53518}
53519
53520// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53521// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53522static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53523 TargetLowering::DAGCombinerInfo &DCI) {
53524 SDLoc dl(N);
53525 EVT VT = N->getValueType(0);
53526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53527 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53528 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53529
53530 SDValue N2 = N->getOperand(2);
53531
53532 SDValue NegN2 =
53533 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53534 if (!NegN2)
53535 return SDValue();
53536 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53537
53538 if (N->getNumOperands() == 4)
53539 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53540 NegN2, N->getOperand(3));
53541 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53542 NegN2);
53543}
53544
53545static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53546 TargetLowering::DAGCombinerInfo &DCI,
53547 const X86Subtarget &Subtarget) {
53548 SDLoc dl(N);
53549 SDValue N0 = N->getOperand(0);
53550 EVT VT = N->getValueType(0);
53551
53552 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53553 // FIXME: Is this needed? We don't seem to have any tests for it.
53554 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53555 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53556 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53557 N0->getOperand(1));
53558 bool ReplaceOtherUses = !N0.hasOneUse();
53559 DCI.CombineTo(N, Setcc);
53560 // Replace other uses with a truncate of the widened setcc_carry.
53561 if (ReplaceOtherUses) {
53562 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53563 N0.getValueType(), Setcc);
53564 DCI.CombineTo(N0.getNode(), Trunc);
53565 }
53566
53567 return SDValue(N, 0);
53568 }
53569
53570 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53571 return NewCMov;
53572
53573 if (DCI.isBeforeLegalizeOps())
53574 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53575 return V;
53576
53577 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53578 DAG, DCI, Subtarget))
53579 return V;
53580
53581 if (VT.isVector())
53582 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
53583 return R;
53584
53585 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53586 return NewAdd;
53587
53588 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53589 return R;
53590
53591 // TODO: Combine with any target/faux shuffle.
53592 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53594 SDValue N00 = N0.getOperand(0);
53595 SDValue N01 = N0.getOperand(1);
53596 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53597 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53598 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53599 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53600 return concatSubVectors(N00, N01, DAG, dl);
53601 }
53602 }
53603
53604 return SDValue();
53605}
53606
53607/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53608/// pre-promote its result type since vXi1 vectors don't get promoted
53609/// during type legalization.
53610static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53611 SDValue RHS, ISD::CondCode CC,
53612 const SDLoc &DL, SelectionDAG &DAG,
53613 const X86Subtarget &Subtarget) {
53614 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53615 VT.getVectorElementType() == MVT::i1 &&
53616 (OpVT.getVectorElementType() == MVT::i8 ||
53617 OpVT.getVectorElementType() == MVT::i16)) {
53618 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53619 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53620 }
53621 return SDValue();
53622}
53623
53624static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53625 TargetLowering::DAGCombinerInfo &DCI,
53626 const X86Subtarget &Subtarget) {
53627 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53628 const SDValue LHS = N->getOperand(0);
53629 const SDValue RHS = N->getOperand(1);
53630 EVT VT = N->getValueType(0);
53631 EVT OpVT = LHS.getValueType();
53632 SDLoc DL(N);
53633
53634 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53635 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
53636 Subtarget))
53637 return V;
53638
53639 if (VT == MVT::i1) {
53640 X86::CondCode X86CC;
53641 if (SDValue V =
53642 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
53643 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
53644 }
53645
53646 if (OpVT.isScalarInteger()) {
53647 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53648 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
53649 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53650 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53651 if (N0.getOperand(0) == N1)
53652 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53653 N0.getOperand(1));
53654 if (N0.getOperand(1) == N1)
53655 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53656 N0.getOperand(0));
53657 }
53658 return SDValue();
53659 };
53660 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53661 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53662 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53663 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53664
53665 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53666 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53667 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53668 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53669 if (N0.getOperand(0) == N1)
53670 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53671 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53672 if (N0.getOperand(1) == N1)
53673 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53674 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53675 }
53676 return SDValue();
53677 };
53678 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53679 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53680 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53681 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53682
53683 // cmpeq(trunc(x),C) --> cmpeq(x,C)
53684 // cmpne(trunc(x),C) --> cmpne(x,C)
53685 // iff x upper bits are zero.
53686 if (LHS.getOpcode() == ISD::TRUNCATE &&
53687 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53688 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
53689 EVT SrcVT = LHS.getOperand(0).getValueType();
53690 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53691 OpVT.getScalarSizeInBits());
53692 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53693 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53694 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53695 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53696 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
53697 }
53698
53699 // With C as a power of 2 and C != 0 and C != INT_MIN:
53700 // icmp eq Abs(X) C ->
53701 // (icmp eq A, C) | (icmp eq A, -C)
53702 // icmp ne Abs(X) C ->
53703 // (icmp ne A, C) & (icmp ne A, -C)
53704 // Both of these patterns can be better optimized in
53705 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
53706 // integers which is checked above.
53707 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
53708 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
53709 const APInt &CInt = C->getAPIntValue();
53710 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
53711 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
53712 SDValue BaseOp = LHS.getOperand(0);
53713 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
53714 SDValue SETCC1 = DAG.getSetCC(
53715 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
53716 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
53717 SETCC0, SETCC1);
53718 }
53719 }
53720 }
53721 }
53722 }
53723
53724 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53725 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53726 // Using temporaries to avoid messing up operand ordering for later
53727 // transformations if this doesn't work.
53728 SDValue Op0 = LHS;
53729 SDValue Op1 = RHS;
53730 ISD::CondCode TmpCC = CC;
53731 // Put build_vector on the right.
53732 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53733 std::swap(Op0, Op1);
53734 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53735 }
53736
53737 bool IsSEXT0 =
53738 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53739 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53740 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53741
53742 if (IsSEXT0 && IsVZero1) {
53743 assert(VT == Op0.getOperand(0).getValueType() &&
53744 "Unexpected operand type");
53745 if (TmpCC == ISD::SETGT)
53746 return DAG.getConstant(0, DL, VT);
53747 if (TmpCC == ISD::SETLE)
53748 return DAG.getConstant(1, DL, VT);
53749 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53750 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53751
53752 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53753 "Unexpected condition code!");
53754 return Op0.getOperand(0);
53755 }
53756 }
53757
53758 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
53759 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
53760 // better to use `PCMPGT` if the result is meant to stay in a vector (if it's
53761 // going to a mask, AVX512 has native unsigned comparisons).
53762 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
53763 bool CanMakeSigned = false;
53764 if (ISD::isUnsignedIntSetCC(CC)) {
53765 KnownBits CmpKnown =
53766 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
53767 // If we know LHS/RHS share the same sign bit at each element we can
53768 // make this signed.
53769 // NOTE: `computeKnownBits` on a vector type aggregates common bits
53770 // across all lanes. So a pattern where the sign varies from lane to
53771 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
53772 // missed. We could get around this by demanding each lane
53773 // independently, but this isn't the most important optimization and
53774 // that may eat into compile time.
53775 CanMakeSigned =
53776 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
53777 }
53778 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
53779 SDValue LHSOut = LHS;
53780 SDValue RHSOut = RHS;
53781 ISD::CondCode NewCC = CC;
53782 switch (CC) {
53783 case ISD::SETGE:
53784 case ISD::SETUGE:
53785 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
53786 /*NSW*/ true))
53787 LHSOut = NewLHS;
53788 else if (SDValue NewRHS = incDecVectorConstant(
53789 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
53790 RHSOut = NewRHS;
53791 else
53792 break;
53793
53794 [[fallthrough]];
53795 case ISD::SETUGT:
53796 NewCC = ISD::SETGT;
53797 break;
53798
53799 case ISD::SETLE:
53800 case ISD::SETULE:
53801 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
53802 /*NSW*/ true))
53803 LHSOut = NewLHS;
53804 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
53805 /*NSW*/ true))
53806 RHSOut = NewRHS;
53807 else
53808 break;
53809
53810 [[fallthrough]];
53811 case ISD::SETULT:
53812 // Will be swapped to SETGT in LowerVSETCC*.
53813 NewCC = ISD::SETLT;
53814 break;
53815 default:
53816 break;
53817 }
53818 if (NewCC != CC) {
53819 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
53820 NewCC, DL, DAG, Subtarget))
53821 return R;
53822 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
53823 }
53824 }
53825 }
53826
53827 if (SDValue R =
53828 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
53829 return R;
53830
53831 // In the middle end transforms:
53832 // `(or (icmp eq X, C), (icmp eq X, C+1))`
53833 // -> `(icmp ult (add x, -C), 2)`
53834 // Likewise inverted cases with `ugt`.
53835 //
53836 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
53837 // in worse codegen. So, undo the middle-end transform and go back to `(or
53838 // (icmp eq), (icmp eq))` form.
53839 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
53840 // the xmm approach.
53841 //
53842 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
53843 // ne))` as it doesn't end up instruction positive.
53844 // TODO: We might want to do this for avx512 as well if we `sext` the result.
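 // E.g. with C == 5: (setult (add X, -5), 2) is rewritten back into
 // (or (seteq X, 5), (seteq X, 6)), and each equality lowers to PCMPEQ.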
53845 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
53846 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
53847 !Subtarget.hasAVX512() &&
53848 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
53849 Subtarget.hasAVX2()) &&
53850 LHS.hasOneUse()) {
53851
53852 APInt CmpC;
53853 SDValue AddC = LHS.getOperand(1);
53854 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
53855 DAG.isConstantIntBuildVectorOrConstantInt(AddC)) {
53856 // See which form we have depending on the constant/condition.
53857 SDValue C0 = SDValue();
53858 SDValue C1 = SDValue();
53859
53860 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
53861 // we will end up generating an additional constant. Keeping in the
53862 // current form has a slight latency cost, but it probably worth saving a
53863 // constant.
53866 // Pass
53867 }
53868 // Normal Cases
53869 else if ((CC == ISD::SETULT && CmpC == 2) ||
53870 (CC == ISD::SETULE && CmpC == 1)) {
53871 // These will constant fold.
53872 C0 = DAG.getNegative(AddC, DL, OpVT);
53873 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
53874 DAG.getAllOnesConstant(DL, OpVT));
53875 }
53876 // Inverted Cases
53877 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
53878 (CC == ISD::SETUGE && (-CmpC) == 2)) {
53879 // These will constant fold.
53880 C0 = DAG.getNOT(DL, AddC, OpVT);
53881 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
53882 DAG.getAllOnesConstant(DL, OpVT));
53883 }
53884 if (C0 && C1) {
53885 SDValue NewLHS =
53886 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
53887 SDValue NewRHS =
53888 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
53889 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
53890 }
53891 }
53892 }
53893
53894 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53895 // to avoid scalarization via legalization because v4i32 is not a legal type.
53896 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53897 LHS.getValueType() == MVT::v4f32)
53898 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53899
53900 // X pred 0.0 --> X pred -X
53901 // If the negation of X already exists, use it in the comparison. This removes
53902 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53903 // instructions in patterns with a 'select' node.
53904 if (isNullFPScalarOrVectorConst(RHS)) {
53905 SDVTList FNegVT = DAG.getVTList(OpVT);
53906 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53907 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53908 }
53909
53910 return SDValue();
53911}
53912
53913static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
53914 TargetLowering::DAGCombinerInfo &DCI,
53915 const X86Subtarget &Subtarget) {
53916 SDValue Src = N->getOperand(0);
53917 MVT SrcVT = Src.getSimpleValueType();
53918 MVT VT = N->getSimpleValueType(0);
53919 unsigned NumBits = VT.getScalarSizeInBits();
53920 unsigned NumElts = SrcVT.getVectorNumElements();
53921 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53922 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53923
53924 // Perform constant folding.
53925 APInt UndefElts;
53926 SmallVector<APInt, 32> EltBits;
53927 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
53928 /*AllowWholeUndefs*/ true,
53929 /*AllowPartialUndefs*/ true)) {
53930 APInt Imm(32, 0);
53931 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53932 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53933 Imm.setBit(Idx);
53934
53935 return DAG.getConstant(Imm, SDLoc(N), VT);
53936 }
53937
53938 // Look through int->fp bitcasts that don't change the element width.
53939 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53940 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53941 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53942 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53943
53944 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53945 // with scalar comparisons.
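 // (NOT flips every sign bit, so movmsk(not x) == movmsk(x) ^ Mask, where
 // Mask has exactly the low NumElts bits set; hence the XOR below.)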
53946 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53947 SDLoc DL(N);
53948 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53949 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53950 return DAG.getNode(ISD::XOR, DL, VT,
53951 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53952 DAG.getConstant(NotMask, DL, VT));
53953 }
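// For illustration, with a v4i32 source (NumElts == 4) this rewrites
// movmsk(not(X)) as xor(movmsk(X), 0xF).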
53954
53955 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53956 // results with scalar comparisons.
53957 if (Src.getOpcode() == X86ISD::PCMPGT &&
53958 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53959 SDLoc DL(N);
53960 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53961 return DAG.getNode(ISD::XOR, DL, VT,
53962 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53963 DAG.getConstant(NotMask, DL, VT));
53964 }
53965
53966 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
53967 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53968 // iff pow2splat(c1).
53969 // Use KnownBits to determine if only a single bit is non-zero
53970 // in each element (pow2 or zero), and shift that bit to the msb.
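// For illustration, with v4i32 and c1 == splat(4) this turns
// movmsk(pcmpeq(and(X, 4), 4)) into a test of bit 2 of each element,
// equivalent to movmsk(shl(X, 29)).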
53971 if (Src.getOpcode() == X86ISD::PCMPEQ) {
53972 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
53973 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
53974 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
53975 if (KnownLHS.countMaxPopulation() == 1 &&
53976 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
53977 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
53978 SDLoc DL(N);
53979 MVT ShiftVT = SrcVT;
53980 SDValue ShiftLHS = Src.getOperand(0);
53981 SDValue ShiftRHS = Src.getOperand(1);
53982 if (ShiftVT.getScalarType() == MVT::i8) {
53983 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53984 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53985 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
53986 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
53987 }
53988 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53989 ShiftLHS, ShiftAmt, DAG);
53990 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53991 ShiftRHS, ShiftAmt, DAG);
53992 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
53993 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
53994 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
53995 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
53996 }
53997 }
53998
53999 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
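// For illustration, with a v4i32 source:
//   movmsk(and(X, <0x80000000,0,0x80000000,0>)) --> and(movmsk(X), 0b0101).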
54000 if (N->isOnlyUserOf(Src.getNode())) {
54002 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54003 APInt UndefElts;
54004 SmallVector<APInt, 32> EltBits;
54005 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54006 UndefElts, EltBits)) {
54007 APInt Mask = APInt::getZero(NumBits);
54008 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54009 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54010 Mask.setBit(Idx);
54011 }
54012 SDLoc DL(N);
54013 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54014 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54015 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54016 DAG.getConstant(Mask, DL, VT));
54017 }
54018 }
54019 }
54020
54021 // Simplify the inputs.
54022 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54023 APInt DemandedMask(APInt::getAllOnes(NumBits));
54024 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54025 return SDValue(N, 0);
54026
54027 return SDValue();
54028}
54029
54032 const X86Subtarget &Subtarget) {
54033 MVT VT = N->getSimpleValueType(0);
54034 unsigned NumBits = VT.getScalarSizeInBits();
54035
54036 // Simplify the inputs.
54037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54038 APInt DemandedMask(APInt::getAllOnes(NumBits));
54039 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54040 return SDValue(N, 0);
54041
54042 return SDValue();
54043}
54044
54047 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54048 SDValue Mask = MemOp->getMask();
54049
54050 // With vector masks we only demand the upper bit of the mask.
54051 if (Mask.getScalarValueSizeInBits() != 1) {
54052 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54053 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54054 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54055 if (N->getOpcode() != ISD::DELETED_NODE)
54056 DCI.AddToWorklist(N);
54057 return SDValue(N, 0);
54058 }
54059 }
54060
54061 return SDValue();
54062}
54063
54066 SelectionDAG &DAG) {
54067 SDLoc DL(GorS);
54068
54069 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54070 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54071 Gather->getMask(), Base, Index, Scale };
54072 return DAG.getMaskedGather(Gather->getVTList(),
54073 Gather->getMemoryVT(), DL, Ops,
54074 Gather->getMemOperand(),
54075 Gather->getIndexType(),
54076 Gather->getExtensionType());
54077 }
54078 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54079 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54080 Scatter->getMask(), Base, Index, Scale };
54081 return DAG.getMaskedScatter(Scatter->getVTList(),
54082 Scatter->getMemoryVT(), DL,
54083 Ops, Scatter->getMemOperand(),
54084 Scatter->getIndexType(),
54085 Scatter->isTruncatingStore());
54086}
54087
54090 SDLoc DL(N);
54091 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54092 SDValue Index = GorS->getIndex();
54093 SDValue Base = GorS->getBasePtr();
54094 SDValue Scale = GorS->getScale();
54095 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54096
54097 if (DCI.isBeforeLegalize()) {
54098 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54099
54100 // Shrink constant indices if they are larger than 32-bits.
54101 // Only do this before legalize types since v2i64 could become v2i32.
54102 // FIXME: We could check that the type is legal if we're after legalize
54103 // types, but then we would need to construct test cases where that happens.
54104 // FIXME: We could support more than just constant vectors, but we need to be
54105 // careful with costing. A truncate that can be optimized out would be fine.
54106 // Otherwise we might only want to create a truncate if it avoids a split.
54107 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54108 if (BV->isConstant() && IndexWidth > 32 &&
54109 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54110 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54111 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54112 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54113 }
54114 }
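// For illustration, a gather with a constant v2i64 index <16, 32> is rebuilt
// here with a v2i32 index <16, 32>.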
54115
54116 // Shrink sign/zero extends whose source is 32 bits or smaller and whose
54117 // result is wider than 32 bits, if there are sufficient sign bits. Only do
54118 // this before legalize types to avoid creating illegal types in truncate.
54119 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54120 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54121 IndexWidth > 32 &&
54122 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54123 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54124 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54125 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54126 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54127 }
54128 }
54129
54130 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54131 // Try to move splat constant adders from the index operand to the base
54132 // pointer operand, taking care to multiply by the scale. We can only do
54133 // this when the index element type is the same as the pointer type;
54134 // otherwise we need to be sure the math doesn't wrap before the scale.
54135 if (Index.getOpcode() == ISD::ADD &&
54136 Index.getValueType().getVectorElementType() == PtrVT &&
54137 isa<ConstantSDNode>(Scale)) {
54138 uint64_t ScaleAmt = Scale->getAsZExtVal();
54139 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54140 BitVector UndefElts;
54141 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54142 // FIXME: Allow non-constant?
54143 if (UndefElts.none()) {
54144 // Apply the scale.
54145 APInt Adder = C->getAPIntValue() * ScaleAmt;
54146 // Add it to the existing base.
54147 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54148 DAG.getConstant(Adder, DL, PtrVT));
54149 Index = Index.getOperand(0);
54150 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54151 }
54152 }
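// For illustration, with Index == add(X, splat(16)) and Scale == 4 this
// rebuilds the node with Base' = Base + 64 and Index = X.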
54153
54154 // It's also possible the base is just a constant. In that case, just
54155 // replace it with 0 and move the displacement into the index.
54156 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54157 isOneConstant(Scale)) {
54158 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54159 // Combine the constant build_vector and the constant base.
54160 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54161 Index.getOperand(1), Splat);
54162 // Add to the LHS of the original Index add.
54163 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54164 Index.getOperand(0), Splat);
54165 Base = DAG.getConstant(0, DL, Base.getValueType());
54166 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54167 }
54168 }
54169 }
54170
54171 if (DCI.isBeforeLegalizeOps()) {
54172 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54173
54174 // Make sure the index is either i32 or i64
54175 if (IndexWidth != 32 && IndexWidth != 64) {
54176 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54177 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54178 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54179 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54180 }
54181 }
54182
54183 // With vector masks we only demand the upper bit of the mask.
54184 SDValue Mask = GorS->getMask();
54185 if (Mask.getScalarValueSizeInBits() != 1) {
54186 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54187 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54188 if (N->getOpcode() != ISD::DELETED_NODE)
54189 DCI.AddToWorklist(N);
54190 return SDValue(N, 0);
54191 }
54192 }
54193
54194 return SDValue();
54195}
54196
54197// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54199 const X86Subtarget &Subtarget) {
54200 SDLoc DL(N);
54201 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54202 SDValue EFLAGS = N->getOperand(1);
54203
54204 // Try to simplify the EFLAGS and condition code operands.
54205 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54206 return getSETCC(CC, Flags, DL, DAG);
54207
54208 return SDValue();
54209}
54210
54211/// Optimize branch condition evaluation.
54213 const X86Subtarget &Subtarget) {
54214 SDLoc DL(N);
54215 SDValue EFLAGS = N->getOperand(3);
54216 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54217
54218 // Try to simplify the EFLAGS and condition code operands.
54219 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54220 // RAUW them under us.
54221 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54222 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54223 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54224 N->getOperand(1), Cond, Flags);
54225 }
54226
54227 return SDValue();
54228}
54229
54230// TODO: Could we move this to DAGCombine?
54232 SelectionDAG &DAG) {
54233 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54234 // to optimize away operation when it's from a constant.
54235 //
54236 // The general transformation is:
54237 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54238 // AND(VECTOR_CMP(x,y), constant2)
54239 // constant2 = UNARYOP(constant)
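// For illustration, with UNARYOP == sint_to_fp and constant == <4 x i32>
// <1,1,1,1>, constant2 is <4 x float> <1.0,1.0,1.0,1.0> and the AND is kept
// in the integer domain against a bitcast of constant2.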
54240
54241 // Early exit if this isn't a vector operation, the operand of the
54242 // unary operation isn't a bitwise AND, or if the sizes of the operations
54243 // aren't the same.
54244 EVT VT = N->getValueType(0);
54245 bool IsStrict = N->isStrictFPOpcode();
54246 unsigned NumEltBits = VT.getScalarSizeInBits();
54247 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54248 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54249 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54250 VT.getSizeInBits() != Op0.getValueSizeInBits())
54251 return SDValue();
54252
54253 // Now check that the other operand of the AND is a constant. We could
54254 // make the transformation for non-constant splats as well, but it's unclear
54255 // whether that would be a benefit as it would not eliminate any operations, just
54256 // perform one more step in scalar code before moving to the vector unit.
54257 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54258 // Bail out if the vector isn't a constant.
54259 if (!BV->isConstant())
54260 return SDValue();
54261
54262 // Everything checks out. Build up the new and improved node.
54263 SDLoc DL(N);
54264 EVT IntVT = BV->getValueType(0);
54265 // Create a new constant of the appropriate type for the transformed
54266 // DAG.
54267 SDValue SourceConst;
54268 if (IsStrict)
54269 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54270 {N->getOperand(0), SDValue(BV, 0)});
54271 else
54272 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54273 // The AND node needs bitcasts to/from an integer vector type around it.
54274 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54275 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54276 MaskConst);
54277 SDValue Res = DAG.getBitcast(VT, NewAnd);
54278 if (IsStrict)
54279 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54280 return Res;
54281 }
54282
54283 return SDValue();
54284}
54285
54286/// If we are converting a value to floating-point, try to replace scalar
54287/// truncate of an extracted vector element with a bitcast. This tries to keep
54288/// the sequence on XMM registers rather than moving between vector and GPRs.
54290 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54291 // to allow being called by any similar cast opcode.
54292 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54293 SDValue Trunc = N->getOperand(0);
54294 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54295 return SDValue();
54296
54297 SDValue ExtElt = Trunc.getOperand(0);
54298 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54299 !isNullConstant(ExtElt.getOperand(1)))
54300 return SDValue();
54301
54302 EVT TruncVT = Trunc.getValueType();
54303 EVT SrcVT = ExtElt.getValueType();
54304 unsigned DestWidth = TruncVT.getSizeInBits();
54305 unsigned SrcWidth = SrcVT.getSizeInBits();
54306 if (SrcWidth % DestWidth != 0)
54307 return SDValue();
54308
54309 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
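// For illustration, with X: v2i64 and an i64->i32 truncate this becomes
// inttofp (extelt (bitcast X to v4i32), 0), reading the low 32 bits directly.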
54310 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54311 unsigned VecWidth = SrcVecVT.getSizeInBits();
54312 unsigned NumElts = VecWidth / DestWidth;
54313 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54314 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54315 SDLoc DL(N);
54316 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54317 BitcastVec, ExtElt.getOperand(1));
54318 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54319}
54320
54322 const X86Subtarget &Subtarget) {
54323 bool IsStrict = N->isStrictFPOpcode();
54324 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54325 EVT VT = N->getValueType(0);
54326 EVT InVT = Op0.getValueType();
54327
54328 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54329 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54330 // if hasFP16 support:
54331 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54332 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54333 // else
54334 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54335 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54336 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54337 unsigned ScalarSize = InVT.getScalarSizeInBits();
54338 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54339 ScalarSize >= 64)
54340 return SDValue();
54341 SDLoc dl(N);
54342 EVT DstVT =
54344 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54345 : ScalarSize < 32 ? MVT::i32
54346 : MVT::i64,
54347 InVT.getVectorNumElements());
54348 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54349 if (IsStrict)
54350 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54351 {N->getOperand(0), P});
54352 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54353 }
54354
54355 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54356 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54357 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54358 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54359 VT.getScalarType() != MVT::f16) {
54360 SDLoc dl(N);
54361 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54362 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54363
54364 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54365 if (IsStrict)
54366 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54367 {N->getOperand(0), P});
54368 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54369 }
54370
54371 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54372 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54373 // the optimization here.
54374 SDNodeFlags Flags = N->getFlags();
54375 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54376 if (IsStrict)
54377 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54378 {N->getOperand(0), Op0});
54379 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54380 }
54381
54382 return SDValue();
54383}
54384
54387 const X86Subtarget &Subtarget) {
54388 // First try to optimize away the conversion entirely when its input is
54389 // conditionally selected from a constant. Vectors only.
54390 bool IsStrict = N->isStrictFPOpcode();
54392 return Res;
54393
54394 // Now move on to more general possibilities.
54395 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54396 EVT VT = N->getValueType(0);
54397 EVT InVT = Op0.getValueType();
54398
54399 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54400 // for it. Therefore, for type sizes equal to or smaller than 32, just go with i32.
54401 // if hasFP16 support:
54402 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54403 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54404 // else
54405 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
54406 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54407 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54408 unsigned ScalarSize = InVT.getScalarSizeInBits();
54409 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54410 ScalarSize >= 64)
54411 return SDValue();
54412 SDLoc dl(N);
54413 EVT DstVT =
54415 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54416 : ScalarSize < 32 ? MVT::i32
54417 : MVT::i64,
54418 InVT.getVectorNumElements());
54419 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54420 if (IsStrict)
54421 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54422 {N->getOperand(0), P});
54423 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54424 }
54425
54426 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54427 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54428 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54429 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54430 VT.getScalarType() != MVT::f16) {
54431 SDLoc dl(N);
54432 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54433 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54434 if (IsStrict)
54435 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54436 {N->getOperand(0), P});
54437 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54438 }
54439
54440 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54441 // vectors and scalars, see if we know that the upper bits are all the sign
54442 // bit, in which case we can truncate the input to i32 and convert from that.
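// For illustration, a scalar i64 input with at least 33 sign bits (e.g. a
// sign-extended i32) is truncated to i32 and converted from there.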
54443 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54444 unsigned BitWidth = InVT.getScalarSizeInBits();
54445 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54446 if (NumSignBits >= (BitWidth - 31)) {
54447 EVT TruncVT = MVT::i32;
54448 if (InVT.isVector())
54449 TruncVT = InVT.changeVectorElementType(TruncVT);
54450 SDLoc dl(N);
54451 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54452 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54453 if (IsStrict)
54454 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54455 {N->getOperand(0), Trunc});
54456 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54457 }
54458 // If we're after legalize and the type is v2i32 we need to shuffle and
54459 // use CVTSI2P.
54460 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54461 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54462 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54463 { 0, 2, -1, -1 });
54464 if (IsStrict)
54465 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54466 {N->getOperand(0), Shuf});
54467 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54468 }
54469 }
54470
54471 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54472 // a 32-bit target where SSE doesn't support i64->FP operations.
54473 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54474 Op0.getOpcode() == ISD::LOAD) {
54475 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54476
54477 // This transformation is not supported if the result type is f16 or f128.
54478 if (VT == MVT::f16 || VT == MVT::f128)
54479 return SDValue();
54480
54481 // If we have AVX512DQ we can use packed conversion instructions unless
54482 // the VT is f80.
54483 if (Subtarget.hasDQI() && VT != MVT::f80)
54484 return SDValue();
54485
54486 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54487 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54488 std::pair<SDValue, SDValue> Tmp =
54489 Subtarget.getTargetLowering()->BuildFILD(
54490 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54491 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54492 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54493 return Tmp.first;
54494 }
54495 }
54496
54497 if (IsStrict)
54498 return SDValue();
54499
54500 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54501 return V;
54502
54503 return SDValue();
54504}
54505
54507 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54508
54509 for (const SDNode *User : Flags->uses()) {
54511 switch (User->getOpcode()) {
54512 default:
54513 // Be conservative.
54514 return true;
54515 case X86ISD::SETCC:
54517 CC = (X86::CondCode)User->getConstantOperandVal(0);
54518 break;
54519 case X86ISD::BRCOND:
54520 case X86ISD::CMOV:
54521 CC = (X86::CondCode)User->getConstantOperandVal(2);
54522 break;
54523 }
54524
54525 switch (CC) {
54526 // clang-format off
54527 default: break;
54528 case X86::COND_A: case X86::COND_AE:
54529 case X86::COND_B: case X86::COND_BE:
54530 case X86::COND_O: case X86::COND_NO:
54531 case X86::COND_G: case X86::COND_GE:
54532 case X86::COND_L: case X86::COND_LE:
54533 return true;
54534 // clang-format on
54535 }
54536 }
54537
54538 return false;
54539}
54540
54541static bool onlyZeroFlagUsed(SDValue Flags) {
54542 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54543
54544 for (const SDNode *User : Flags->uses()) {
54545 unsigned CCOpNo;
54546 switch (User->getOpcode()) {
54547 default:
54548 // Be conservative.
54549 return false;
54550 case X86ISD::SETCC:
54552 CCOpNo = 0;
54553 break;
54554 case X86ISD::BRCOND:
54555 case X86ISD::CMOV:
54556 CCOpNo = 2;
54557 break;
54558 }
54559
54560 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54561 if (CC != X86::COND_E && CC != X86::COND_NE)
54562 return false;
54563 }
54564
54565 return true;
54566}
54567
54569 const X86Subtarget &Subtarget) {
54570 // Only handle test patterns.
54571 if (!isNullConstant(N->getOperand(1)))
54572 return SDValue();
54573
54574 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54575 // and use its flags directly.
54576 // TODO: Maybe we should try promoting compares that only use the zero flag
54577 // first if we can prove the upper bits with computeKnownBits?
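// For illustration, (cmp (trunc i64->i32 (xor X, Y)), 0) can instead use the
// EFLAGS of a narrowed (X86ISD::XOR (trunc X), (trunc Y)) directly.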
54578 SDLoc dl(N);
54579 SDValue Op = N->getOperand(0);
54580 EVT VT = Op.getValueType();
54581 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54582
54583 // If we have a constant logical shift that's only used in a comparison
54584 // against zero turn it into an equivalent AND. This allows turning it into
54585 // a TEST instruction later.
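// For illustration, with i32: (cmp (srl X, 28), 0) becomes
// (cmp (and X, 0xF0000000), 0), which isel can select as a TEST.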
54586 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54587 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54588 onlyZeroFlagUsed(SDValue(N, 0))) {
54589 unsigned BitWidth = VT.getSizeInBits();
54590 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54591 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54592 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54593 APInt Mask = Op.getOpcode() == ISD::SRL
54594 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54595 : APInt::getLowBitsSet(BitWidth, MaskBits);
54596 if (Mask.isSignedIntN(32)) {
54597 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54598 DAG.getConstant(Mask, dl, VT));
54599 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54600 DAG.getConstant(0, dl, VT));
54601 }
54602 }
54603 }
54604
54605 // If we're extracting from an avx512 bool vector and comparing against zero,
54606 // then try to just bitcast the vector to an integer to use TEST/BT directly.
54607 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
54608 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
54609 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
54610 SDValue Src = Op.getOperand(0);
54611 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54612 isNullConstant(Src.getOperand(1)) &&
54613 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
54614 SDValue BoolVec = Src.getOperand(0);
54615 unsigned ShAmt = 0;
54616 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
54617 ShAmt = BoolVec.getConstantOperandVal(1);
54618 BoolVec = BoolVec.getOperand(0);
54619 }
54620 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
54621 EVT VecVT = BoolVec.getValueType();
54622 unsigned BitWidth = VecVT.getVectorNumElements();
54623 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
54624 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
54625 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
54626 Op = DAG.getBitcast(BCVT, BoolVec);
54627 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
54628 DAG.getConstant(Mask, dl, BCVT));
54629 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54630 DAG.getConstant(0, dl, BCVT));
54631 }
54632 }
54633 }
54634
54635 // Peek through any zero-extend if we're only testing for a zero result.
54636 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54637 SDValue Src = Op.getOperand(0);
54638 EVT SrcVT = Src.getValueType();
54639 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
54640 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54641 DAG.getConstant(0, dl, SrcVT));
54642 }
54643
54644 // Look for a truncate.
54645 if (Op.getOpcode() != ISD::TRUNCATE)
54646 return SDValue();
54647
54648 SDValue Trunc = Op;
54649 Op = Op.getOperand(0);
54650
54651 // See if we can compare with zero against the truncation source,
54652 // which should help using the Z flag from many ops. Only do this for an
54653 // i32 truncated op to prevent partial-reg compares of promoted ops.
54654 EVT OpVT = Op.getValueType();
54655 APInt UpperBits =
54657 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54658 onlyZeroFlagUsed(SDValue(N, 0))) {
54659 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54660 DAG.getConstant(0, dl, OpVT));
54661 }
54662
54663 // After this the truncate and arithmetic op must have a single use.
54664 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54665 return SDValue();
54666
54667 unsigned NewOpc;
54668 switch (Op.getOpcode()) {
54669 default: return SDValue();
54670 case ISD::AND:
54671 // Skip AND with a constant. We have special handling for AND with an
54672 // immediate during isel to generate TEST instructions.
54673 if (isa<ConstantSDNode>(Op.getOperand(1)))
54674 return SDValue();
54675 NewOpc = X86ISD::AND;
54676 break;
54677 case ISD::OR: NewOpc = X86ISD::OR; break;
54678 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54679 case ISD::ADD:
54680 // If the carry or overflow flag is used, we can't truncate.
54682 return SDValue();
54683 NewOpc = X86ISD::ADD;
54684 break;
54685 case ISD::SUB:
54686 // If the carry or overflow flag is used, we can't truncate.
54688 return SDValue();
54689 NewOpc = X86ISD::SUB;
54690 break;
54691 }
54692
54693 // We found an op we can narrow. Truncate its inputs.
54694 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54695 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54696
54697 // Use an X86-specific opcode to avoid DAG combine messing with it.
54698 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54699 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54700
54701 // For AND, keep a CMP so that we can match the test pattern.
54702 if (NewOpc == X86ISD::AND)
54703 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54704 DAG.getConstant(0, dl, VT));
54705
54706 // Return the flags.
54707 return Op.getValue(1);
54708}
54709
54712 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54713 "Expected X86ISD::ADD or X86ISD::SUB");
54714
54715 SDLoc DL(N);
54716 SDValue LHS = N->getOperand(0);
54717 SDValue RHS = N->getOperand(1);
54718 MVT VT = LHS.getSimpleValueType();
54719 bool IsSub = X86ISD::SUB == N->getOpcode();
54720 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54721
54722 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54723 if (!N->hasAnyUseOfValue(1)) {
54724 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54725 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54726 }
54727
54728 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54729 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54730 SDValue Ops[] = {N0, N1};
54731 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54732 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54733 SDValue Op(N, 0);
54734 if (Negate)
54735 Op = DAG.getNegative(Op, DL, VT);
54736 DCI.CombineTo(GenericAddSub, Op);
54737 }
54738 };
54739 MatchGeneric(LHS, RHS, false);
54740 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54741
54742 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54743 // EFLAGS result doesn't change.
54744 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54745 /*ZeroSecondOpOnly*/ true);
54746}
54747
54749 SDValue LHS = N->getOperand(0);
54750 SDValue RHS = N->getOperand(1);
54751 SDValue BorrowIn = N->getOperand(2);
54752
54753 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54754 MVT VT = N->getSimpleValueType(0);
54755 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54756 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54757 }
54758
54759 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54760 // iff the flag result is dead.
54761 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54762 !N->hasAnyUseOfValue(1))
54763 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54764 LHS.getOperand(1), BorrowIn);
54765
54766 return SDValue();
54767}
54768
54769// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54772 SDValue LHS = N->getOperand(0);
54773 SDValue RHS = N->getOperand(1);
54774 SDValue CarryIn = N->getOperand(2);
54775 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54776 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54777
54778 // Canonicalize constant to RHS.
54779 if (LHSC && !RHSC)
54780 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54781 CarryIn);
54782
54783 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54784 // the result is either zero or one (depending on the input carry bit).
54785 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54786 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54787 // We don't have a good way to replace an EFLAGS use, so only do this when
54788 // dead right now.
54789 SDValue(N, 1).use_empty()) {
54790 SDLoc DL(N);
54791 EVT VT = N->getValueType(0);
54792 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54793 SDValue Res1 = DAG.getNode(
54794 ISD::AND, DL, VT,
54796 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54797 DAG.getConstant(1, DL, VT));
54798 return DCI.CombineTo(N, Res1, CarryOut);
54799 }
54800
54801 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54802 // iff the flag result is dead.
54803 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54804 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54805 SDLoc DL(N);
54806 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54807 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54808 DAG.getConstant(0, DL, LHS.getValueType()),
54809 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54810 }
54811
54812 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54813 MVT VT = N->getSimpleValueType(0);
54814 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54815 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54816 }
54817
54818 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54819 // iff the flag result is dead.
54820 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54821 !N->hasAnyUseOfValue(1))
54822 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54823 LHS.getOperand(1), CarryIn);
54824
54825 return SDValue();
54826}
54827
54829 const SDLoc &DL, EVT VT,
54830 const X86Subtarget &Subtarget) {
54831 // Example of pattern we try to detect:
54832 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54833 //(add (build_vector (extract_elt t, 0),
54834 // (extract_elt t, 2),
54835 // (extract_elt t, 4),
54836 // (extract_elt t, 6)),
54837 // (build_vector (extract_elt t, 1),
54838 // (extract_elt t, 3),
54839 // (extract_elt t, 5),
54840 // (extract_elt t, 7)))
54841
54842 if (!Subtarget.hasSSE2())
54843 return SDValue();
54844
54845 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54847 return SDValue();
54848
54849 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54850 VT.getVectorNumElements() < 4 ||
54852 return SDValue();
54853
54854 // Check if one of Op0,Op1 is of the form:
54855 // (build_vector (extract_elt Mul, 0),
54856 // (extract_elt Mul, 2),
54857 // (extract_elt Mul, 4),
54858 // ...
54859 // the other is of the form:
54860 // (build_vector (extract_elt Mul, 1),
54861 // (extract_elt Mul, 3),
54862 // (extract_elt Mul, 5),
54863 // ...
54864 // and identify Mul.
54865 SDValue Mul;
54866 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54867 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54868 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54869 // TODO: Be more tolerant to undefs.
54870 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54871 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54872 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54873 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54874 return SDValue();
54875 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54876 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54877 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54878 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54879 if (!Const0L || !Const1L || !Const0H || !Const1H)
54880 return SDValue();
54881 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54882 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54883 // Commutativity of mul allows factors of a product to reorder.
54884 if (Idx0L > Idx1L)
54885 std::swap(Idx0L, Idx1L);
54886 if (Idx0H > Idx1H)
54887 std::swap(Idx0H, Idx1H);
54888 // Commutativity of add allows pairs of factors to reorder.
54889 if (Idx0L > Idx0H) {
54890 std::swap(Idx0L, Idx0H);
54891 std::swap(Idx1L, Idx1H);
54892 }
54893 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54894 Idx1H != 2 * i + 3)
54895 return SDValue();
54896 if (!Mul) {
54897 // First time an extract_elt's source vector is visited. Must be a MUL
54898 // with twice the number of vector elements of the BUILD_VECTOR.
54899 // All extracts must be from the same MUL.
54900 Mul = Op0L->getOperand(0);
54901 if (Mul->getOpcode() != ISD::MUL ||
54902 Mul.getValueType().getVectorNumElements() != 2 * e)
54903 return SDValue();
54904 }
54905 // Check that the extract is from the same MUL previously seen.
54906 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54907 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54908 return SDValue();
54909 }
54910
54911 // Check if the Mul source can be safely shrunk.
54912 ShrinkMode Mode;
54913 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54914 Mode == ShrinkMode::MULU16)
54915 return SDValue();
54916
54917 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54918 VT.getVectorNumElements() * 2);
54919 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54920 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54921
54922 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54923 ArrayRef<SDValue> Ops) {
54924 EVT InVT = Ops[0].getValueType();
54925 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54926 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54927 InVT.getVectorNumElements() / 2);
54928 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54929 };
54930 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54931}
54932
54933// Attempt to turn this pattern into PMADDWD.
54934// (add (mul (sext (build_vector)), (sext (build_vector))),
54935// (mul (sext (build_vector)), (sext (build_vector)))
54937 const SDLoc &DL, EVT VT,
54938 const X86Subtarget &Subtarget) {
54939 if (!Subtarget.hasSSE2())
54940 return SDValue();
54941
54942 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54943 return SDValue();
54944
54945 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54946 VT.getVectorNumElements() < 4 ||
54948 return SDValue();
54949
54950 SDValue N00 = N0.getOperand(0);
54951 SDValue N01 = N0.getOperand(1);
54952 SDValue N10 = N1.getOperand(0);
54953 SDValue N11 = N1.getOperand(1);
54954
54955 // All inputs need to be sign extends.
54956 // TODO: Support ZERO_EXTEND from known positive?
54957 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54958 N01.getOpcode() != ISD::SIGN_EXTEND ||
54959 N10.getOpcode() != ISD::SIGN_EXTEND ||
54960 N11.getOpcode() != ISD::SIGN_EXTEND)
54961 return SDValue();
54962
54963 // Peek through the extends.
54964 N00 = N00.getOperand(0);
54965 N01 = N01.getOperand(0);
54966 N10 = N10.getOperand(0);
54967 N11 = N11.getOperand(0);
54968
54969 // Must be extending from vXi16.
54970 EVT InVT = N00.getValueType();
54971 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54972 N10.getValueType() != InVT || N11.getValueType() != InVT)
54973 return SDValue();
54974
54975 // All inputs should be build_vectors.
54976 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54977 N01.getOpcode() != ISD::BUILD_VECTOR ||
54978 N10.getOpcode() != ISD::BUILD_VECTOR ||
54980 return SDValue();
54981
54982 // For each element, we need to ensure we have an odd element from one vector
54983 // multiplied by the odd element of another vector and the even element from
54984 // one of the same vectors being multiplied by the even element from the
54985 // other vector. So we need to make sure for each element i, this operator
54986 // is being performed:
54987 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54988 SDValue In0, In1;
54989 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54990 SDValue N00Elt = N00.getOperand(i);
54991 SDValue N01Elt = N01.getOperand(i);
54992 SDValue N10Elt = N10.getOperand(i);
54993 SDValue N11Elt = N11.getOperand(i);
54994 // TODO: Be more tolerant to undefs.
54995 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54996 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54997 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54999 return SDValue();
55000 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55001 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55002 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55003 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55004 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55005 return SDValue();
55006 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55007 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55008 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55009 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55010 // Add is commutative so indices can be reordered.
55011 if (IdxN00 > IdxN10) {
55012 std::swap(IdxN00, IdxN10);
55013 std::swap(IdxN01, IdxN11);
55014 }
55015 // N0 indices must be the even element. N1 indices must be the next odd element.
55016 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55017 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55018 return SDValue();
55019 SDValue N00In = N00Elt.getOperand(0);
55020 SDValue N01In = N01Elt.getOperand(0);
55021 SDValue N10In = N10Elt.getOperand(0);
55022 SDValue N11In = N11Elt.getOperand(0);
55023
55024 // First time we find an input, capture it.
55025 if (!In0) {
55026 In0 = N00In;
55027 In1 = N01In;
55028
55029 // The input vectors must be at least as wide as the output.
55030 // If they are larger than the output, we extract subvector below.
55031 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55032 In1.getValueSizeInBits() < VT.getSizeInBits())
55033 return SDValue();
55034 }
55035 // Mul is commutative so the input vectors can be in any order.
55036 // Canonicalize to make the compares easier.
55037 if (In0 != N00In)
55038 std::swap(N00In, N01In);
55039 if (In0 != N10In)
55040 std::swap(N10In, N11In);
55041 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55042 return SDValue();
55043 }
55044
55045 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55046 ArrayRef<SDValue> Ops) {
55047 EVT OpVT = Ops[0].getValueType();
55048 assert(OpVT.getScalarType() == MVT::i16 &&
55049 "Unexpected scalar element type");
55050 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55051 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55052 OpVT.getVectorNumElements() / 2);
55053 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55054 };
55055
55056 // If the output is narrower than an input, extract the low part of the input
55057 // vector.
55058 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55059 VT.getVectorNumElements() * 2);
55060 if (OutVT16.bitsLT(In0.getValueType())) {
55061 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55062 DAG.getIntPtrConstant(0, DL));
55063 }
55064 if (OutVT16.bitsLT(In1.getValueType())) {
55065 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55066 DAG.getIntPtrConstant(0, DL));
55067 }
55068 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55069 PMADDBuilder);
55070}
55071
55072// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55073 // If the upper element in each pair of both VPMADDWD operands is zero then
55074 // we can merge the operand elements and use the implicit add of VPMADDWD.
55075// TODO: Add support for VPMADDUBSW (which isn't commutable).
55077 const SDLoc &DL, EVT VT) {
55078 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55079 return SDValue();
55080
55081 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55082 if (VT.getSizeInBits() > 128)
55083 return SDValue();
55084
55085 unsigned NumElts = VT.getVectorNumElements();
55086 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55088 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55089
55090 bool Op0HiZero =
55091 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55092 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55093 bool Op1HiZero =
55094 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55095 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55096
55097 // TODO: Check for zero lower elements once we have actual codegen that
55098 // creates them.
55099 if (!Op0HiZero || !Op1HiZero)
55100 return SDValue();
55101
55102 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55103 SmallVector<int> Mask;
55104 for (int i = 0; i != (int)NumElts; ++i) {
55105 Mask.push_back(2 * i);
55106 Mask.push_back(2 * (i + NumElts));
55107 }
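// For illustration, with a v4i32 result (v8i16 operands) the mask is
// <0,8,2,10,4,12,6,14>, taking the low element of each pair from both sides.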
55108
55109 SDValue LHS =
55110 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55111 SDValue RHS =
55112 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55113 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55114}
55115
55116/// CMOV of constants requires materializing constant operands in registers.
55117/// Try to fold those constants into an 'add' instruction to reduce instruction
55118 /// count. We do this with CMOV rather than the generic 'select' because there are
55119/// earlier folds that may be used to turn select-of-constants into logic hacks.
55121 const X86Subtarget &Subtarget) {
55122 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55123 // better because we eliminate 1-2 instructions. This transform is still
55124 // an improvement without zero operands because we trade 2 constant moves and
55125 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55126 // immediate asm operands (fit in 32-bits).
55127 auto isSuitableCmov = [](SDValue V) {
55128 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55129 return false;
55130 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55131 !isa<ConstantSDNode>(V.getOperand(1)))
55132 return false;
55133 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55134 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55135 V.getConstantOperandAPInt(1).isSignedIntN(32));
55136 };
55137
55138 // Match an appropriate CMOV as the first operand of the add.
55139 SDValue Cmov = N->getOperand(0);
55140 SDValue OtherOp = N->getOperand(1);
55141 if (!isSuitableCmov(Cmov))
55142 std::swap(Cmov, OtherOp);
55143 if (!isSuitableCmov(Cmov))
55144 return SDValue();
55145
55146 // Don't remove a load folding opportunity for the add. That would neutralize
55147 // any improvements from removing constant materializations.
55148 if (X86::mayFoldLoad(OtherOp, Subtarget))
55149 return SDValue();
55150
55151 EVT VT = N->getValueType(0);
55152 SDLoc DL(N);
55153 SDValue FalseOp = Cmov.getOperand(0);
55154 SDValue TrueOp = Cmov.getOperand(1);
55155
55156 // We will push the add through the select, but we can potentially do better
55157 // if we know there is another add in the sequence and this is pointer math.
55158 // In that case, we can absorb an add into the trailing memory op and avoid
55159 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55160 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55161 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55162 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55163 all_of(N->uses(), [&](SDNode *Use) {
55164 auto *MemNode = dyn_cast<MemSDNode>(Use);
55165 return MemNode && MemNode->getBasePtr().getNode() == N;
55166 })) {
55167 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55168 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55169 // it is possible that choosing op1 might be better.
55170 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55171 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55172 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55173 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55174 Cmov.getOperand(2), Cmov.getOperand(3));
55175 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55176 }
55177
55178 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55179 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55180 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55181 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55182 Cmov.getOperand(3));
55183}
55184
55187 const X86Subtarget &Subtarget) {
55188 EVT VT = N->getValueType(0);
55189 SDValue Op0 = N->getOperand(0);
55190 SDValue Op1 = N->getOperand(1);
55191 SDLoc DL(N);
55192
55193 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55194 return Select;
55195
55196 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55197 return MAdd;
55198 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55199 return MAdd;
55200 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55201 return MAdd;
55202
55203 // Try to synthesize horizontal adds from adds of shuffles.
55204 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55205 return V;
55206
55207 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55208 // iff X and Y won't overflow.
55209 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55212 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55213 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55214 SDValue Sum =
55215 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55216 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55217 getZeroVector(OpVT, Subtarget, DAG, DL));
55218 }
55219 }
55220
55221 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55222 // (sub Y, (sext (vXi1 X))).
55223 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55224 // generic DAG combine without a legal type check, but adding this there
55225 // caused regressions.
55226 if (VT.isVector()) {
55227 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55228 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55229 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55230 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55231 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55232 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55233 }
55234
55235 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55236 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55237 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55238 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55239 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55240 }
55241 }
55242
55243 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55244 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55245 X86::isZeroNode(Op0.getOperand(1))) {
55246 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55247 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55248 Op0.getOperand(0), Op0.getOperand(2));
55249 }
55250
55251 return combineAddOrSubToADCOrSBB(N, DAG);
55252}
55253
55254// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55255// condition comes from the subtract node that produced -X. This matches the
55256// cmov expansion for absolute value. By swapping the operands we convert abs
55257// to nabs.
55259 SDValue N0 = N->getOperand(0);
55260 SDValue N1 = N->getOperand(1);
55261
55262 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55263 return SDValue();
55264
55266 if (CC != X86::COND_S && CC != X86::COND_NS)
55267 return SDValue();
55268
55269 // Condition should come from a negate operation.
55270 SDValue Cond = N1.getOperand(3);
55271 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55272 return SDValue();
55273 assert(Cond.getResNo() == 1 && "Unexpected result number");
55274
55275 // Get the X and -X from the negate.
55276 SDValue NegX = Cond.getValue(0);
55277 SDValue X = Cond.getOperand(1);
55278
55279 SDValue FalseOp = N1.getOperand(0);
55280 SDValue TrueOp = N1.getOperand(1);
55281
55282 // Cmov operands should be X and NegX. Order doesn't matter.
55283 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55284 return SDValue();
55285
55286 // Build a new CMOV with the operands swapped.
55287 SDLoc DL(N);
55288 MVT VT = N->getSimpleValueType(0);
55289 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55290 N1.getOperand(2), Cond);
55291 // Convert sub to add.
55292 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55293}
55294
55296 SDValue Op0 = N->getOperand(0);
55297 SDValue Op1 = N->getOperand(1);
55298
55299 // (sub C (zero_extend (setcc)))
55300 // =>
55301 // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
55302 // Don't disturb (sub 0 setcc), which is easily done with neg.
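// For illustration, (sub 5, (zext (setcc E, EFLAGS))) becomes
// (add (zext (setcc NE, EFLAGS)), 4).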
55303 EVT VT = N->getValueType(0);
55304 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55305 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55306 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55307 Op1.getOperand(0).hasOneUse()) {
55308 SDValue SetCC = Op1.getOperand(0);
55311 APInt NewImm = Op0C->getAPIntValue() - 1;
55312 SDLoc DL(Op1);
55313 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55314 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55315 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55316 DAG.getConstant(NewImm, DL, VT));
55317 }
55318
55319 return SDValue();
55320}
55321
55324 const X86Subtarget &Subtarget) {
55325 SDValue Op0 = N->getOperand(0);
55326 SDValue Op1 = N->getOperand(1);
55327
55328 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55329 auto IsNonOpaqueConstant = [&](SDValue Op) {
55331 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55332 return !Cst->isOpaque();
55333 return true;
55334 }
55335 return false;
55336 };
55337
55338 // X86 can't encode an immediate LHS of a sub. See if we can push the
55339 // negation into a preceding instruction. If the RHS of the sub is an XOR with
55340 // one use and a constant, invert the immediate, saving one register.
55341 // However, ignore cases where C1 is 0, as those will become a NEG.
55342 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
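// For illustration: sub(10, xor(X, 5)) --> add(xor(X, ~5), 11).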
55343 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55344 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55345 Op1->hasOneUse()) {
55346 SDLoc DL(N);
55347 EVT VT = Op0.getValueType();
55348 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55349 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55350 SDValue NewAdd =
55351 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55352 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55353 }
55354
55355 if (SDValue V = combineSubABS(N, DAG))
55356 return V;
55357
55358 // Try to synthesize horizontal subs from subs of shuffles.
55359 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55360 return V;
55361
55362 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55363 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55364 X86::isZeroNode(Op1.getOperand(1))) {
55365 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55366 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55367 Op1.getOperand(0), Op1.getOperand(2));
55368 }
55369
55370 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55371 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55372 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55373 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55374 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55375 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55376 Op1.getOperand(1), Op1.getOperand(2));
55377 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55378 Op1.getOperand(0));
55379 }
55380
55381 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
55382 return V;
55383
55384 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
55385 return V;
55386
55387 return combineSubSetcc(N, DAG);
55388}
55389
55390static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55391                                    const X86Subtarget &Subtarget) {
55392 MVT VT = N->getSimpleValueType(0);
55393 SDLoc DL(N);
55394
55395 if (N->getOperand(0) == N->getOperand(1)) {
55396 if (N->getOpcode() == X86ISD::PCMPEQ)
55397 return DAG.getConstant(-1, DL, VT);
55398 if (N->getOpcode() == X86ISD::PCMPGT)
55399 return DAG.getConstant(0, DL, VT);
55400 }
55401
55402 return SDValue();
55403}
55404
55405/// Helper that combines an array of subvector ops as if they were the operands
55406/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55407/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55408static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55409                                      ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55410                                      TargetLowering::DAGCombinerInfo &DCI,
55411                                      const X86Subtarget &Subtarget) {
55412 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55413 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55414
55415 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55416 return DAG.getUNDEF(VT);
55417
55418 if (llvm::all_of(Ops, [](SDValue Op) {
55419 return ISD::isBuildVectorAllZeros(Op.getNode());
55420 }))
55421 return getZeroVector(VT, Subtarget, DAG, DL);
55422
55423 SDValue Op0 = Ops[0];
55424 bool IsSplat = llvm::all_equal(Ops);
55425 unsigned NumOps = Ops.size();
55426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55427 LLVMContext &Ctx = *DAG.getContext();
55428
55429 // Repeated subvectors.
55430 if (IsSplat &&
55431 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55432 // If this broadcast is inserted into both halves, use a larger broadcast.
55433 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55434 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55435
55436 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55437 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55438 (Subtarget.hasAVX2() ||
55439         X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55440                                              VT.getScalarType(), Subtarget)))
55441 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55442 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55443 Op0.getOperand(0),
55444 DAG.getIntPtrConstant(0, DL)));
55445
55446 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55447 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55448 (Subtarget.hasAVX2() ||
55449 (EltSizeInBits >= 32 &&
55450 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55451 Op0.getOperand(0).getValueType() == VT.getScalarType())
55452 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55453
55454 // concat_vectors(extract_subvector(broadcast(x)),
55455 // extract_subvector(broadcast(x))) -> broadcast(x)
55456 // concat_vectors(extract_subvector(subv_broadcast(x)),
55457 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
55458 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55459 Op0.getOperand(0).getValueType() == VT) {
55460 SDValue SrcVec = Op0.getOperand(0);
55461 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
55462          SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
55463        return Op0.getOperand(0);
55464 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55465 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
55466 return Op0.getOperand(0);
55467 }
55468
55469 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
55470 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
55471 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
55472 return DAG.getNode(Op0.getOpcode(), DL, VT,
55473                         DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
55474                                     Op0.getOperand(0), Op0.getOperand(0)),
55475 Op0.getOperand(1));
55476 }
55477
55478 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55479  // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
55480 // TODO: This should go in combineX86ShufflesRecursively eventually.
55481 if (VT.is256BitVector() && NumOps == 2) {
55482 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55483 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55484 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55485        Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55486      EVT SrcVT0 = Src0.getOperand(0).getValueType();
55487 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55488 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55489 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55490 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55491 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55492 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
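          // Immediate 0x31 takes the high half of Src0 for the low 128-bit lane
          // and the high half of Src1 for the high 128-bit lane.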
55493 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55494 DAG.getBitcast(VT, Src0.getOperand(0)),
55495 DAG.getBitcast(VT, Src1.getOperand(0)),
55496 DAG.getTargetConstant(0x31, DL, MVT::i8));
55497 }
55498 }
55499 }
55500
55501 // Repeated opcode.
55502 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55503 // but it currently struggles with different vector widths.
55504 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55505 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
55506 })) {
55507 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55508      SmallVector<SDValue> Subs;
55509      for (SDValue SubOp : SubOps)
55510 Subs.push_back(SubOp.getOperand(I));
55511 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55512 };
55513 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
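      // The concatenation is treated as free if every operand is a constant, or
      // if each operand is an extract_subvector of a VT-sized vector taken at
      // its corresponding offset.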
55514 bool AllConstants = true;
55515 bool AllSubVectors = true;
55516 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55517 SDValue Sub = SubOps[I].getOperand(Op);
55518 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55519 SDValue BC = peekThroughBitcasts(Sub);
55520 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
55521                        ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
55522        AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55523 Sub.getOperand(0).getValueType() == VT &&
55524 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
55525 }
55526 return AllConstants || AllSubVectors;
55527 };
55528
55529 switch (Op0.getOpcode()) {
55530 case X86ISD::VBROADCAST: {
55531 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55532 return Op.getOperand(0).getValueType().is128BitVector();
55533 })) {
55534 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55535 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55536 ConcatSubOperand(VT, Ops, 0),
55537 ConcatSubOperand(VT, Ops, 0));
55538 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55539 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55540 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55541                                              : X86ISD::PSHUFD,
55542                             DL, VT, ConcatSubOperand(VT, Ops, 0),
55543 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55544 }
55545 break;
55546 }
55547 case X86ISD::MOVDDUP:
55548 case X86ISD::MOVSHDUP:
55549 case X86ISD::MOVSLDUP: {
55550 if (!IsSplat)
55551 return DAG.getNode(Op0.getOpcode(), DL, VT,
55552 ConcatSubOperand(VT, Ops, 0));
55553 break;
55554 }
55555 case X86ISD::SHUFP: {
55556 // Add SHUFPD support if/when necessary.
55557 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55558 llvm::all_of(Ops, [Op0](SDValue Op) {
55559 return Op.getOperand(2) == Op0.getOperand(2);
55560 })) {
55561 return DAG.getNode(Op0.getOpcode(), DL, VT,
55562 ConcatSubOperand(VT, Ops, 0),
55563 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55564 }
55565 break;
55566 }
55567 case X86ISD::UNPCKH:
55568 case X86ISD::UNPCKL: {
55569 // Don't concatenate build_vector patterns.
55570 if (!IsSplat && EltSizeInBits >= 32 &&
55571 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55572 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55573 none_of(Ops, [](SDValue Op) {
55574 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
55575                     ISD::BUILD_VECTOR ||
55576                 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
55577                     ISD::BUILD_VECTOR;
55578        })) {
55579 return DAG.getNode(Op0.getOpcode(), DL, VT,
55580 ConcatSubOperand(VT, Ops, 0),
55581 ConcatSubOperand(VT, Ops, 1));
55582 }
55583 break;
55584 }
55585 case X86ISD::PSHUFHW:
55586 case X86ISD::PSHUFLW:
55587 case X86ISD::PSHUFD:
55588 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55589 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55590 return DAG.getNode(Op0.getOpcode(), DL, VT,
55591 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55592 }
55593 [[fallthrough]];
55594 case X86ISD::VPERMILPI:
55595 if (!IsSplat && EltSizeInBits == 32 &&
55596 (VT.is256BitVector() ||
55597 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55598 all_of(Ops, [&Op0](SDValue Op) {
55599 return Op0.getOperand(1) == Op.getOperand(1);
55600 })) {
55601 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
55602 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
55603 Res =
55604 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
55605 return DAG.getBitcast(VT, Res);
55606 }
55607 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55608 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55609 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
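        // v2f64 VPERMILPD only uses 2 immediate bits; place the second op's
        // bits at [3:2] so they cover the upper 128-bit lane of the v4f64.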
55610 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55611 return DAG.getNode(Op0.getOpcode(), DL, VT,
55612 ConcatSubOperand(VT, Ops, 0),
55613 DAG.getTargetConstant(Idx, DL, MVT::i8));
55614 }
55615 break;
55616 case X86ISD::PSHUFB:
55617 case X86ISD::PSADBW:
55618 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55619 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55620 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55621 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55622 NumOps * SrcVT.getVectorNumElements());
55623 return DAG.getNode(Op0.getOpcode(), DL, VT,
55624 ConcatSubOperand(SrcVT, Ops, 0),
55625 ConcatSubOperand(SrcVT, Ops, 1));
55626 }
55627 break;
55628 case X86ISD::VPERMV:
55629 if (!IsSplat && NumOps == 2 &&
55630 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
55631 MVT OpVT = Op0.getSimpleValueType();
55632 int NumSrcElts = OpVT.getVectorNumElements();
55633 SmallVector<int, 64> ConcatMask;
55634 for (unsigned i = 0; i != NumOps; ++i) {
55635 SmallVector<int, 64> SubMask;
55636        SmallVector<SDValue, 2> SubOps;
55637        if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55638 break;
55639 for (int M : SubMask) {
55640 if (0 <= M)
55641 M += i * NumSrcElts;
55642 ConcatMask.push_back(M);
55643 }
55644 }
55645 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55646 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
55647 Ops[1].getOperand(1), DAG, DL);
55648 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55649 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55650 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55651 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
55652 }
55653 }
55654 break;
55655 case X86ISD::VPERMV3:
55656 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55657 MVT OpVT = Op0.getSimpleValueType();
55658 int NumSrcElts = OpVT.getVectorNumElements();
55659 SmallVector<int, 64> ConcatMask;
55660 for (unsigned i = 0; i != NumOps; ++i) {
55661 SmallVector<int, 64> SubMask;
55662        SmallVector<SDValue, 2> SubOps;
55663        if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55664 break;
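          // Mask entries >= NumSrcElts select from the second source, which
          // starts at 2 * NumSrcElts once both operand pairs are concatenated,
          // hence the extra +NumSrcElts bias below.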
55665 for (int M : SubMask) {
55666 if (0 <= M) {
55667 M += M < NumSrcElts ? 0 : NumSrcElts;
55668 M += i * NumSrcElts;
55669 }
55670 ConcatMask.push_back(M);
55671 }
55672 }
55673 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55674 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55675 Ops[1].getOperand(0), DAG, DL);
55676 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55677 Ops[1].getOperand(2), DAG, DL);
55678 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55679 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55680 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55681 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55682 }
55683 }
55684 break;
55685 case X86ISD::VPERM2X128: {
55686 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
55687 assert(NumOps == 2 && "Bad concat_vectors operands");
55688 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55689 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55690 // TODO: Handle zero'd subvectors.
55691 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
55692          int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
55693                         (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
55694 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55695 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55696 Ops[0].getOperand(1), DAG, DL);
55697 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55698 Ops[1].getOperand(1), DAG, DL);
55699 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
55700 DAG.getBitcast(ShuffleVT, LHS),
55701 DAG.getBitcast(ShuffleVT, RHS),
55702 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
55703 return DAG.getBitcast(VT, Res);
55704 }
55705 }
55706 break;
55707 }
55708 case X86ISD::SHUF128: {
55709 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55710 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55711 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
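        // Each 256-bit SHUF128 selects 128-bit lanes with 1 bit per lane;
        // rebuild the 2-bit-per-lane 512-bit immediate, where 0x08/0x80 (+2)
        // point the odd result lanes at the second half of each concatenated
        // source.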
55712 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
55713 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
55714 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55715 Ops[0].getOperand(1), DAG, DL);
55716 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55717 Ops[1].getOperand(1), DAG, DL);
55718 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
55719 DAG.getTargetConstant(Imm, DL, MVT::i8));
55720 }
55721 break;
55722 }
55723 case ISD::TRUNCATE:
55724 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
55725 EVT SrcVT = Ops[0].getOperand(0).getValueType();
55726 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
55727 SrcVT == Ops[1].getOperand(0).getValueType() &&
55728 Subtarget.useAVX512Regs() &&
55729 Subtarget.getPreferVectorWidth() >= 512 &&
55730 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
55731 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
55732 return DAG.getNode(ISD::TRUNCATE, DL, VT,
55733 ConcatSubOperand(NewSrcVT, Ops, 0));
55734 }
55735 }
55736 break;
55737 case X86ISD::VSHLI:
55738 case X86ISD::VSRLI:
55739 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55740 // TODO: Move this to LowerShiftByScalarImmediate?
55741 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55742 llvm::all_of(Ops, [](SDValue Op) {
55743 return Op.getConstantOperandAPInt(1) == 32;
55744 })) {
55745 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55746 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55747 if (Op0.getOpcode() == X86ISD::VSHLI) {
55748 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55749 {8, 0, 8, 2, 8, 4, 8, 6});
55750 } else {
55751 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55752 {1, 8, 3, 8, 5, 8, 7, 8});
55753 }
55754 return DAG.getBitcast(VT, Res);
55755 }
55756 [[fallthrough]];
55757 case X86ISD::VSRAI:
55758 case X86ISD::VSHL:
55759 case X86ISD::VSRL:
55760 case X86ISD::VSRA:
55761 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55762 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55763 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55764 llvm::all_of(Ops, [Op0](SDValue Op) {
55765 return Op0.getOperand(1) == Op.getOperand(1);
55766 })) {
55767 return DAG.getNode(Op0.getOpcode(), DL, VT,
55768 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55769 }
55770 break;
55771 case X86ISD::VPERMI:
55772 case X86ISD::VROTLI:
55773 case X86ISD::VROTRI:
55774 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55775 llvm::all_of(Ops, [Op0](SDValue Op) {
55776 return Op0.getOperand(1) == Op.getOperand(1);
55777 })) {
55778 return DAG.getNode(Op0.getOpcode(), DL, VT,
55779 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55780 }
55781 break;
55782 case ISD::AND:
55783 case ISD::OR:
55784 case ISD::XOR:
55785 case X86ISD::ANDNP:
55786 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55787 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55788 return DAG.getNode(Op0.getOpcode(), DL, VT,
55789 ConcatSubOperand(VT, Ops, 0),
55790 ConcatSubOperand(VT, Ops, 1));
55791 }
55792 break;
55793 case X86ISD::PCMPEQ:
55794 case X86ISD::PCMPGT:
55795 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
55796 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
55797 return DAG.getNode(Op0.getOpcode(), DL, VT,
55798 ConcatSubOperand(VT, Ops, 0),
55799 ConcatSubOperand(VT, Ops, 1));
55800 }
55801 break;
55802 case ISD::CTPOP:
55803 case ISD::CTTZ:
55804 case ISD::CTLZ:
55805    case ISD::CTTZ_ZERO_UNDEF:
55806    case ISD::CTLZ_ZERO_UNDEF:
55807      if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55808 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55809 return DAG.getNode(Op0.getOpcode(), DL, VT,
55810 ConcatSubOperand(VT, Ops, 0));
55811 }
55812 break;
55813    case X86ISD::GF2P8AFFINEQB:
55814      if (!IsSplat &&
55815 (VT.is256BitVector() ||
55816 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55817 llvm::all_of(Ops, [Op0](SDValue Op) {
55818 return Op0.getOperand(2) == Op.getOperand(2);
55819 })) {
55820 return DAG.getNode(Op0.getOpcode(), DL, VT,
55821 ConcatSubOperand(VT, Ops, 0),
55822 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55823 }
55824 break;
55825 case ISD::ADD:
55826 case ISD::SUB:
55827 case ISD::MUL:
55828 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55829 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55830 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
55831 return DAG.getNode(Op0.getOpcode(), DL, VT,
55832 ConcatSubOperand(VT, Ops, 0),
55833 ConcatSubOperand(VT, Ops, 1));
55834 }
55835 break;
55836    // VADD, VSUB and VMUL can execute on more ports than VINSERT and their
55837    // latencies are short, so we only replace them here when doing so won't
55838    // introduce an extra VINSERT.
55839 case ISD::FADD:
55840 case ISD::FSUB:
55841 case ISD::FMUL:
55842 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
55843 (VT.is256BitVector() ||
55844 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55845 return DAG.getNode(Op0.getOpcode(), DL, VT,
55846 ConcatSubOperand(VT, Ops, 0),
55847 ConcatSubOperand(VT, Ops, 1));
55848 }
55849 break;
55850 case ISD::FDIV:
55851 if (!IsSplat && (VT.is256BitVector() ||
55852 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55853 return DAG.getNode(Op0.getOpcode(), DL, VT,
55854 ConcatSubOperand(VT, Ops, 0),
55855 ConcatSubOperand(VT, Ops, 1));
55856 }
55857 break;
55858 case X86ISD::HADD:
55859 case X86ISD::HSUB:
55860 case X86ISD::FHADD:
55861 case X86ISD::FHSUB:
55862 if (!IsSplat && VT.is256BitVector() &&
55863 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55864 return DAG.getNode(Op0.getOpcode(), DL, VT,
55865 ConcatSubOperand(VT, Ops, 0),
55866 ConcatSubOperand(VT, Ops, 1));
55867 }
55868 break;
55869 case X86ISD::PACKSS:
55870 case X86ISD::PACKUS:
55871 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55872 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55873 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55874 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55875 NumOps * SrcVT.getVectorNumElements());
55876 return DAG.getNode(Op0.getOpcode(), DL, VT,
55877 ConcatSubOperand(SrcVT, Ops, 0),
55878 ConcatSubOperand(SrcVT, Ops, 1));
55879 }
55880 break;
55881 case X86ISD::PALIGNR:
55882 if (!IsSplat &&
55883 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55884 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55885 llvm::all_of(Ops, [Op0](SDValue Op) {
55886 return Op0.getOperand(2) == Op.getOperand(2);
55887 })) {
55888 return DAG.getNode(Op0.getOpcode(), DL, VT,
55889 ConcatSubOperand(VT, Ops, 0),
55890 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55891 }
55892 break;
55893 case X86ISD::BLENDI:
55894 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
55895 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
55896 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
55897        // MVT::v16i16 has a repeated blend mask.
55898 if (Op0.getSimpleValueType() == MVT::v16i16) {
55899 Mask0 = (Mask0 << 8) | Mask0;
55900 Mask1 = (Mask1 << 8) | Mask1;
55901 }
55902 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
55903        MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
55904        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
55905 SDValue Sel =
55906 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
55907 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
55908 ConcatSubOperand(VT, Ops, 0));
55909 }
55910 break;
55911 case ISD::VSELECT:
55912 if (!IsSplat && Subtarget.hasAVX512() &&
55913 (VT.is256BitVector() ||
55914 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55915 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
55916 EVT SelVT = Ops[0].getOperand(0).getValueType();
55917 if (SelVT.getVectorElementType() == MVT::i1) {
55918 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
55919 NumOps * SelVT.getVectorNumElements());
55920 if (TLI.isTypeLegal(SelVT))
55921 return DAG.getNode(Op0.getOpcode(), DL, VT,
55922 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55923 ConcatSubOperand(VT, Ops, 1),
55924 ConcatSubOperand(VT, Ops, 2));
55925 }
55926 }
55927 [[fallthrough]];
55928 case X86ISD::BLENDV:
55929 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
55930 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
55931 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55932 EVT SelVT = Ops[0].getOperand(0).getValueType();
55933 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
55934 if (TLI.isTypeLegal(SelVT))
55935 return DAG.getNode(Op0.getOpcode(), DL, VT,
55936 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55937 ConcatSubOperand(VT, Ops, 1),
55938 ConcatSubOperand(VT, Ops, 2));
55939 }
55940 break;
55941 }
55942 }
55943
55944 // Fold subvector loads into one.
55945 // If needed, look through bitcasts to get to the load.
55946 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55947 unsigned Fast;
55948 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55949 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
55950 *FirstLd->getMemOperand(), &Fast) &&
55951 Fast) {
55952 if (SDValue Ld =
55953 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55954 return Ld;
55955 }
55956 }
55957
55958 // Attempt to fold target constant loads.
55959 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55960 SmallVector<APInt> EltBits;
55961 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
55962 for (unsigned I = 0; I != NumOps; ++I) {
55963 APInt OpUndefElts;
55964 SmallVector<APInt> OpEltBits;
55965 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55966 OpEltBits, /*AllowWholeUndefs*/ true,
55967 /*AllowPartialUndefs*/ false))
55968 break;
55969 EltBits.append(OpEltBits);
55970 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55971 }
55972 if (EltBits.size() == VT.getVectorNumElements()) {
55973 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
55974 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
55975 SDValue CV = DAG.getConstantPool(C, PVT);
55976      MachineFunction &MF = DAG.getMachineFunction();
55977      MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
55978      SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
55979 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
55980 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
55981 return Ld;
55982 }
55983 }
55984
55985 // If this simple subvector or scalar/subvector broadcast_load is inserted
55986 // into both halves, use a larger broadcast_load. Update other uses to use
55987 // an extracted subvector.
55988 if (IsSplat &&
55989 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55990 if (ISD::isNormalLoad(Op0.getNode()) ||
55991        Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55992        Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55993      auto *Mem = cast<MemSDNode>(Op0);
55994 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55995                         ? X86ISD::VBROADCAST_LOAD
55996                         : X86ISD::SUBV_BROADCAST_LOAD;
55997      if (SDValue BcastLd =
55998 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55999 SDValue BcastSrc =
56000 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56001 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56002 return BcastLd;
56003 }
56004 }
56005 }
56006
56007 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
56008 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
56009 Subtarget.useAVX512Regs()) {
56010 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
56011 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
56012 Res = DAG.getBitcast(ShuffleVT, Res);
56013 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
56014 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56015 return DAG.getBitcast(VT, Res);
56016 }
56017
56018 return SDValue();
56019}
56020
56021static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56022                                     TargetLowering::DAGCombinerInfo &DCI,
56023                                     const X86Subtarget &Subtarget) {
56024 EVT VT = N->getValueType(0);
56025 EVT SrcVT = N->getOperand(0).getValueType();
56026 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56027 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56028
56029 if (VT.getVectorElementType() == MVT::i1) {
56030 // Attempt to constant fold.
56031 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56032    APInt Constant = APInt::getZero(VT.getSizeInBits());
56033    for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56034 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56035 if (!C) break;
56036 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56037 if (I == (E - 1)) {
56038 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56039 if (TLI.isTypeLegal(IntVT))
56040 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56041 }
56042 }
56043
56044 // Don't do anything else for i1 vectors.
56045 return SDValue();
56046 }
56047
56048 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56049 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56050 DCI, Subtarget))
56051 return R;
56052 }
56053
56054 return SDValue();
56055}
56056
56057static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56058                                       TargetLowering::DAGCombinerInfo &DCI,
56059                                       const X86Subtarget &Subtarget) {
56060 if (DCI.isBeforeLegalizeOps())
56061 return SDValue();
56062
56063 MVT OpVT = N->getSimpleValueType(0);
56064
56065 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56066
56067 SDLoc dl(N);
56068 SDValue Vec = N->getOperand(0);
56069 SDValue SubVec = N->getOperand(1);
56070
56071 uint64_t IdxVal = N->getConstantOperandVal(2);
56072 MVT SubVecVT = SubVec.getSimpleValueType();
56073
56074 if (Vec.isUndef() && SubVec.isUndef())
56075 return DAG.getUNDEF(OpVT);
56076
56077 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56078 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56079 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56080 return getZeroVector(OpVT, Subtarget, DAG, dl);
56081
56082  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56083    // If we're inserting into a zero vector and then into a larger zero vector,
56084 // just insert into the larger zero vector directly.
56085 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56086        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56087      uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56088 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56089 getZeroVector(OpVT, Subtarget, DAG, dl),
56090 SubVec.getOperand(1),
56091 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56092 }
56093
56094 // If we're inserting into a zero vector and our input was extracted from an
56095    // insert into a zero vector of the same type, and the extraction was at
56096    // least as large as the original insertion, just insert the original
56097 // subvector into a zero vector.
56098 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56099 isNullConstant(SubVec.getOperand(1)) &&
56100      SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56101    SDValue Ins = SubVec.getOperand(0);
56102 if (isNullConstant(Ins.getOperand(2)) &&
56103 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56104 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56105 SubVecVT.getFixedSizeInBits())
56106 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56107 getZeroVector(OpVT, Subtarget, DAG, dl),
56108 Ins.getOperand(1), N->getOperand(2));
56109 }
56110 }
56111
56112 // Stop here if this is an i1 vector.
56113 if (IsI1Vector)
56114 return SDValue();
56115
56116 // Eliminate an intermediate vector widening:
56117 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56118 // insert_subvector X, Y, Idx
56119 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56120 // there?
56121 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56122 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56123 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56124 SubVec.getOperand(1), N->getOperand(2));
56125
56126 // If this is an insert of an extract, combine to a shuffle. Don't do this
56127 // if the insert or extract can be represented with a subregister operation.
56128 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56129 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56130 (IdxVal != 0 ||
56131 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56132 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56133 if (ExtIdxVal != 0) {
56134 int VecNumElts = OpVT.getVectorNumElements();
56135 int SubVecNumElts = SubVecVT.getVectorNumElements();
56136 SmallVector<int, 64> Mask(VecNumElts);
56137 // First create an identity shuffle mask.
56138 for (int i = 0; i != VecNumElts; ++i)
56139 Mask[i] = i;
56140 // Now insert the extracted portion.
56141 for (int i = 0; i != SubVecNumElts; ++i)
56142 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56143
56144 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56145 }
56146 }
56147
56148 // Match concat_vector style patterns.
56149 SmallVector<SDValue, 2> SubVectorOps;
56150 if (collectConcatOps(N, SubVectorOps, DAG)) {
56151 if (SDValue Fold =
56152 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56153 return Fold;
56154
56155 // If we're inserting all zeros into the upper half, change this to
56156 // a concat with zero. We will match this to a move
56157 // with implicit upper bit zeroing during isel.
56158 // We do this here because we don't want combineConcatVectorOps to
56159 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56160 if (SubVectorOps.size() == 2 &&
56161 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56162 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56163 getZeroVector(OpVT, Subtarget, DAG, dl),
56164 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56165
56166 // Attempt to recursively combine to a shuffle.
56167 if (all_of(SubVectorOps, [](SDValue SubOp) {
56168 return isTargetShuffle(SubOp.getOpcode());
56169 })) {
56170 SDValue Op(N, 0);
56171 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56172 return Res;
56173 }
56174 }
56175
56176 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56177 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56178 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56179
56180 // If this is a broadcast load inserted into an upper undef, use a larger
56181 // broadcast load.
56182 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56183 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56184 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56185 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56186 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56187 SDValue BcastLd =
56189 MemIntr->getMemoryVT(),
56190 MemIntr->getMemOperand());
56191 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56192 return BcastLd;
56193 }
56194
56195 // If we're splatting the lower half subvector of a full vector load into the
56196 // upper half, attempt to create a subvector broadcast.
56197 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56198 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56199 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56200 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56201 if (VecLd && SubLd &&
56202 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56203 SubVec.getValueSizeInBits() / 8, 0))
56204 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56205 SubLd, 0, DAG);
56206 }
56207
56208 return SDValue();
56209}
56210
56211/// If we are extracting a subvector of a vector select and the select condition
56212/// is composed of concatenated vectors, try to narrow the select width. This
56213/// is a common pattern for AVX1 integer code because 256-bit selects may be
56214/// legal, but there is almost no integer math/logic available for 256-bit.
56215/// This function should only be called with legal types (otherwise, the calls
56216/// to get simple value types will assert).
56217static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56218                                           SelectionDAG &DAG) {
56219 SDValue Sel = Ext->getOperand(0);
56220 if (Sel.getOpcode() != ISD::VSELECT ||
56221 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56222 return SDValue();
56223
56224 // Note: We assume simple value types because this should only be called with
56225 // legal operations/types.
56226 // TODO: This can be extended to handle extraction to 256-bits.
56227 MVT VT = Ext->getSimpleValueType(0);
56228 if (!VT.is128BitVector())
56229 return SDValue();
56230
56231 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56232 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56233 return SDValue();
56234
56235 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56236 MVT SelVT = Sel.getSimpleValueType();
56237 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56238 "Unexpected vector type with legal operations");
56239
56240 unsigned SelElts = SelVT.getVectorNumElements();
56241 unsigned CastedElts = WideVT.getVectorNumElements();
56242 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56243 if (SelElts % CastedElts == 0) {
56244 // The select has the same or more (narrower) elements than the extract
56245 // operand. The extraction index gets scaled by that factor.
56246 ExtIdx *= (SelElts / CastedElts);
56247 } else if (CastedElts % SelElts == 0) {
56248    // The select has fewer (wider) elements than the extract operand. Make sure
56249 // that the extraction index can be divided evenly.
56250 unsigned IndexDivisor = CastedElts / SelElts;
56251 if (ExtIdx % IndexDivisor != 0)
56252 return SDValue();
56253 ExtIdx /= IndexDivisor;
56254 } else {
56255 llvm_unreachable("Element count of simple vector types are not divisible?");
56256 }
56257
56258 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56259 unsigned NarrowElts = SelElts / NarrowingFactor;
56260 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56261 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56262 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56263 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56264 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56265 return DAG.getBitcast(VT, NarrowSel);
56266}
56267
56268static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56269                                        TargetLowering::DAGCombinerInfo &DCI,
56270                                        const X86Subtarget &Subtarget) {
56271 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56272 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56273 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56274 // We let generic combining take over from there to simplify the
56275 // insert/extract and 'not'.
56276 // This pattern emerges during AVX1 legalization. We handle it before lowering
56277 // to avoid complications like splitting constant vector loads.
56278
56279 // Capture the original wide type in the likely case that we need to bitcast
56280 // back to this type.
56281 if (!N->getValueType(0).isSimple())
56282 return SDValue();
56283
56284 MVT VT = N->getSimpleValueType(0);
56285 SDValue InVec = N->getOperand(0);
56286 unsigned IdxVal = N->getConstantOperandVal(1);
56287 SDValue InVecBC = peekThroughBitcasts(InVec);
56288 EVT InVecVT = InVec.getValueType();
56289 unsigned SizeInBits = VT.getSizeInBits();
56290 unsigned InSizeInBits = InVecVT.getSizeInBits();
56291 unsigned NumSubElts = VT.getVectorNumElements();
56292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56293 SDLoc DL(N);
56294
56295 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56296 TLI.isTypeLegal(InVecVT) &&
56297 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56298 auto isConcatenatedNot = [](SDValue V) {
56299 V = peekThroughBitcasts(V);
56300 if (!isBitwiseNot(V))
56301 return false;
56302 SDValue NotOp = V->getOperand(0);
56303      return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56304    };
56305 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56306 isConcatenatedNot(InVecBC.getOperand(1))) {
56307 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56308 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
56309 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56310 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56311 }
56312 }
56313
56314 if (DCI.isBeforeLegalizeOps())
56315 return SDValue();
56316
56317 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
56318 return V;
56319
56320  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56321    return getZeroVector(VT, Subtarget, DAG, DL);
56322
56323 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56324 if (VT.getScalarType() == MVT::i1)
56325 return DAG.getConstant(1, DL, VT);
56326 return getOnesVector(VT, DAG, DL);
56327 }
56328
56329 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56330 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
56331
56332 // If we are extracting from an insert into a larger vector, replace with a
56333 // smaller insert if we don't access less than the original subvector. Don't
56334 // do this for i1 vectors.
56335 // TODO: Relax the matching indices requirement?
56336 if (VT.getVectorElementType() != MVT::i1 &&
56337 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56338 IdxVal == InVec.getConstantOperandVal(2) &&
56339 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56340 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56341 InVec.getOperand(0), N->getOperand(1));
56342 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56343 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56344 InVec.getOperand(1),
56345 DAG.getVectorIdxConstant(NewIdxVal, DL));
56346 }
56347
56348  // If we're extracting an upper subvector from a broadcast, we should just
56349  // extract the lowest subvector instead, which should allow
56350  // SimplifyDemandedVectorElts to do more simplifications.
56351 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56352                      InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56353                      DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56354 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56355
56356 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56357 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56358 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56359 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56360
56361 // Attempt to extract from the source of a shuffle vector.
56362 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56363 SmallVector<int, 32> ShuffleMask;
56364 SmallVector<int, 32> ScaledMask;
56365 SmallVector<SDValue, 2> ShuffleInputs;
56366 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56367    // Decode the shuffle mask and scale it so it shuffles subvectors.
56368 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56369 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56370 unsigned SubVecIdx = IdxVal / NumSubElts;
56371 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56372 return DAG.getUNDEF(VT);
56373 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56374 return getZeroVector(VT, Subtarget, DAG, DL);
56375 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56376 if (Src.getValueSizeInBits() == InSizeInBits) {
56377 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56378 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56379 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56380 DL, SizeInBits);
56381 }
56382 }
56383 }
56384
56385 auto IsExtractFree = [](SDValue V) {
56386 V = peekThroughBitcasts(V);
56387 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
56388 return true;
56389    if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
56390      return true;
56391 return V.isUndef();
56392 };
56393
56394 // If we're extracting the lowest subvector and we're the only user,
56395 // we may be able to perform this with a smaller vector width.
56396 unsigned InOpcode = InVec.getOpcode();
56397 if (InVec.hasOneUse()) {
56398 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56399 // v2f64 CVTDQ2PD(v4i32).
56400 if (InOpcode == ISD::SINT_TO_FP &&
56401 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56402 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
56403 }
56404 // v2f64 CVTUDQ2PD(v4i32).
56405 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56406 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56407 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
56408 }
56409 // v2f64 CVTPS2PD(v4f32).
56410 if (InOpcode == ISD::FP_EXTEND &&
56411 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56412 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
56413 }
56414 }
56415 // v4i32 CVTPS2DQ(v4f32).
56416 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
56417 SDValue Src = InVec.getOperand(0);
56418 if (Src.getValueType().getScalarType() == MVT::f32)
56419 return DAG.getNode(InOpcode, DL, VT,
56420 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
56421 }
56422 if (IdxVal == 0 &&
56423 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
56424 (SizeInBits == 128 || SizeInBits == 256) &&
56425 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56426 SDValue Ext = InVec.getOperand(0);
56427 if (Ext.getValueSizeInBits() > SizeInBits)
56428 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56429 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56430 return DAG.getNode(ExtOp, DL, VT, Ext);
56431 }
56432 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56433 InVec.getOperand(0).getValueType().is256BitVector() &&
56434 InVec.getOperand(1).getValueType().is256BitVector() &&
56435 InVec.getOperand(2).getValueType().is256BitVector()) {
56436 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56437 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56438 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56439 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56440 }
56441 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56442 (SizeInBits == 128 || SizeInBits == 256)) {
56443 SDValue InVecSrc = InVec.getOperand(0);
56444 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56445 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56446 return DAG.getNode(InOpcode, DL, VT, Ext);
56447 }
56448 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
56449 InOpcode == X86ISD::PCMPGT) &&
56450 (IsExtractFree(InVec.getOperand(0)) ||
56451 IsExtractFree(InVec.getOperand(1))) &&
56452 SizeInBits == 128) {
56453 SDValue Ext0 =
56454 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56455 SDValue Ext1 =
56456 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
56457 if (InOpcode == X86ISD::CMPP)
56458 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
56459 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
56460 }
56461 if (InOpcode == X86ISD::MOVDDUP &&
56462 (SizeInBits == 128 || SizeInBits == 256)) {
56463 SDValue Ext0 =
56464 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56465 return DAG.getNode(InOpcode, DL, VT, Ext0);
56466 }
56467 }
56468
56469  // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
56470 // as this is very likely to fold into a shuffle/truncation.
56471 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56472 InVecVT.getScalarSizeInBits() == 64 &&
56473 InVec.getConstantOperandAPInt(1) == 32) {
56474 SDValue Ext =
56475 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56476 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56477 }
56478
56479 return SDValue();
56480}
56481
56482static SDValue combineSCALAR_TO_VECTOR(SDNode *N, SelectionDAG &DAG) {
56483  EVT VT = N->getValueType(0);
56484 SDValue Src = N->getOperand(0);
56485 SDLoc DL(N);
56486
56487 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56488 // This occurs frequently in our masked scalar intrinsic code and our
56489 // floating point select lowering with AVX512.
56490 // TODO: SimplifyDemandedBits instead?
56491 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56492 isOneConstant(Src.getOperand(1)))
56493 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56494
56495 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56496 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56497 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56498 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56499 isNullConstant(Src.getOperand(1)))
56500 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56501 Src.getOperand(1));
56502
56503 // Reduce v2i64 to v4i32 if we don't need the upper bits or are known zero.
56504 // TODO: Move to DAGCombine/SimplifyDemandedBits?
56505 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56506 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56507 if (Op.getValueType() != MVT::i64)
56508 return SDValue();
56509 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56510 if (Op.getOpcode() == Opc &&
56511 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56512 return Op.getOperand(0);
56513 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56514 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56515 if (Ld->getExtensionType() == Ext &&
56516 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56517 return Op;
56518 if (IsZeroExt) {
56519 KnownBits Known = DAG.computeKnownBits(Op);
56520 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56521 return Op;
56522 }
56523 return SDValue();
56524 };
56525
56526 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56527 return DAG.getBitcast(
56528 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56529 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56530
56531 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56532 return DAG.getBitcast(
56533 VT,
56534 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56535 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56536 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56537 }
56538
56539 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56540 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56541 Src.getOperand(0).getValueType() == MVT::x86mmx)
56542 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56543
56544 // See if we're broadcasting the scalar value, in which case just reuse that.
56545 // Ensure the same SDValue from the SDNode use is being used.
56546 if (VT.getScalarType() == Src.getValueType())
56547 for (SDNode *User : Src->uses())
56548 if (User->getOpcode() == X86ISD::VBROADCAST &&
56549 Src == User->getOperand(0)) {
56550 unsigned SizeInBits = VT.getFixedSizeInBits();
56551 unsigned BroadcastSizeInBits =
56552 User->getValueSizeInBits(0).getFixedValue();
56553 if (BroadcastSizeInBits == SizeInBits)
56554 return SDValue(User, 0);
56555 if (BroadcastSizeInBits > SizeInBits)
56556 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
56557 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
56558 // coverage.
56559 }
56560
56561 return SDValue();
56562}
56563
56564// Simplify PMULDQ and PMULUDQ operations.
56565static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
56566                             TargetLowering::DAGCombinerInfo &DCI,
56567                             const X86Subtarget &Subtarget) {
56568 SDValue LHS = N->getOperand(0);
56569 SDValue RHS = N->getOperand(1);
56570
56571 // Canonicalize constant to RHS.
56572  if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
56573      !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
56574    return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
56575
56576 // Multiply by zero.
56577 // Don't return RHS as it may contain UNDEFs.
56578 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
56579 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
56580
56581 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
56582 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56583 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
56584 return SDValue(N, 0);
56585
56586 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
56587 // convert it to any_extend_invec, due to the LegalOperations check, do the
56588 // conversion directly to a vector shuffle manually. This exposes combine
56589 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
56590 // combineX86ShufflesRecursively on SSE4.1 targets.
56591 // FIXME: This is basically a hack around several other issues related to
56592 // ANY_EXTEND_VECTOR_INREG.
56593 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
56594 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56595 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56596 LHS.getOperand(0).getValueType() == MVT::v4i32) {
56597 SDLoc dl(N);
56598 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
56599 LHS.getOperand(0), { 0, -1, 1, -1 });
56600 LHS = DAG.getBitcast(MVT::v2i64, LHS);
56601 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56602 }
56603 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
56604 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56605 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56606 RHS.getOperand(0).getValueType() == MVT::v4i32) {
56607 SDLoc dl(N);
56608 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
56609 RHS.getOperand(0), { 0, -1, 1, -1 });
56610 RHS = DAG.getBitcast(MVT::v2i64, RHS);
56611 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56612 }
56613
56614 return SDValue();
56615}
56616
56617// Simplify VPMADDUBSW/VPMADDWD operations.
56618static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
56619                             TargetLowering::DAGCombinerInfo &DCI) {
56620  EVT VT = N->getValueType(0);
56621 SDValue LHS = N->getOperand(0);
56622 SDValue RHS = N->getOperand(1);
56623
56624 // Multiply by zero.
56625 // Don't return LHS/RHS as it may contain UNDEFs.
56626 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56627      ISD::isBuildVectorAllZeros(RHS.getNode()))
56628    return DAG.getConstant(0, SDLoc(N), VT);
56629
56630 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56631 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56632 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56633 return SDValue(N, 0);
56634
56635 return SDValue();
56636}
56637
56638static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56639                                          TargetLowering::DAGCombinerInfo &DCI,
56640                                          const X86Subtarget &Subtarget) {
56641 EVT VT = N->getValueType(0);
56642 SDValue In = N->getOperand(0);
56643 unsigned Opcode = N->getOpcode();
56644 unsigned InOpcode = In.getOpcode();
56645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56646 SDLoc DL(N);
56647
56648 // Try to merge vector loads and extend_inreg to an extload.
56649 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56650 In.hasOneUse()) {
56651 auto *Ld = cast<LoadSDNode>(In);
56652 if (Ld->isSimple()) {
56653 MVT SVT = In.getSimpleValueType().getVectorElementType();
56654      ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56655                                 ? ISD::SEXTLOAD
56656                                 : ISD::ZEXTLOAD;
56657 EVT MemVT = VT.changeVectorElementType(SVT);
56658 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56659 SDValue Load = DAG.getExtLoad(
56660 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56661 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56662 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56663 return Load;
56664 }
56665 }
56666 }
56667
56668 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56669 if (Opcode == InOpcode)
56670 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56671
56672 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56673 // -> EXTEND_VECTOR_INREG(X).
56674 // TODO: Handle non-zero subvector indices.
56675 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56676 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56677 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56678 In.getValueSizeInBits())
56679 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56680
56681 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56682 // TODO: Move to DAGCombine?
56683 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56684 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56685 In.getValueSizeInBits() == VT.getSizeInBits()) {
56686 unsigned NumElts = VT.getVectorNumElements();
56687 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56688 EVT EltVT = In.getOperand(0).getValueType();
56689 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56690 for (unsigned I = 0; I != NumElts; ++I)
56691 Elts[I * Scale] = In.getOperand(I);
56692 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56693 }
56694
56695 // Attempt to combine as a shuffle on SSE41+ targets.
56696 if (Subtarget.hasSSE41()) {
56697 SDValue Op(N, 0);
56698 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56699 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56700 return Res;
56701 }
56702
56703 return SDValue();
56704}
56705
56706static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56707                             TargetLowering::DAGCombinerInfo &DCI) {
56708  EVT VT = N->getValueType(0);
56709
56710 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56711 return DAG.getConstant(0, SDLoc(N), VT);
56712
56713 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56714 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56715 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56716 return SDValue(N, 0);
56717
56718 return SDValue();
56719}
56720
56721// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56722// Done as a combine because the lowering for fp16_to_fp and fp_to_fp16 produces
56723// extra instructions between the conversions due to going to scalar and back.
56724static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56725                                 const X86Subtarget &Subtarget) {
56726 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56727 return SDValue();
56728
56729 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56730 return SDValue();
56731
56732 if (N->getValueType(0) != MVT::f32 ||
56733 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56734 return SDValue();
56735
56736 SDLoc dl(N);
56737 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56738 N->getOperand(0).getOperand(0));
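  // CVTPS2PH immediate 4 sets bit 2, i.e. round using the current MXCSR
  // rounding mode.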
56739 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56740 DAG.getTargetConstant(4, dl, MVT::i32));
56741 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56742 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56743 DAG.getIntPtrConstant(0, dl));
56744}
56745
56746static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56747                                TargetLowering::DAGCombinerInfo &DCI,
56748                                const X86Subtarget &Subtarget) {
56749 EVT VT = N->getValueType(0);
56750 bool IsStrict = N->isStrictFPOpcode();
56751 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56752 EVT SrcVT = Src.getValueType();
56753
56754 SDLoc dl(N);
56755 if (SrcVT.getScalarType() == MVT::bf16) {
56756 if (DCI.isAfterLegalizeDAG() && Src.getOpcode() == ISD::FP_ROUND &&
56757 !IsStrict && Src.getOperand(0).getValueType() == VT)
56758 return Src.getOperand(0);
56759
56760 if (!SrcVT.isVector())
56761 return SDValue();
56762
56763 assert(!IsStrict && "Strict FP doesn't support BF16");
56764 if (VT.getVectorElementType() == MVT::f64) {
56765 MVT TmpVT = VT.getSimpleVT().changeVectorElementType(MVT::f32);
56766 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
56767 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
56768 }
56769 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
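    // bf16 is the high 16 bits of an f32 bit pattern: widen to i32, shift left
    // by 16 and bitcast back to the f32 vector type.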
56770 MVT NVT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i32);
56771 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
56772 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
56773 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
56774 return DAG.getBitcast(VT, Src);
56775 }
56776
56777 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56778 return SDValue();
56779
56780 if (Subtarget.hasFP16())
56781 return SDValue();
56782
56783 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56784 return SDValue();
56785
56786 if (VT.getVectorElementType() != MVT::f32 &&
56787 VT.getVectorElementType() != MVT::f64)
56788 return SDValue();
56789
56790 unsigned NumElts = VT.getVectorNumElements();
56791 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56792 return SDValue();
56793
56794 // Convert the input to vXi16.
56795 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56796 Src = DAG.getBitcast(IntVT, Src);
56797
56798 // Widen to at least 8 input elements.
56799 if (NumElts < 8) {
56800 unsigned NumConcats = 8 / NumElts;
56801 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56802 : DAG.getConstant(0, dl, IntVT);
56803 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56804 Ops[0] = Src;
56805 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56806 }
56807
56808 // Destination is vXf32 with at least 4 elements.
56809 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56810 std::max(4U, NumElts));
56811 SDValue Cvt, Chain;
56812 if (IsStrict) {
56813 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56814 {N->getOperand(0), Src});
56815 Chain = Cvt.getValue(1);
56816 } else {
56817 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56818 }
56819
56820 if (NumElts < 4) {
56821 assert(NumElts == 2 && "Unexpected size");
56822 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56823 DAG.getIntPtrConstant(0, dl));
56824 }
56825
56826 if (IsStrict) {
56827 // Extend to the original VT if necessary.
56828 if (Cvt.getValueType() != VT) {
56829 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56830 {Chain, Cvt});
56831 Chain = Cvt.getValue(1);
56832 }
56833 return DAG.getMergeValues({Cvt, Chain}, dl);
56834 }
56835
56836 // Extend to the original VT if necessary.
56837 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56838}
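// Rough sketch of the F16C path above for a v4f16 -> v4f32 fp_extend
// (assuming no AVX512-FP16; t0..t2 are labels for this example only):
//   t0 = bitcast %src to v4i16
//   t1 = concat_vectors t0, undef    ; widen to v8i16 (undef fill, NumElts == 4)
//   t2 = X86ISD::CVTPH2PS t1         ; v4f32
//   result = t2
// A v2f16 source is widened with zeroes instead, converted to v4f32,
// extracted back down to v2f32, and fp_extended again if the original
// result type was v2f64.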
56839
56840// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56841// from. Limit this to cases where the loads have the same input chain and the
56842// output chains are unused. This avoids any memory ordering issues.
56843static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56844                                     TargetLowering::DAGCombinerInfo &DCI) {
56845 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56846 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56847 "Unknown broadcast load type");
56848
56849 // Only do this if the chain result is unused.
56850 if (N->hasAnyUseOfValue(1))
56851 return SDValue();
56852
56853 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56854
56855 SDValue Ptr = MemIntrin->getBasePtr();
56856 SDValue Chain = MemIntrin->getChain();
56857 EVT VT = N->getSimpleValueType(0);
56858 EVT MemVT = MemIntrin->getMemoryVT();
56859
56860 // Look at other users of our base pointer and try to find a wider broadcast.
56861 // The input chain and the size of the memory VT must match.
56862 for (SDNode *User : Ptr->uses())
56863 if (User != N && User->getOpcode() == N->getOpcode() &&
56864 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56865 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56866 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56867 MemVT.getSizeInBits() &&
56868 !User->hasAnyUseOfValue(1) &&
56869 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56870 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56871 VT.getSizeInBits());
56872 Extract = DAG.getBitcast(VT, Extract);
56873 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56874 }
56875
56876 return SDValue();
56877}
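// Example of the reuse performed above (a sketch): given two broadcast
// loads of the same element from the same pointer and chain,
//   small = X86ISD::VBROADCAST_LOAD<i32> ptr   ; v4i32
//   wide  = X86ISD::VBROADCAST_LOAD<i32> ptr   ; v8i32
// the narrower node is rewritten to
//   extract_subvector wide, 0                  ; v4i32
// which is only done here because neither node's chain result has other
// users, so no memory ordering edges are lost.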
56878
56879static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56880                               const X86Subtarget &Subtarget) {
56881 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56882 return SDValue();
56883
56884 bool IsStrict = N->isStrictFPOpcode();
56885 EVT VT = N->getValueType(0);
56886 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56887 EVT SrcVT = Src.getValueType();
56888
56889 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56890 SrcVT.getVectorElementType() != MVT::f32)
56891 return SDValue();
56892
56893 SDLoc dl(N);
56894
56895 SDValue Cvt, Chain;
56896 unsigned NumElts = VT.getVectorNumElements();
56897 if (Subtarget.hasFP16()) {
56898 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
56899 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
56900 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
56901 SDValue Cvt0, Cvt1;
56902 SDValue Op0 = Src.getOperand(0);
56903 SDValue Op1 = Src.getOperand(1);
56904 bool IsOp0Strict = Op0->isStrictFPOpcode();
56905 if (Op0.getOpcode() != Op1.getOpcode() ||
56906 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
56907 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
56908 return SDValue();
56909 }
56910 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
56911 if (IsStrict) {
56912 assert(IsOp0Strict && "Op0 must be strict node");
56913 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
56914 ? X86ISD::STRICT_CVTSI2P
56915 : X86ISD::STRICT_CVTUI2P;
56916 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56917 {Op0.getOperand(0), Op0.getOperand(1)});
56918 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56919 {Op1.getOperand(0), Op1.getOperand(1)});
56920 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56921 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
56922 }
56923 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
56924 : X86ISD::CVTUI2P;
56925 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
56926 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
56927 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56928 }
56929 return SDValue();
56930 }
56931
56932 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56933 return SDValue();
56934
56935 // Widen to at least 4 input elements.
56936 if (NumElts < 4)
56937 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56938 DAG.getConstantFP(0.0, dl, SrcVT));
56939
56940 // Destination is v8i16 with at least 8 elements.
56941 EVT CvtVT =
56942 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
56943 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56944 if (IsStrict) {
56945 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56946 {N->getOperand(0), Src, Rnd});
56947 Chain = Cvt.getValue(1);
56948 } else {
56949 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56950 }
56951
56952 // Extract down to real number of elements.
56953 if (NumElts < 8) {
56954 EVT IntVT = VT.changeVectorElementTypeToInteger();
56955 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56956 DAG.getIntPtrConstant(0, dl));
56957 }
56958
56959 Cvt = DAG.getBitcast(VT, Cvt);
56960
56961 if (IsStrict)
56962 return DAG.getMergeValues({Cvt, Chain}, dl);
56963
56964 return Cvt;
56965}
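// Rough sketch of the F16C path above for a v4f32 -> v4f16 fp_round
// (assuming no AVX512-FP16):
//   t0 = X86ISD::CVTPS2PH %src, TargetConstant:4 ; v8i16, imm 4 = MXCSR rounding
//   t1 = extract_subvector t0, 0                 ; v4i16, since NumElts < 8
//   result = bitcast t1 to v4f16
// Sources with fewer than four elements are first widened with zeroes.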
56966
56967static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56968 SDValue Src = N->getOperand(0);
56969
56970 // Turn MOVDQ2Q+simple_load into an mmx load.
56971 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56972 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56973
56974 if (LN->isSimple()) {
56975 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56976 LN->getBasePtr(),
56977 LN->getPointerInfo(),
56978 LN->getOriginalAlign(),
56979 LN->getMemOperand()->getFlags());
56980 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56981 return NewLd;
56982 }
56983 }
56984
56985 return SDValue();
56986}
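// Sketch of the combine above: for a single-use, simple (non-atomic,
// non-volatile) load feeding MOVDQ2Q,
//   (x86mmx (MOVDQ2Q (v2i64 (load ptr))))
// becomes a direct x86mmx load of ptr, and users of the old load's chain
// are redirected to the new load's chain.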
56987
56988static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56989                           TargetLowering::DAGCombinerInfo &DCI) {
56990 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56991 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56992 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56993 return SDValue(N, 0);
56994
56995 return SDValue();
56996}
56997
56998SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56999 DAGCombinerInfo &DCI) const {
57000 SelectionDAG &DAG = DCI.DAG;
57001 switch (N->getOpcode()) {
57002 // clang-format off
57003 default: break;
57004 case ISD::SCALAR_TO_VECTOR:
57005 return combineScalarToVector(N, DAG);
57006 case ISD::EXTRACT_VECTOR_ELT:
57007 case X86ISD::PEXTRW:
57008 case X86ISD::PEXTRB:
57009 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57010 case ISD::CONCAT_VECTORS:
57011 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57012 case ISD::INSERT_SUBVECTOR:
57013 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57014 case ISD::EXTRACT_SUBVECTOR:
57015 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57016 case ISD::VSELECT:
57017 case ISD::SELECT:
57018 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57019 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57020 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57021 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
57022 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57023 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57024 case X86ISD::ADD:
57025 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57026 case X86ISD::SBB: return combineSBB(N, DAG);
57027 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57028 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57029 case ISD::SHL: return combineShiftLeft(N, DAG);
57030 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57031 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57032 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57033 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57034 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57035 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
57036 case X86ISD::BEXTR:
57037 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57038 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57039 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57040 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57041 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57042 case X86ISD::VEXTRACT_STORE:
57043 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57044 case ISD::SINT_TO_FP:
57045 case ISD::STRICT_SINT_TO_FP:
57046 return combineSIntToFP(N, DAG, DCI, Subtarget);
57047 case ISD::UINT_TO_FP:
57048 case ISD::STRICT_UINT_TO_FP:
57049 return combineUIntToFP(N, DAG, Subtarget);
57050 case ISD::LRINT:
57051 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
57052 case ISD::FADD:
57053 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57054 case X86ISD::VFCMULC:
57055 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57056 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57057 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57058 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57059 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57060 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57061 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57062 case X86ISD::FXOR:
57063 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57064 case X86ISD::FMIN:
57065 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57066 case ISD::FMINNUM:
57067 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57068 case X86ISD::CVTSI2P:
57069 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57070 case X86ISD::CVTP2SI:
57071 case X86ISD::CVTP2UI:
57072 case X86ISD::STRICT_CVTTP2SI:
57073 case X86ISD::CVTTP2SI:
57074 case X86ISD::STRICT_CVTTP2UI:
57075 case X86ISD::CVTTP2UI:
57076 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57077 case X86ISD::STRICT_CVTPH2PS:
57078 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57079 case X86ISD::BT: return combineBT(N, DAG, DCI);
57080 case ISD::ANY_EXTEND:
57081 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57082 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57083 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57084 case ISD::ANY_EXTEND_VECTOR_INREG:
57085 case ISD::SIGN_EXTEND_VECTOR_INREG:
57086 case ISD::ZERO_EXTEND_VECTOR_INREG:
57087 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57088 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57089 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57090 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57091 case X86ISD::PACKSS:
57092 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57093 case X86ISD::HADD:
57094 case X86ISD::HSUB:
57095 case X86ISD::FHADD:
57096 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57097 case X86ISD::VSHL:
57098 case X86ISD::VSRA:
57099 case X86ISD::VSRL:
57100 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57101 case X86ISD::VSHLI:
57102 case X86ISD::VSRAI:
57103 case X86ISD::VSRLI:
57104 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57105 case ISD::INSERT_VECTOR_ELT:
57106 case X86ISD::PINSRB:
57107 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57108 case X86ISD::SHUFP: // Handle all target specific shuffles
57109 case X86ISD::INSERTPS:
57110 case X86ISD::EXTRQI:
57111 case X86ISD::INSERTQI:
57112 case X86ISD::VALIGN:
57113 case X86ISD::PALIGNR:
57114 case X86ISD::VSHLDQ:
57115 case X86ISD::VSRLDQ:
57116 case X86ISD::BLENDI:
57117 case X86ISD::UNPCKH:
57118 case X86ISD::UNPCKL:
57119 case X86ISD::MOVHLPS:
57120 case X86ISD::MOVLHPS:
57121 case X86ISD::PSHUFB:
57122 case X86ISD::PSHUFD:
57123 case X86ISD::PSHUFHW:
57124 case X86ISD::PSHUFLW:
57125 case X86ISD::MOVSHDUP:
57126 case X86ISD::MOVSLDUP:
57127 case X86ISD::MOVDDUP:
57128 case X86ISD::MOVSS:
57129 case X86ISD::MOVSD:
57130 case X86ISD::MOVSH:
57131 case X86ISD::VBROADCAST:
57132 case X86ISD::VPPERM:
57133 case X86ISD::VPERMI:
57134 case X86ISD::VPERMV:
57135 case X86ISD::VPERMV3:
57136 case X86ISD::VPERMIL2:
57137 case X86ISD::VPERMILPI:
57138 case X86ISD::VPERMILPV:
57139 case X86ISD::VPERM2X128:
57140 case X86ISD::SHUF128:
57141 case X86ISD::VZEXT_MOVL:
57142 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57143 case X86ISD::FMADD_RND:
57144 case X86ISD::FMSUB:
57145 case X86ISD::STRICT_FMSUB:
57146 case X86ISD::FMSUB_RND:
57147 case X86ISD::FNMADD:
57148 case X86ISD::STRICT_FNMADD:
57149 case X86ISD::FNMADD_RND:
57150 case X86ISD::FNMSUB:
57151 case X86ISD::STRICT_FNMSUB:
57152 case X86ISD::FNMSUB_RND:
57153 case ISD::FMA:
57154 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57155 case X86ISD::FMADDSUB_RND:
57156 case X86ISD::FMSUBADD_RND:
57157 case X86ISD::FMADDSUB:
57158 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57159 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57160 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57161 case X86ISD::MGATHER:
57162 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
57163 case ISD::MGATHER:
57164 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57165 case X86ISD::PCMPEQ:
57166 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57167 case X86ISD::PMULDQ:
57168 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57169 case X86ISD::VPMADDUBSW:
57170 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57171 case X86ISD::KSHIFTL:
57172 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57173 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57174 case ISD::STRICT_FP_EXTEND:
57175 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, DCI, Subtarget);
57176 case ISD::STRICT_FP_ROUND:
57177 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57178 case X86ISD::VBROADCAST_LOAD:
57179 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57180 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57181 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57182 // clang-format on
57183 }
57184
57185 return SDValue();
57186}
57187
57188bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57189 return false;
57190}
57191
57192// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57193bool X86TargetLowering::preferSextInRegOfTruncate(EVT TruncVT, EVT VT,
57194                                                  EVT ExtVT) const {
57195 return Subtarget.hasAVX512() || !VT.isVector();
57196}
57197
57198bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57199 if (!isTypeLegal(VT))
57200 return false;
57201
57202 // There are no vXi8 shifts.
57203 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57204 return false;
57205
57206 // TODO: Almost no 8-bit ops are desirable because they have no actual
57207 // size/speed advantages vs. 32-bit ops, but they do have a major
57208 // potential disadvantage by causing partial register stalls.
57209 //
57210 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57211 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57212 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57213 // check for a constant operand to the multiply.
57214 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57215 return false;
57216
57217 // i16 instruction encodings are longer and some i16 instructions are slow,
57218 // so those are not desirable.
57219 if (VT == MVT::i16) {
57220 switch (Opc) {
57221 default:
57222 break;
57223 case ISD::LOAD:
57224 case ISD::SIGN_EXTEND:
57225 case ISD::ZERO_EXTEND:
57226 case ISD::ANY_EXTEND:
57227 case ISD::SHL:
57228 case ISD::SRA:
57229 case ISD::SRL:
57230 case ISD::SUB:
57231 case ISD::ADD:
57232 case ISD::MUL:
57233 case ISD::AND:
57234 case ISD::OR:
57235 case ISD::XOR:
57236 return false;
57237 }
57238 }
57239
57240 // Any legal type not explicitly accounted for above here is desirable.
57241 return true;
57242}
57243
57244SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57245                                                  SDValue Value, SDValue Addr,
57246                                                  int JTI,
57247 SelectionDAG &DAG) const {
57248 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57249 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57250 if (IsCFProtectionSupported) {
57251 // In case control-flow branch protection is enabled, we need to add a
57252 // notrack prefix to the indirect branch.
57253 // In order to do that we create an NT_BRIND SDNode.
57254 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
57255 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57256 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57257 }
57258
57259 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57260}
57261
57262TargetLowering::AndOrSETCCFoldKind
57263X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57264 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57265 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57266 EVT VT = LogicOp->getValueType(0);
57267 EVT OpVT = SETCC0->getOperand(0).getValueType();
57268 if (!VT.isInteger())
57269 return AndOrSETCCFoldKind::None;
57270
57271 if (VT.isVector())
57272 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57273 (isOperationLegal(ISD::ABS, OpVT)
57274 ? AndOrSETCCFoldKind::ABS
57275 : AndOrSETCCFoldKind::None));
57276
57277 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57278 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57279 // `NotAnd` applies, `AddAnd` does as well.
57280 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57281 // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57282 return AndOrSETCCFoldKind::AddAnd;
57283}
57284
57285bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57286 EVT VT = Op.getValueType();
57287 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57288 isa<ConstantSDNode>(Op.getOperand(1));
57289
57290 // i16 is legal, but undesirable since i16 instruction encodings are longer
57291 // and some i16 instructions are slow.
57292 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57293 // using LEA and/or other ALU ops.
57294 if (VT != MVT::i16 && !Is8BitMulByConstant)
57295 return false;
57296
57297 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57298 if (!Op.hasOneUse())
57299 return false;
57300 SDNode *User = *Op->use_begin();
57301 if (User->getOpcode() != ISD::STORE)
57302 return false;
57303 auto *Ld = cast<LoadSDNode>(Load);
57304 auto *St = cast<StoreSDNode>(User);
57305 return Ld->getBasePtr() == St->getBasePtr();
57306 };
57307
57308 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57309 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57310 return false;
57311 if (!Op.hasOneUse())
57312 return false;
57313 SDNode *User = *Op->use_begin();
57314 if (User->getOpcode() != ISD::ATOMIC_STORE)
57315 return false;
57316 auto *Ld = cast<AtomicSDNode>(Load);
57317 auto *St = cast<AtomicSDNode>(User);
57318 return Ld->getBasePtr() == St->getBasePtr();
57319 };
57320
57321 bool Commute = false;
57322 switch (Op.getOpcode()) {
57323 default: return false;
57324 case ISD::SIGN_EXTEND:
57325 case ISD::ZERO_EXTEND:
57326 case ISD::ANY_EXTEND:
57327 break;
57328 case ISD::SHL:
57329 case ISD::SRA:
57330 case ISD::SRL: {
57331 SDValue N0 = Op.getOperand(0);
57332 // Look out for (store (shl (load), x)).
57333 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57334 return false;
57335 break;
57336 }
57337 case ISD::ADD:
57338 case ISD::MUL:
57339 case ISD::AND:
57340 case ISD::OR:
57341 case ISD::XOR:
57342 Commute = true;
57343 [[fallthrough]];
57344 case ISD::SUB: {
57345 SDValue N0 = Op.getOperand(0);
57346 SDValue N1 = Op.getOperand(1);
57347 // Avoid disabling potential load folding opportunities.
57348 if (X86::mayFoldLoad(N1, Subtarget) &&
57349 (!Commute || !isa<ConstantSDNode>(N0) ||
57350 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57351 return false;
57352 if (X86::mayFoldLoad(N0, Subtarget) &&
57353 ((Commute && !isa<ConstantSDNode>(N1)) ||
57354 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57355 return false;
57356 if (IsFoldableAtomicRMW(N0, Op) ||
57357 (Commute && IsFoldableAtomicRMW(N1, Op)))
57358 return false;
57359 }
57360 }
57361
57362 PVT = MVT::i32;
57363 return true;
57364}
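// Example of the promotion decided above: a 16-bit operation such as
//   (i16 add %a, %b)
// reports PVT = MVT::i32, so the generic combiner performs the add in
// 32 bits and truncates the result back to i16, avoiding the 66h
// operand-size prefix and potential partial-register stalls, except where
// that would break a profitable load-fold or read-modify-write pattern as
// checked above.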
57365
57366//===----------------------------------------------------------------------===//
57367// X86 Inline Assembly Support
57368//===----------------------------------------------------------------------===//
57369
57370// Helper to match a string separated by whitespace.
57371static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57372 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57373
57374 for (StringRef Piece : Pieces) {
57375 if (!S.starts_with(Piece)) // Check if the piece matches.
57376 return false;
57377
57378 S = S.substr(Piece.size());
57379 StringRef::size_type Pos = S.find_first_not_of(" \t");
57380 if (Pos == 0) // We matched a prefix.
57381 return false;
57382
57383 S = S.substr(Pos);
57384 }
57385
57386 return S.empty();
57387}
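// For example, matchAsm("  bswap $0", {"bswap", "$0"}) returns true, while
// matchAsm("bswapl $0", {"bswap", "$0"}) returns false: after consuming
// "bswap" the leftover "l" is not preceded by whitespace, so only a prefix
// of the first token was matched.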
57388
57389static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57390
57391 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57392 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57393 llvm::is_contained(AsmPieces, "~{flags}") &&
57394 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57395
57396 if (AsmPieces.size() == 3)
57397 return true;
57398 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57399 return true;
57400 }
57401 }
57402 return false;
57403}
57404
57405bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57406 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57407
57408 const std::string &AsmStr = IA->getAsmString();
57409
57410 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57411 if (!Ty || Ty->getBitWidth() % 16 != 0)
57412 return false;
57413
57414 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57415 SmallVector<StringRef, 4> AsmPieces;
57416 SplitString(AsmStr, AsmPieces, ";\n");
57417
57418 switch (AsmPieces.size()) {
57419 default: return false;
57420 case 1:
57421 // FIXME: this should verify that we are targeting a 486 or better. If not,
57422 // we will turn this bswap into something that will be lowered to logical
57423 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57424 // lower so don't worry about this.
57425 // bswap $0
57426 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57427 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57428 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57429 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57430 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57431 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57432 // No need to check constraints, nothing other than the equivalent of
57433 // "=r,0" would be valid here.
57435 }
57436
57437 // rorw $$8, ${0:w} --> llvm.bswap.i16
57438 if (CI->getType()->isIntegerTy(16) &&
57439 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57440 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57441 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57442 AsmPieces.clear();
57443 StringRef ConstraintsStr = IA->getConstraintString();
57444 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57445 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57446 if (clobbersFlagRegisters(AsmPieces))
57447 return IntrinsicLowering::LowerToByteSwap(CI);
57448 }
57449 break;
57450 case 3:
57451 if (CI->getType()->isIntegerTy(32) &&
57452 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57453 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57454 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57455 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57456 AsmPieces.clear();
57457 StringRef ConstraintsStr = IA->getConstraintString();
57458 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57459 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57460 if (clobbersFlagRegisters(AsmPieces))
57461 return IntrinsicLowering::LowerToByteSwap(CI);
57462 }
57463
57464 if (CI->getType()->isIntegerTy(64)) {
57465 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57466 if (Constraints.size() >= 2 &&
57467 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57468 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57469 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57470 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57471 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57472 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57473 return IntrinsicLowering::LowerToByteSwap(CI);
57474 }
57475 }
57476 break;
57477 }
57478 return false;
57479}
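// Example of IR the matcher above rewrites (a sketch):
//   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
// becomes a call to llvm.bswap.i32. The rorw/rolw forms additionally
// require that the flag registers appear as clobbers, and the three-piece
// bswap %eax / bswap %edx / xchgl form implements llvm.bswap.i64 on
// 32-bit targets.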
57480
57481static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57482 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57483 .Case("{@cca}", X86::COND_A)
57484 .Case("{@ccae}", X86::COND_AE)
57485 .Case("{@ccb}", X86::COND_B)
57486 .Case("{@ccbe}", X86::COND_BE)
57487 .Case("{@ccc}", X86::COND_B)
57488 .Case("{@cce}", X86::COND_E)
57489 .Case("{@ccz}", X86::COND_E)
57490 .Case("{@ccg}", X86::COND_G)
57491 .Case("{@ccge}", X86::COND_GE)
57492 .Case("{@ccl}", X86::COND_L)
57493 .Case("{@ccle}", X86::COND_LE)
57494 .Case("{@ccna}", X86::COND_BE)
57495 .Case("{@ccnae}", X86::COND_B)
57496 .Case("{@ccnb}", X86::COND_AE)
57497 .Case("{@ccnbe}", X86::COND_A)
57498 .Case("{@ccnc}", X86::COND_AE)
57499 .Case("{@ccne}", X86::COND_NE)
57500 .Case("{@ccnz}", X86::COND_NE)
57501 .Case("{@ccng}", X86::COND_LE)
57502 .Case("{@ccnge}", X86::COND_L)
57503 .Case("{@ccnl}", X86::COND_GE)
57504 .Case("{@ccnle}", X86::COND_G)
57505 .Case("{@ccno}", X86::COND_NO)
57506 .Case("{@ccnp}", X86::COND_NP)
57507 .Case("{@ccns}", X86::COND_NS)
57508 .Case("{@cco}", X86::COND_O)
57509 .Case("{@ccp}", X86::COND_P)
57510 .Case("{@ccs}", X86::COND_S)
57511 .Default(X86::COND_INVALID);
57512 return Cond;
57513}
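// These strings come from GCC-style flag-output constraints, e.g. (a sketch)
//   asm("cmp %2, %1" : "=@ccz"(is_equal) : "r"(a), "r"(b));
// reaches this function as the constraint "{@ccz}" and maps to X86::COND_E,
// so the output is materialized with a SETcc from EFLAGS (see
// LowerAsmOutputForConstraint below) rather than through a register class.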
57514
57515/// Given a constraint letter, return the type of constraint for this target.
57516X86TargetLowering::ConstraintType
57517X86TargetLowering::getConstraintType(StringRef Constraint) const {
57518 if (Constraint.size() == 1) {
57519 switch (Constraint[0]) {
57520 case 'R':
57521 case 'q':
57522 case 'Q':
57523 case 'f':
57524 case 't':
57525 case 'u':
57526 case 'y':
57527 case 'x':
57528 case 'v':
57529 case 'l':
57530 case 'k': // AVX512 masking registers.
57531 return C_RegisterClass;
57532 case 'a':
57533 case 'b':
57534 case 'c':
57535 case 'd':
57536 case 'S':
57537 case 'D':
57538 case 'A':
57539 return C_Register;
57540 case 'I':
57541 case 'J':
57542 case 'K':
57543 case 'N':
57544 case 'G':
57545 case 'L':
57546 case 'M':
57547 return C_Immediate;
57548 case 'C':
57549 case 'e':
57550 case 'Z':
57551 return C_Other;
57552 default:
57553 break;
57554 }
57555 }
57556 else if (Constraint.size() == 2) {
57557 switch (Constraint[0]) {
57558 default:
57559 break;
57560 case 'W':
57561 if (Constraint[1] != 's')
57562 break;
57563 return C_Other;
57564 case 'Y':
57565 switch (Constraint[1]) {
57566 default:
57567 break;
57568 case 'z':
57569 return C_Register;
57570 case 'i':
57571 case 'm':
57572 case 'k':
57573 case 't':
57574 case '2':
57575 return C_RegisterClass;
57576 }
57577 }
57578 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57579 return C_Other;
57580 return TargetLowering::getConstraintType(Constraint);
57581}
57582
57583/// Examine constraint type and operand type and determine a weight value.
57584/// This object must already have been set up with the operand type
57585/// and the current alternative constraint selected.
57586TargetLowering::ConstraintWeight
57587X86TargetLowering::getSingleConstraintMatchWeight(
57588 AsmOperandInfo &Info, const char *Constraint) const {
57589 ConstraintWeight Wt = CW_Invalid;
57590 Value *CallOperandVal = Info.CallOperandVal;
57591 // If we don't have a value, we can't do a match,
57592 // but allow it at the lowest weight.
57593 if (!CallOperandVal)
57594 return CW_Default;
57595 Type *Ty = CallOperandVal->getType();
57596 // Look at the constraint type.
57597 switch (*Constraint) {
57598 default:
57599 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
57600 [[fallthrough]];
57601 case 'R':
57602 case 'q':
57603 case 'Q':
57604 case 'a':
57605 case 'b':
57606 case 'c':
57607 case 'd':
57608 case 'S':
57609 case 'D':
57610 case 'A':
57611 if (CallOperandVal->getType()->isIntegerTy())
57612 Wt = CW_SpecificReg;
57613 break;
57614 case 'f':
57615 case 't':
57616 case 'u':
57617 if (Ty->isFloatingPointTy())
57618 Wt = CW_SpecificReg;
57619 break;
57620 case 'y':
57621 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57622 Wt = CW_SpecificReg;
57623 break;
57624 case 'Y':
57625 if (StringRef(Constraint).size() != 2)
57626 break;
57627 switch (Constraint[1]) {
57628 default:
57629 return CW_Invalid;
57630 // XMM0
57631 case 'z':
57632 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57633 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
57634 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
57635 return CW_SpecificReg;
57636 return CW_Invalid;
57637 // Conditional OpMask regs (AVX512)
57638 case 'k':
57639 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57640 return CW_Register;
57641 return CW_Invalid;
57642 // Any MMX reg
57643 case 'm':
57644 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57645 return Wt;
57646 return CW_Invalid;
57647 // Any SSE reg when ISA >= SSE2, same as 'x'
57648 case 'i':
57649 case 't':
57650 case '2':
57651 if (!Subtarget.hasSSE2())
57652 return CW_Invalid;
57653 break;
57654 }
57655 break;
57656 case 'v':
57657 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
57658 Wt = CW_Register;
57659 [[fallthrough]];
57660 case 'x':
57661 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57662 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
57663 Wt = CW_Register;
57664 break;
57665 case 'k':
57666 // Enable conditional vector operations using %k<#> registers.
57667 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57668 Wt = CW_Register;
57669 break;
57670 case 'I':
57671 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
57672 if (C->getZExtValue() <= 31)
57673 Wt = CW_Constant;
57674 break;
57675 case 'J':
57676 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57677 if (C->getZExtValue() <= 63)
57678 Wt = CW_Constant;
57679 break;
57680 case 'K':
57681 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57682 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
57683 Wt = CW_Constant;
57684 break;
57685 case 'L':
57686 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57687 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
57688 Wt = CW_Constant;
57689 break;
57690 case 'M':
57691 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57692 if (C->getZExtValue() <= 3)
57693 Wt = CW_Constant;
57694 break;
57695 case 'N':
57696 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57697 if (C->getZExtValue() <= 0xff)
57698 Wt = CW_Constant;
57699 break;
57700 case 'G':
57701 case 'C':
57702 if (isa<ConstantFP>(CallOperandVal))
57703 Wt = CW_Constant;
57704 break;
57705 case 'e':
57706 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57707 if ((C->getSExtValue() >= -0x80000000LL) &&
57708 (C->getSExtValue() <= 0x7fffffffLL))
57709 Wt = CW_Constant;
57710 break;
57711 case 'Z':
57712 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57713 if (C->getZExtValue() <= 0xffffffff)
57714 Wt = CW_Constant;
57715 break;
57716 }
57717 return Wt;
57718}
57719
57720/// Try to replace an X constraint, which matches anything, with another that
57721/// has more specific requirements based on the type of the corresponding
57722/// operand.
57723const char *X86TargetLowering::
57724LowerXConstraint(EVT ConstraintVT) const {
57725 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57726 // 'f' like normal targets.
57727 if (ConstraintVT.isFloatingPoint()) {
57728 if (Subtarget.hasSSE1())
57729 return "x";
57730 }
57731
57732 return TargetLowering::LowerXConstraint(ConstraintVT);
57733}
57734
57735// Lower @cc targets via setcc.
57736SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57737 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
57738 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57739 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57740 if (Cond == X86::COND_INVALID)
57741 return SDValue();
57742 // Check that return type is valid.
57743 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57744 OpInfo.ConstraintVT.getSizeInBits() < 8)
57745 report_fatal_error("Glue output operand is of invalid type");
57746
57747 // Get EFLAGS register. Only update chain when copyfrom is glued.
57748 if (Glue.getNode()) {
57749 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
57750 Chain = Glue.getValue(1);
57751 } else
57752 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57753 // Extract CC code.
57754 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
57755 // Extend to 32-bits
57756 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57757
57758 return Result;
57759}
57760
57761/// Lower the specified operand into the Ops vector.
57762/// If it is invalid, don't add anything to Ops.
57763void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57764                                                     StringRef Constraint,
57765 std::vector<SDValue> &Ops,
57766 SelectionDAG &DAG) const {
57767 SDValue Result;
57768 char ConstraintLetter = Constraint[0];
57769 switch (ConstraintLetter) {
57770 default: break;
57771 case 'I':
57772 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57773 if (C->getZExtValue() <= 31) {
57774 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57775 Op.getValueType());
57776 break;
57777 }
57778 }
57779 return;
57780 case 'J':
57781 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57782 if (C->getZExtValue() <= 63) {
57783 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57784 Op.getValueType());
57785 break;
57786 }
57787 }
57788 return;
57789 case 'K':
57790 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57791 if (isInt<8>(C->getSExtValue())) {
57792 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57793 Op.getValueType());
57794 break;
57795 }
57796 }
57797 return;
57798 case 'L':
57799 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57800 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57801 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57802 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57803 Op.getValueType());
57804 break;
57805 }
57806 }
57807 return;
57808 case 'M':
57809 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57810 if (C->getZExtValue() <= 3) {
57811 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57812 Op.getValueType());
57813 break;
57814 }
57815 }
57816 return;
57817 case 'N':
57818 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57819 if (C->getZExtValue() <= 255) {
57820 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57821 Op.getValueType());
57822 break;
57823 }
57824 }
57825 return;
57826 case 'O':
57827 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57828 if (C->getZExtValue() <= 127) {
57829 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57830 Op.getValueType());
57831 break;
57832 }
57833 }
57834 return;
57835 case 'e': {
57836 // 32-bit signed value
57837 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57838 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57839 C->getSExtValue())) {
57840 // Widen to 64 bits here to get it sign extended.
57841 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57842 break;
57843 }
57844 // FIXME gcc accepts some relocatable values here too, but only in certain
57845 // memory models; it's complicated.
57846 }
57847 return;
57848 }
57849 case 'W': {
57850 assert(Constraint[1] == 's');
57851 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
57852 // offset.
57853 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
57854 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
57855 BA->getValueType(0)));
57856 } else {
57857 int64_t Offset = 0;
57858 if (Op->getOpcode() == ISD::ADD &&
57859 isa<ConstantSDNode>(Op->getOperand(1))) {
57860 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
57861 Op = Op->getOperand(0);
57862 }
57863 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57864 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
57865 GA->getValueType(0), Offset));
57866 }
57867 return;
57868 }
57869 case 'Z': {
57870 // 32-bit unsigned value
57871 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57872 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57873 C->getZExtValue())) {
57874 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57875 Op.getValueType());
57876 break;
57877 }
57878 }
57879 // FIXME gcc accepts some relocatable values here too, but only in certain
57880 // memory models; it's complicated.
57881 return;
57882 }
57883 case 'i': {
57884 // Literal immediates are always ok.
57885 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57886 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57887 BooleanContent BCont = getBooleanContents(MVT::i64);
57888 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57889 : ISD::SIGN_EXTEND;
57890 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57891 : CST->getSExtValue();
57892 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57893 break;
57894 }
57895
57896 // In any sort of PIC mode addresses need to be computed at runtime by
57897 // adding in a register or some sort of table lookup. These can't
57898 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57899 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57900 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57901 return;
57902
57903 // If we are in non-pic codegen mode, we allow the address of a global (with
57904 // an optional displacement) to be used with 'i'.
57905 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57906 // If we require an extra load to get this address, as in PIC mode, we
57907 // can't accept it.
57908 if (isGlobalStubReference(
57909 Subtarget.classifyGlobalReference(GA->getGlobal())))
57910 return;
57911 break;
57912 }
57913 }
57914
57915 if (Result.getNode()) {
57916 Ops.push_back(Result);
57917 return;
57918 }
57919 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57920}
57921
57922/// Check if \p RC is a general purpose register class.
57923/// I.e., GR* or one of their variant.
57924static bool isGRClass(const TargetRegisterClass &RC) {
57925 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57926 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57927 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57928 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57929 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57930}
57931
57932/// Check if \p RC is a vector register class.
57933/// I.e., FR* / VR* or one of their variant.
57934static bool isFRClass(const TargetRegisterClass &RC) {
57935 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57936 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57937 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57938 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57939 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57940 RC.hasSuperClassEq(&X86::VR512RegClass);
57941}
57942
57943/// Check if \p RC is a mask register class.
57944/// I.e., VK* or one of their variant.
57945static bool isVKClass(const TargetRegisterClass &RC) {
57946 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57947 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57948 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57949 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57950 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57951 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57952 RC.hasSuperClassEq(&X86::VK64RegClass);
57953}
57954
57955std::pair<unsigned, const TargetRegisterClass *>
57956X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57957                                                StringRef Constraint,
57958 MVT VT) const {
57959 // First, see if this is a constraint that directly corresponds to an LLVM
57960 // register class.
57961 if (Constraint.size() == 1) {
57962 // GCC Constraint Letters
57963 switch (Constraint[0]) {
57964 default: break;
57965 // 'A' means [ER]AX + [ER]DX.
57966 case 'A':
57967 if (Subtarget.is64Bit())
57968 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57969 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57970 "Expecting 64, 32 or 16 bit subtarget");
57971 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57972
57973 // TODO: Slight differences here in allocation order and leaving
57974 // RIP in the class. Do they matter any more here than they do
57975 // in the normal allocation?
57976 case 'k':
57977 if (Subtarget.hasAVX512()) {
57978 if (VT == MVT::v1i1 || VT == MVT::i1)
57979 return std::make_pair(0U, &X86::VK1RegClass);
57980 if (VT == MVT::v8i1 || VT == MVT::i8)
57981 return std::make_pair(0U, &X86::VK8RegClass);
57982 if (VT == MVT::v16i1 || VT == MVT::i16)
57983 return std::make_pair(0U, &X86::VK16RegClass);
57984 }
57985 if (Subtarget.hasBWI()) {
57986 if (VT == MVT::v32i1 || VT == MVT::i32)
57987 return std::make_pair(0U, &X86::VK32RegClass);
57988 if (VT == MVT::v64i1 || VT == MVT::i64)
57989 return std::make_pair(0U, &X86::VK64RegClass);
57990 }
57991 break;
57992 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57993 if (Subtarget.is64Bit()) {
57994 if (VT == MVT::i8 || VT == MVT::i1)
57995 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57996 if (VT == MVT::i16)
57997 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57998 if (VT == MVT::i32 || VT == MVT::f32)
57999 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
58000 if (VT != MVT::f80 && !VT.isVector())
58001 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
58002 break;
58003 }
58004 [[fallthrough]];
58005 // 32-bit fallthrough
58006 case 'Q': // Q_REGS
58007 if (VT == MVT::i8 || VT == MVT::i1)
58008 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58009 if (VT == MVT::i16)
58010 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58011 if (VT == MVT::i32 || VT == MVT::f32 ||
58012 (!VT.isVector() && !Subtarget.is64Bit()))
58013 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58014 if (VT != MVT::f80 && !VT.isVector())
58015 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58016 break;
58017 case 'r': // GENERAL_REGS
58018 case 'l': // INDEX_REGS
58019 if (VT == MVT::i8 || VT == MVT::i1)
58020 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
58021 if (VT == MVT::i16)
58022 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
58023 if (VT == MVT::i32 || VT == MVT::f32 ||
58024 (!VT.isVector() && !Subtarget.is64Bit()))
58025 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
58026 if (VT != MVT::f80 && !VT.isVector())
58027 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
58028 break;
58029 case 'R': // LEGACY_REGS
58030 if (VT == MVT::i8 || VT == MVT::i1)
58031 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58032 if (VT == MVT::i16)
58033 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58034 if (VT == MVT::i32 || VT == MVT::f32 ||
58035 (!VT.isVector() && !Subtarget.is64Bit()))
58036 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58037 if (VT != MVT::f80 && !VT.isVector())
58038 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58039 break;
58040 case 'f': // FP Stack registers.
58041 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58042 // value to the correct fpstack register class.
58043 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58044 return std::make_pair(0U, &X86::RFP32RegClass);
58045 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58046 return std::make_pair(0U, &X86::RFP64RegClass);
58047 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58048 return std::make_pair(0U, &X86::RFP80RegClass);
58049 break;
58050 case 'y': // MMX_REGS if MMX allowed.
58051 if (!Subtarget.hasMMX()) break;
58052 return std::make_pair(0U, &X86::VR64RegClass);
58053 case 'v':
58054 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58055 if (!Subtarget.hasSSE1()) break;
58056 bool VConstraint = (Constraint[0] == 'v');
58057
58058 switch (VT.SimpleTy) {
58059 default: break;
58060 // Scalar SSE types.
58061 case MVT::f16:
58062 if (VConstraint && Subtarget.hasFP16())
58063 return std::make_pair(0U, &X86::FR16XRegClass);
58064 break;
58065 case MVT::f32:
58066 case MVT::i32:
58067 if (VConstraint && Subtarget.hasVLX())
58068 return std::make_pair(0U, &X86::FR32XRegClass);
58069 return std::make_pair(0U, &X86::FR32RegClass);
58070 case MVT::f64:
58071 case MVT::i64:
58072 if (VConstraint && Subtarget.hasVLX())
58073 return std::make_pair(0U, &X86::FR64XRegClass);
58074 return std::make_pair(0U, &X86::FR64RegClass);
58075 case MVT::i128:
58076 if (Subtarget.is64Bit()) {
58077 if (VConstraint && Subtarget.hasVLX())
58078 return std::make_pair(0U, &X86::VR128XRegClass);
58079 return std::make_pair(0U, &X86::VR128RegClass);
58080 }
58081 break;
58082 // Vector types and fp128.
58083 case MVT::v8f16:
58084 if (!Subtarget.hasFP16())
58085 break;
58086 if (VConstraint)
58087 return std::make_pair(0U, &X86::VR128XRegClass);
58088 return std::make_pair(0U, &X86::VR128RegClass);
58089 case MVT::v8bf16:
58090 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58091 break;
58092 if (VConstraint)
58093 return std::make_pair(0U, &X86::VR128XRegClass);
58094 return std::make_pair(0U, &X86::VR128RegClass);
58095 case MVT::f128:
58096 case MVT::v16i8:
58097 case MVT::v8i16:
58098 case MVT::v4i32:
58099 case MVT::v2i64:
58100 case MVT::v4f32:
58101 case MVT::v2f64:
58102 if (VConstraint && Subtarget.hasVLX())
58103 return std::make_pair(0U, &X86::VR128XRegClass);
58104 return std::make_pair(0U, &X86::VR128RegClass);
58105 // AVX types.
58106 case MVT::v16f16:
58107 if (!Subtarget.hasFP16())
58108 break;
58109 if (VConstraint)
58110 return std::make_pair(0U, &X86::VR256XRegClass);
58111 return std::make_pair(0U, &X86::VR256RegClass);
58112 case MVT::v16bf16:
58113 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58114 break;
58115 if (VConstraint)
58116 return std::make_pair(0U, &X86::VR256XRegClass);
58117 return std::make_pair(0U, &X86::VR256RegClass);
58118 case MVT::v32i8:
58119 case MVT::v16i16:
58120 case MVT::v8i32:
58121 case MVT::v4i64:
58122 case MVT::v8f32:
58123 case MVT::v4f64:
58124 if (VConstraint && Subtarget.hasVLX())
58125 return std::make_pair(0U, &X86::VR256XRegClass);
58126 if (Subtarget.hasAVX())
58127 return std::make_pair(0U, &X86::VR256RegClass);
58128 break;
58129 case MVT::v32f16:
58130 if (!Subtarget.hasFP16())
58131 break;
58132 if (VConstraint)
58133 return std::make_pair(0U, &X86::VR512RegClass);
58134 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58135 case MVT::v32bf16:
58136 if (!Subtarget.hasBF16())
58137 break;
58138 if (VConstraint)
58139 return std::make_pair(0U, &X86::VR512RegClass);
58140 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58141 case MVT::v64i8:
58142 case MVT::v32i16:
58143 case MVT::v8f64:
58144 case MVT::v16f32:
58145 case MVT::v16i32:
58146 case MVT::v8i64:
58147 if (!Subtarget.hasAVX512()) break;
58148 if (VConstraint)
58149 return std::make_pair(0U, &X86::VR512RegClass);
58150 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58151 }
58152 break;
58153 }
58154 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58155 switch (Constraint[1]) {
58156 default:
58157 break;
58158 case 'i':
58159 case 't':
58160 case '2':
58161 return getRegForInlineAsmConstraint(TRI, "x", VT);
58162 case 'm':
58163 if (!Subtarget.hasMMX()) break;
58164 return std::make_pair(0U, &X86::VR64RegClass);
58165 case 'z':
58166 if (!Subtarget.hasSSE1()) break;
58167 switch (VT.SimpleTy) {
58168 default: break;
58169 // Scalar SSE types.
58170 case MVT::f16:
58171 if (!Subtarget.hasFP16())
58172 break;
58173 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58174 case MVT::f32:
58175 case MVT::i32:
58176 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58177 case MVT::f64:
58178 case MVT::i64:
58179 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58180 case MVT::v8f16:
58181 if (!Subtarget.hasFP16())
58182 break;
58183 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58184 case MVT::v8bf16:
58185 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58186 break;
58187 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58188 case MVT::f128:
58189 case MVT::v16i8:
58190 case MVT::v8i16:
58191 case MVT::v4i32:
58192 case MVT::v2i64:
58193 case MVT::v4f32:
58194 case MVT::v2f64:
58195 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58196 // AVX types.
58197 case MVT::v16f16:
58198 if (!Subtarget.hasFP16())
58199 break;
58200 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58201 case MVT::v16bf16:
58202 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58203 break;
58204 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58205 case MVT::v32i8:
58206 case MVT::v16i16:
58207 case MVT::v8i32:
58208 case MVT::v4i64:
58209 case MVT::v8f32:
58210 case MVT::v4f64:
58211 if (Subtarget.hasAVX())
58212 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58213 break;
58214 case MVT::v32f16:
58215 if (!Subtarget.hasFP16())
58216 break;
58217 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58218 case MVT::v32bf16:
58219 if (!Subtarget.hasBF16())
58220 break;
58221 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58222 case MVT::v64i8:
58223 case MVT::v32i16:
58224 case MVT::v8f64:
58225 case MVT::v16f32:
58226 case MVT::v16i32:
58227 case MVT::v8i64:
58228 if (Subtarget.hasAVX512())
58229 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58230 break;
58231 }
58232 break;
58233 case 'k':
58234 // This register class doesn't allocate k0 for masked vector operations.
58235 if (Subtarget.hasAVX512()) {
58236 if (VT == MVT::v1i1 || VT == MVT::i1)
58237 return std::make_pair(0U, &X86::VK1WMRegClass);
58238 if (VT == MVT::v8i1 || VT == MVT::i8)
58239 return std::make_pair(0U, &X86::VK8WMRegClass);
58240 if (VT == MVT::v16i1 || VT == MVT::i16)
58241 return std::make_pair(0U, &X86::VK16WMRegClass);
58242 }
58243 if (Subtarget.hasBWI()) {
58244 if (VT == MVT::v32i1 || VT == MVT::i32)
58245 return std::make_pair(0U, &X86::VK32WMRegClass);
58246 if (VT == MVT::v64i1 || VT == MVT::i64)
58247 return std::make_pair(0U, &X86::VK64WMRegClass);
58248 }
58249 break;
58250 }
58251 }
58252
58253 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58254 return std::make_pair(0U, &X86::GR32RegClass);
58255
58256 // Use the default implementation in TargetLowering to convert the register
58257 // constraint into a member of a register class.
58258 std::pair<Register, const TargetRegisterClass*> Res;
58259 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58260
58261 // Not found as a standard register?
58262 if (!Res.second) {
58263 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58264 // to/from f80.
58265 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58266 // Map st(0) -> st(7) -> ST0
58267 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58268 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58269 Constraint[3] == '(' &&
58270 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58271 Constraint[5] == ')' && Constraint[6] == '}') {
58272 // st(7) is not allocatable and thus not a member of RFP80. Return
58273 // singleton class in cases where we have a reference to it.
58274 if (Constraint[4] == '7')
58275 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58276 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58277 &X86::RFP80RegClass);
58278 }
58279
58280 // GCC allows "st(0)" to be called just plain "st".
58281 if (StringRef("{st}").equals_insensitive(Constraint))
58282 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58283 }
58284
58285 // flags -> EFLAGS
58286 if (StringRef("{flags}").equals_insensitive(Constraint))
58287 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58288
58289 // dirflag -> DF
58290 // Only allow for clobber.
58291 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58292 VT == MVT::Other)
58293 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58294
58295 // fpsr -> FPSW
58296 // Only allow for clobber.
58297 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
58298 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58299
58300 return Res;
58301 }
58302
58303 // Make sure it isn't a register that requires 64-bit mode.
58304 if (!Subtarget.is64Bit() &&
58305 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58306 TRI->getEncodingValue(Res.first) >= 8) {
58307 // Register requires REX prefix, but we're in 32-bit mode.
58308 return std::make_pair(0, nullptr);
58309 }
58310
58311 // Make sure it isn't a register that requires AVX512.
58312 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58313 TRI->getEncodingValue(Res.first) & 0x10) {
58314 // Register requires EVEX prefix.
58315 return std::make_pair(0, nullptr);
58316 }
58317
58318 // Otherwise, check to see if this is a register class of the wrong value
58319 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58320 // turn into {ax},{dx}.
58321 // MVT::Other is used to specify clobber names.
58322 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58323 return Res; // Correct type already, nothing to do.
58324
58325 // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
58326 // return "eax". This should even work for things like getting 64-bit integer
58327 // registers when given an f64 type.
58328 const TargetRegisterClass *Class = Res.second;
58329 // The generic code will match the first register class that contains the
58330 // given register. Thus, based on the ordering of the tablegened file,
58331 // the "plain" GR classes might not come first.
58332 // Therefore, use a helper method.
58333 if (isGRClass(*Class)) {
58334 unsigned Size = VT.getSizeInBits();
58335 if (Size == 1) Size = 8;
58336 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58337 return std::make_pair(0, nullptr);
58338 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58339 if (DestReg.isValid()) {
58340 bool is64Bit = Subtarget.is64Bit();
58341 const TargetRegisterClass *RC =
58342 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58343 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58344 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58345 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58346 if (Size == 64 && !is64Bit) {
58347 // Model GCC's behavior here and select a fixed pair of 32-bit
58348 // registers.
58349 switch (DestReg) {
58350 case X86::RAX:
58351 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58352 case X86::RDX:
58353 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58354 case X86::RCX:
58355 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58356 case X86::RBX:
58357 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58358 case X86::RSI:
58359 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58360 case X86::RDI:
58361 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58362 case X86::RBP:
58363 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58364 default:
58365 return std::make_pair(0, nullptr);
58366 }
58367 }
58368 if (RC && RC->contains(DestReg))
58369 return std::make_pair(DestReg, RC);
58370 return Res;
58371 }
58372 // No register found/type mismatch.
58373 return std::make_pair(0, nullptr);
58374 } else if (isFRClass(*Class)) {
58375 // Handle references to XMM physical registers that got mapped into the
58376 // wrong class. This can happen with constraints like {xmm0} where the
58377 // target independent register mapper will just pick the first match it can
58378 // find, ignoring the required type.
58379
58380 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58381 if (VT == MVT::f16)
58382 Res.second = &X86::FR16XRegClass;
58383 else if (VT == MVT::f32 || VT == MVT::i32)
58384 Res.second = &X86::FR32XRegClass;
58385 else if (VT == MVT::f64 || VT == MVT::i64)
58386 Res.second = &X86::FR64XRegClass;
58387 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58388 Res.second = &X86::VR128XRegClass;
58389 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58390 Res.second = &X86::VR256XRegClass;
58391 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58392 Res.second = &X86::VR512RegClass;
58393 else {
58394 // Type mismatch and not a clobber: return an error.
58395 Res.first = 0;
58396 Res.second = nullptr;
58397 }
58398 } else if (isVKClass(*Class)) {
58399 if (VT == MVT::v1i1 || VT == MVT::i1)
58400 Res.second = &X86::VK1RegClass;
58401 else if (VT == MVT::v8i1 || VT == MVT::i8)
58402 Res.second = &X86::VK8RegClass;
58403 else if (VT == MVT::v16i1 || VT == MVT::i16)
58404 Res.second = &X86::VK16RegClass;
58405 else if (VT == MVT::v32i1 || VT == MVT::i32)
58406 Res.second = &X86::VK32RegClass;
58407 else if (VT == MVT::v64i1 || VT == MVT::i64)
58408 Res.second = &X86::VK64RegClass;
58409 else {
58410 // Type mismatch and not a clobber: return an error.
58411 Res.first = 0;
58412 Res.second = nullptr;
58413 }
58414 }
58415
58416 return Res;
58417}
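// Examples of the fix-ups above (illustrative): "{ax}" with an i32 operand
// is rewritten to EAX in a 32-bit GR class via getX86SubSuperRegister,
// while "{xmm0}" with an f32 operand, which the generic matcher may place
// in a 128-bit vector class, is retyped to an FR32X-style class so the
// register class agrees with the value type.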
58418
58419bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58420 // Integer division on x86 is expensive. However, when aggressively optimizing
58421 // for code size, we prefer to use a div instruction, as it is usually smaller
58422 // than the alternative sequence.
58423 // The exception to this is vector division. Since x86 doesn't have vector
58424 // integer division, leaving the division as-is is a loss even in terms of
58425 // size, because it will have to be scalarized, while the alternative code
58426 // sequence can be performed in vector form.
58427 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58428 return OptSize && !VT.isVector();
58429}
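// Example of the policy above: when the caller has the minsize attribute,
// a scalar 'udiv i32 %x, 10' is kept as a real div instruction (smaller
// than the usual multiply/shift expansion), whereas a 'udiv <4 x i32>' is
// still expanded because leaving it would force scalarization.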
58430
58431void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58432 if (!Subtarget.is64Bit())
58433 return;
58434
58435 // Update IsSplitCSR in X86MachineFunctionInfo.
58436 X86MachineFunctionInfo *AFI =
58437 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58438 AFI->setIsSplitCSR(true);
58439}
58440
58441void X86TargetLowering::insertCopiesSplitCSR(
58442 MachineBasicBlock *Entry,
58443 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58444 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58445 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58446 if (!IStart)
58447 return;
58448
58449 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58450 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58451 MachineBasicBlock::iterator MBBI = Entry->begin();
58452 for (const MCPhysReg *I = IStart; *I; ++I) {
58453 const TargetRegisterClass *RC = nullptr;
58454 if (X86::GR64RegClass.contains(*I))
58455 RC = &X86::GR64RegClass;
58456 else
58457 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58458
58459 Register NewVR = MRI->createVirtualRegister(RC);
58460 // Create copy from CSR to a virtual register.
58461 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58462 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58463 // nounwind. If we want to generalize this later, we may need to emit
58464 // CFI pseudo-instructions.
58465 assert(
58466 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58467 "Function should be nounwind in insertCopiesSplitCSR!");
58468 Entry->addLiveIn(*I);
58469 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
58470 .addReg(*I);
58471
58472 // Insert the copy-back instructions right before the terminator.
58473 for (auto *Exit : Exits)
58474 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
58475 TII->get(TargetOpcode::COPY), *I)
58476 .addReg(NewVR);
58477 }
58478}
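
// Editorial note (based on the FIXME above): split-CSR copies are used for
// the CXX_FAST_TLS calling convention, e.g. a nounwind IR function
//   define cxx_fast_tlscc ptr @_ZTW1x() ...
// where callee-saved GR64 registers are moved into virtual registers at
// entry and restored before each return instead of being spilled to the
// stack.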
58479
58480bool X86TargetLowering::supportSwiftError() const {
58481 return Subtarget.is64Bit();
58482}
58483
58484MachineInstr *
58485X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
58486 MachineBasicBlock::iterator &MBBI,
58487 const TargetInstrInfo *TII) const {
58488 assert(MBBI->isCall() && MBBI->getCFIType() &&
58489 "Invalid call instruction for a KCFI check");
58490
58491 MachineFunction &MF = *MBB.getParent();
58492 // If the call target is a memory operand, unfold it and use R11 for the
58493 // call, so KCFI_CHECK won't have to recompute the address.
58494 switch (MBBI->getOpcode()) {
58495 case X86::CALL64m:
58496 case X86::CALL64m_NT:
58497 case X86::TAILJMPm64:
58498 case X86::TAILJMPm64_REX: {
58499 MachineBasicBlock::instr_iterator OrigCall = MBBI->getIterator();
58500 SmallVector<MachineInstr *, 2> NewMIs;
58501 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
58502 /*UnfoldStore=*/false, NewMIs))
58503 report_fatal_error("Failed to unfold memory operand for a KCFI check");
58504 for (auto *NewMI : NewMIs)
58505 MBBI = MBB.insert(OrigCall, NewMI);
58506 assert(MBBI->isCall() &&
58507 "Unexpected instruction after memory operand unfolding");
58508 if (OrigCall->shouldUpdateCallSiteInfo())
58509 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
58510 MBBI->setCFIType(MF, OrigCall->getCFIType());
58511 OrigCall->eraseFromParent();
58512 break;
58513 }
58514 default:
58515 break;
58516 }
58517
58518 MachineOperand &Target = MBBI->getOperand(0);
58519 Register TargetReg;
58520 switch (MBBI->getOpcode()) {
58521 case X86::CALL64r:
58522 case X86::CALL64r_NT:
58523 case X86::TAILJMPr64:
58524 case X86::TAILJMPr64_REX:
58525 assert(Target.isReg() && "Unexpected target operand for an indirect call");
58526 Target.setIsRenamable(false);
58527 TargetReg = Target.getReg();
58528 break;
58529 case X86::CALL64pcrel32:
58530 case X86::TAILJMPd64:
58531 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
58532 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
58533 // 64-bit indirect thunk calls.
58534 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
58535 "Unexpected register for an indirect thunk call");
58536 TargetReg = X86::R11;
58537 break;
58538 default:
58539 llvm_unreachable("Unexpected CFI call opcode");
58540 break;
58541 }
58542
58543 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
58544 .addReg(TargetReg)
58545 .addImm(MBBI->getCFIType())
58546 .getInstr();
58547}
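
// Editorial sketch (not from the source): a memory-operand indirect call
// such as
//   callq *8(%rdi)
// is unfolded above into
//   movq 8(%rdi), %r11
//   callq *%r11
// so the KCFI_CHECK pseudo emitted immediately before the call can verify
// the callee's type hash via %r11 instead of recomputing the memory
// address.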
58548
58549/// Returns true if stack probing through a function call is requested.
58550bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58551 return !getStackProbeSymbolName(MF).empty();
58552}
58553
58554/// Returns true if stack probing through inline assembly is requested.
58555bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58556
58557 // No inline stack probes for Windows; they have their own mechanism.
58558 if (Subtarget.isOSWindows() ||
58559 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58560 return false;
58561
58562 // If the function specifically requests inline stack probes, emit them.
58563 if (MF.getFunction().hasFnAttribute("probe-stack"))
58564 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58565 "inline-asm";
58566
58567 return false;
58568}
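
// Editorial example (assumed IR spelling of the attribute): inline stack
// probing is requested per function with a string attribute, e.g.
//   define void @f() "probe-stack"="inline-asm" { ... }
// which front ends such as clang emit when stack-clash protection is
// enabled on supported targets.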
58569
58570/// Returns the name of the symbol used to emit stack probes or the empty
58571/// string if not applicable.
58572StringRef
58573X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58574 // Inline stack probes disable the stack probe call.
58575 if (hasInlineStackProbe(MF))
58576 return "";
58577
58578 // If the function specifically requests stack probes, emit them.
58579 if (MF.getFunction().hasFnAttribute("probe-stack"))
58580 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58581
58582 // Generally, if we aren't on Windows, the platform ABI does not include
58583 // support for stack probes, so don't emit them.
58584 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58585 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58586 return "";
58587
58588 // We need a stack probe to conform to the Windows ABI. Choose the right
58589 // symbol.
58590 if (Subtarget.is64Bit())
58591 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58592 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58593}
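
// Editorial sketch of the resulting choices: an MSVC-environment x86-64
// target probes through '__chkstk', 64-bit MinGW/Cygwin through
// '___chkstk_ms', and 32-bit MinGW/Cygwin through '_alloca'; a function
// carrying "probe-stack"="__my_probe" (hypothetical symbol name) calls
// that symbol instead, while Mach-O and other non-Windows targets emit no
// probe call unless one is requested explicitly.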
58594
58595unsigned
58596X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58597 // The default stack probe size is 4096 if the function has no
58598 // "stack-probe-size" attribute.
58599 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58600 4096);
58601}
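
// Editorial example (assumed IR spelling of the attribute):
//   define void @g() "stack-probe-size"="8192" { ... }
// overrides the default 4096-byte probe interval for that one function.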
58602
58603Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58604 if (ML && ML->isInnermost() &&
58605 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58606 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58607 return TargetLowering::getPrefLoopAlignment(ML);
58608}
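
// Editorial example: running llc with
//   -x86-experimental-pref-innermost-loop-alignment=6
// aligns innermost loops to 1 << 6 = 64 bytes while leaving every other
// loop at the alignment chosen by the base TargetLowering implementation.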
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerIntVSETCC_AVX512(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
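A scalar model of the 'addsub' lane pattern such a build_vector encodes (a sketch only, assuming the addsubps convention of subtracting in even lanes and adding in odd lanes):

  #include <cassert>

  int main() {
    float A[4] = {1, 2, 3, 4};
    float B[4] = {10, 20, 30, 40};
    float R[4];
    for (int i = 0; i < 4; ++i)
      R[i] = (i % 2 == 0) ? A[i] - B[i]   // even lanes subtract
                          : A[i] + B[i];  // odd lanes add
    assert(R[0] == -9 && R[1] == 22 && R[2] == -27 && R[3] == 44);
    return 0;
  }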
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
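The brief above is truncated; assuming the usual form of this fold, where the truncated sign bit is XOR'ed with 1, the scalar identity it relies on is that the inverted sign bit is simply the "greater than -1" test. A standalone C++ check of that assumption:

  #include <cassert>
  #include <cstdint>

  int main() {
    int32_t Vals[] = {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX};
    for (int32_t X : Vals) {
      uint32_t SignBit = (uint32_t)X >> 31;   // SRL(X, size(X)-1)
      int Test = (int)(SignBit & 1u) ^ 1;     // TRUNCATE ... then XOR with 1
      assert(Test == (X > -1));               // same as a "greater than -1" test
    }
    return 0;
  }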
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the specified range (L, H].
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static bool canScaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts)
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector of zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesConstants(ArrayRef< SDValue > Ops, ArrayRef< int > Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
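A worked scalar example of the first rewrite (illustrative only; the combine itself operates on SelectionDAG nodes): when the narrow add cannot wrap, sign-extending its result equals adding the sign-extended operands.

  #include <cassert>
  #include <cstdint>

  int main() {
    int8_t X = 100;                               // X + C stays in i8, so "nsw" holds
    int8_t C = 20;
    int32_t SextOfAdd = (int32_t)(int8_t)(X + C); // sext(add_nsw(x, C))
    int32_t AddOfSext = (int32_t)X + (int32_t)C;  // add(sext(x), C_sext)
    assert(SextOfAdd == AddOfSext && SextOfAdd == 120);
    return 0;
  }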
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
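A small check of the rounding rule stated above, using std::round, which also rounds ties away from zero (unlike rint's default ties-to-even):

  #include <cassert>
  #include <cmath>

  int main() {
    assert(std::round(0.5) == 1.0);    // tie rounds up, away from zero
    assert(std::round(-0.5) == -1.0);  // tie rounds down, away from zero
    assert(std::round(2.5) == 3.0);    // not 2.0, as ties-to-even would give
    assert(std::round(1.4) == 1.0);
    return 0;
  }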
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
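The pattern text is elided here; the unsigned counterpart of the signed case is a umin against the destination type's unsigned maximum followed by a truncate. A minimal C++ sketch of that scalar identity (truncUSat is a hypothetical helper name):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // umin against the destination's unsigned max (255), then truncate.
  static uint8_t truncUSat(uint32_t X) {
    return (uint8_t)std::min<uint32_t>(X, 255u);
  }

  int main() {
    assert(truncUSat(12) == 12);
    assert(truncUSat(255) == 255);
    assert(truncUSat(300) == 255);    // saturates instead of wrapping to 44
    assert(truncUSat(70000) == 255);
    return 0;
  }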
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256-bit integer VSETCC into two new 128-bit ones and then concatenate the result back.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by unpack 256-bit.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
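A scalar sketch of the shuffle-and-binop reduction shape this combine looks for (illustrative; the real code matches vector shuffles): halve the active width each step by combining the high half into the low half.

  #include <cassert>

  int main() {
    int V[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    for (int Width = 8; Width > 1; Width /= 2)  // "shuffle" the high half down...
      for (int i = 0; i < Width / 2; ++i)
        V[i] += V[i + Width / 2];               // ...then combine lane-wise
    assert(V[0] == 36);                         // 1 + 2 + ... + 8
    return 0;
  }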
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1, ArrayRef< int > BlendMask, const APInt &DemandedElts, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
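A scalar model of what "horizontal" means here, following the haddps convention where each output lane sums a pair of adjacent input lanes (a sketch, not the matcher itself):

  #include <cassert>

  int main() {
    float A[4] = {1, 2, 3, 4};
    float B[4] = {5, 6, 7, 8};
    float R[4] = {A[0] + A[1], A[2] + A[3],   // low result half pairs up A
                  B[0] + B[1], B[2] + B[3]};  // high result half pairs up B
    assert(R[0] == 3 && R[1] == 7 && R[2] == 11 && R[3] == 15);
    return 0;
  }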
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into 2 half-sized ops and then concatenate the result back.
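A scalar sketch of why splitting is safe for lane-wise operations: applying the op to each half and concatenating matches applying it to the whole vector (negate stands in for an arbitrary lane-wise unary op):

  #include <cassert>

  static void negate(int *V, int N) {          // stand-in lane-wise unary op
    for (int i = 0; i < N; ++i)
      V[i] = -V[i];
  }

  int main() {
    int Whole[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    int Split[8] = {1, -2, 3, -4, 5, -6, 7, -8};
    negate(Whole, 8);                          // op on the full vector
    negate(Split, 4);                          // op on the low half...
    negate(Split + 4, 4);                      // ...and on the high half
    for (int i = 0; i < 8; ++i)
      assert(Whole[i] == Split[i]);            // concatenated halves match
    return 0;
  }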
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
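A standalone scalar check of this fold's underlying identity: XOR'ing a 0/1 compare result with 1 equals evaluating the inverted condition directly.

  #include <cassert>

  int main() {
    int Vals[] = {-3, 0, 2, 7};
    for (int A : Vals)
      for (int B : Vals) {
        int SetccXor1 = (A < B) ^ 1;  // xor(setcc lt, 1)
        int Inverted = (A >= B);      // setcc ge: the inverted condition
        assert(SetccXor1 == Inverted);
      }
    return 0;
  }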
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
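A scalar illustration of the OR-of-XORs equality idiom behind the expansion: two buffers are equal exactly when the OR of the XORs of their chunks is zero, so a wide memcmp-for-equality needs only one final compare (equal16 is a hypothetical helper name):

  #include <cassert>
  #include <cstdint>

  static bool equal16(const uint64_t A[2], const uint64_t B[2]) {
    return ((A[0] ^ B[0]) | (A[1] ^ B[1])) == 0;  // one compare for 16 bytes
  }

  int main() {
    uint64_t X[2] = {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull};
    uint64_t Y[2] = {0x0123456789ABCDEFull, 0xFEDCBA9876543210ull};
    uint64_t Z[2] = {0x0123456789ABCDEFull, 0xFEDCBA9876543211ull};
    assert(equal16(X, Y));
    assert(!equal16(X, Z));
    return 0;
  }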
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into 2 half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
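A scalar sketch of why the AND mask makes PACKUS behave as a truncation (packus16 models the per-lane unsigned saturation of an in-range, non-negative lane; the real instruction takes signed 16-bit inputs):

  #include <cassert>
  #include <cstdint>

  // Per-lane unsigned saturation to 8 bits for an already non-negative lane.
  static uint8_t packus16(uint16_t V) {
    return V > 255 ? (uint8_t)255 : (uint8_t)V;
  }

  int main() {
    uint16_t Lanes[] = {0x0000, 0x007F, 0x12FF, 0xFFFF};
    for (uint16_t L : Lanes) {
      uint16_t Masked = L & 0x00FF;             // in-reg zero extension
      assert(packus16(Masked) == (uint8_t)L);   // same as a plain truncation
    }
    return 0;
  }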
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128-bits from a vector > 128 bits.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using native supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y) patter...
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
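A scalar check of the same fold: AND'ing with the complement of X is exactly the and-not operation that ANDNP/ANDN provides (andn is a hypothetical stand-in for the instruction's semantics):

  #include <cassert>
  #include <cstdint>

  static uint32_t andn(uint32_t X, uint32_t Y) {  // models andnp X, Y
    return ~X & Y;
  }

  int main() {
    const uint32_t Xs[] = {0x00000000u, 0xFF00FF00u, 0xFFFFFFFFu};
    const uint32_t Ys[] = {0x00000000u, 0x12345678u, 0xFFFFFFFFu};
    for (uint32_t X : Xs)
      for (uint32_t Y : Ys)
        assert(((X ^ 0xFFFFFFFFu) & Y) == andn(X, Y)); // (and (xor X, -1), Y)
    return 0;
  }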
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating-point cmov for the specific X86 condition code? Current x86 ISA includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5191
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcasted from an all one value int.
Definition: APFloat.cpp:5216
void clearSign()
Definition: APFloat.h:1159
opStatus next(bool nextDown)
Definition: APFloat.h:1115
void changeSign()
Definition: APFloat.h:1158
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1385
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:401
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1498
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1627
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1470
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt abs() const
Get the absolute value.
Definition: APInt.h:1744
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:358
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1318
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1446
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1089
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isMinValue() const
Determine if this is the smallest unsigned value.
Definition: APInt.h:395
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:194
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1227
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1057
int32_t exactLogBase2() const
Definition: APInt.h:1732
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1375
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:812
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1596
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1585
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1555
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1489
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1412
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1572
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1395
unsigned logBase2() const
Definition: APInt.h:1710
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1297
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:449
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:383
bool isNonNegative() const
Determine if this APInt value is non-negative (>= 0).
Definition: APInt.h:312
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1345
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:851
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:836
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:829
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1613
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:377
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
An instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
Value * getPointerOperand()
Definition: Instructions.h:910
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:889
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1735
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
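A purely illustrative sketch (hypothetical helper) of inspecting the predicates listed above:

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical helper: true for signed strict-order integer compares.
static bool isSignedStrictCompare(const ICmpInst &Cmp) {
  switch (Cmp.getPredicate()) {
  case CmpInst::ICMP_SLT:
  case CmpInst::ICMP_SGT:
    return true;
  default:
    return false;
  }
}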
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2897
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1588
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
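The DenseMap operations above compose in the usual find-then-insert pattern; this sketch is illustrative only (the helper and its key/value choice are hypothetical):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

// Hypothetical helper: return the id already assigned to Key, or assign the next one.
static unsigned getOrAssignId(DenseMap<unsigned, unsigned> &Ids, unsigned Key) {
  auto It = Ids.find(Key);
  if (It != Ids.end())
    return It->second;
  unsigned Next = Ids.size();
  Ids.insert({Key, Next});
  return Next;
}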
Tagged union holding either a T or a Error.
Definition: Error.h:474
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:713
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:855
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1919
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:675
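As a small illustration (hypothetical helper), the size-related attribute queries above are usually combined when choosing size-oriented lowering:

#include "llvm/IR/Function.h"
using namespace llvm;

// Hypothetical helper: prefer smaller code when the function is marked -Os or -Oz.
static bool preferSmallCode(const Function &F) {
  return F.hasOptSize() || F.hasMinSize();
}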
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:566
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:385
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:270
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:220
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:225
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
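A hypothetical example of chaining the MVT queries listed above (the helper and the 256-bit threshold are illustrative, not actual X86 policy):

#include "llvm/CodeGen/ValueTypes.h"
using namespace llvm;

// Hypothetical helper: fixed-width integer vectors of at least 256 bits.
static bool isWideFixedIntVector(MVT VT) {
  return VT.isVector() && VT.isInteger() && !VT.isScalableVector() &&
         VT.getFixedSizeInBits() >= 256;
}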
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
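These MachineInstrBuilder helpers are normally chained off BuildMI (declared in MachineInstrBuilder.h) when emitting machine instructions; the following is a schematic sketch with hypothetical operands:

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;

// Hypothetical emission: Desc, Dst, Src and Imm stand in for a real opcode and operands.
static void emitRegRegImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          const DebugLoc &DL, const MCInstrDesc &Desc,
                          Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, Desc, Dst) // Dst is added as the destination (def) operand
      .addReg(Src)               // register use
      .addImm(Imm);              // immediate operand
}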
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of the block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
const MCContext & getContext() const
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:331
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the type of the node is undefined.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:361
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:924
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:448
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns the sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
const APInt * getValidShiftAmountConstant(SDValue V, const APInt &DemandedElts) const
If a SHL/SRA/SRL node V has a constant or splat constant shift amount that is less than the element b...
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:908
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
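To make the SelectionDAG helpers above concrete, here is a small hypothetical combine-style helper (the transform and names are illustrative, not code from this file) built from getConstant, MaskedValueIsZero and getNode:

#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical helper: compute Op + 1, but emit OR when bit 0 is known clear
// (in that case 'add Op, 1' and 'or Op, 1' produce the same value).
static SDValue getAddOneOrSetBit(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  APInt LowBit = APInt::getOneBitSet(VT.getScalarSizeInBits(), 0);
  unsigned Opc = DAG.MaskedValueIsZero(Op, LowBit) ? ISD::OR : ISD::ADD;
  return DAG.getNode(Opc, DL, VT, Op, DAG.getConstant(1, DL, VT));
}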
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
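A trivial illustration of the SmallVector interface above (hypothetical helper), building the kind of index mask VECTOR_SHUFFLE lowering works with:

#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Hypothetical helper: an identity shuffle mask [0, 1, ..., NumElts-1].
static SmallVector<int, 16> makeIdentityMask(unsigned NumElts) {
  SmallVector<int, 16> Mask;
  for (unsigned I = 0; I != NumElts; ++I)
    Mask.push_back(static_cast<int>(I));
  return Mask;
}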
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:563
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:170
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C, or npos if not found.
Definition: StringRef.cpp:251
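An illustrative-only sketch of the StringRef queries above, in the spirit of inline-asm constraint parsing (the helper is hypothetical):

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical helper: strip a surrounding {...} from a register constraint.
static StringRef stripBraces(StringRef Constraint) {
  if (Constraint.size() >= 2 && Constraint.starts_with("{") &&
      Constraint.ends_with("}"))
    return Constraint.substr(1, Constraint.size() - 2);
  return Constraint;
}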
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
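The StringSwitch Case/Default chain documented above is typically used like this (the names and values are purely illustrative):

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Hypothetical mapping from a textual name to a small integer id.
static unsigned classifyName(StringRef Name) {
  return StringSwitch<unsigned>(Name)
      .Case("eax", 0)
      .Case("ebx", 1)
      .Default(~0u);
}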
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
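Many of the TargetLoweringBase hooks above are invoked from a target's TargetLowering constructor; the following is a deliberately simplified, hypothetical sketch of that pattern (the opcode/type/action choices are illustrative and do not reflect the real X86 configuration):

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
// Hypothetical subclass used only to show the configuration calls.
struct ExampleTLI : TargetLowering {
  explicit ExampleTLI(const TargetMachine &TM) : TargetLowering(TM) {
    setBooleanContents(ZeroOrOneBooleanContent);
    setOperationAction(ISD::MULHS, MVT::i64, Expand);  // expand signed mul-high
    setOperationAction(ISD::SHL, MVT::v4i32, Custom);  // custom-lower vector shl
    setMaxAtomicSizeInBitsSupported(64);               // widest native atomic op
  }
};
} // end anonymous namespace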
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions. Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
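A hedged sketch of calling SimplifyDemandedBits from target combine code when only the low byte of a scalar value is observed; the 8-bit demand, the scalar-only handling, and the way TLO is obtained are assumptions for illustration.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/KnownBits.h"

static bool simplifyIfOnlyLowByteUsed(const llvm::TargetLowering &TLI,
                                      llvm::SDValue Op,
                                      llvm::TargetLowering::TargetLoweringOpt &TLO) {
  unsigned BitWidth = Op.getScalarValueSizeInBits();
  llvm::APInt DemandedBits = llvm::APInt::getLowBitsSet(BitWidth, 8);
  llvm::APInt DemandedElts(1, 1); // scalar: a single "element"; vectors would
                                  // use one bit per element instead
  llvm::KnownBits Known;
  // Returns true and records a replacement in TLO when Op can be simplified
  // under the assumption that only DemandedBits of it are ever read.
  return TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO);
}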
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
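To show how these TargetMachine TLS hooks fit together, here is a hedged sketch that merely reports which lowering family the queries select; the dispatch order and the descriptive strings are illustrative, not the code path X86 actually takes.
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"

static const char *describeTLSLowering(const llvm::TargetMachine &TM,
                                       const llvm::GlobalValue *GV) {
  if (TM.useEmulatedTLS())
    return "emulated TLS (runtime helper calls)";
  if (TM.useTLSDESC())
    return "TLS descriptors";
  switch (TM.getTLSModel(GV)) {
  case llvm::TLSModel::GeneralDynamic: return "general-dynamic";
  case llvm::TLSModel::LocalDynamic:   return "local-dynamic";
  case llvm::TLSModel::InitialExec:    return "initial-exec";
  case llvm::TLSModel::LocalExec:      return "local-exec";
  }
  llvm_unreachable("covered switch over TLSModel::Model");
}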
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:667
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:250
bool hasAnyFMA() const
Definition: X86Subtarget.h:213
bool isOSWindows() const
Definition: X86Subtarget.h:336
bool isTargetMachO() const
Definition: X86Subtarget.h:302
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:235
bool hasSSE1() const
Definition: X86Subtarget.h:200
bool hasThreeDNow() const
Definition: X86Subtarget.h:211
bool isPICStyleGOT() const
Definition: X86Subtarget.h:342
bool hasSSE42() const
Definition: X86Subtarget.h:205
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:125
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:290
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:345
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:314
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:192
bool isTargetDarwin() const
Definition: X86Subtarget.h:294
bool isTargetWin64() const
Definition: X86Subtarget.h:338
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:185
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:292
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool useAVX512Regs() const
Definition: X86Subtarget.h:267
bool hasSSE3() const
Definition: X86Subtarget.h:202
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:351
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:246
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasMMX() const
Definition: X86Subtarget.h:210
bool isTargetELF() const
Definition: X86Subtarget.h:300
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:221
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:193
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:201
bool hasSSSE3() const
Definition: X86Subtarget.h:203
bool hasInt256() const
Definition: X86Subtarget.h:209
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:343
bool isTargetCygMing() const
Definition: X86Subtarget.h:334
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:298
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:326
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:239
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:330
bool isTargetNaCl64() const
Definition: X86Subtarget.h:310
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool useBWIRegs() const
Definition: X86Subtarget.h:276
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:207
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return preferred fold type, Abs if this is a vector, AddAnd if it's an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the target's addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
self_iterator getIterator()
Definition: ilist_node.h:109
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:147
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:498
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1276
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1278
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1279
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:124
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1009
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:151
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1261
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1235
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1240
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:821
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1206
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1412
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1109
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:135
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1054
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1277
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1320
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:209
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:917
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:916
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1280
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:114
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:810
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:925
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:923
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1014
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:926
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:415
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:908
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:141
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1321
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1019
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1212
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1611
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
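A small sketch of ISD::matchUnaryPredicate with a lambda predicate, here accepting a constant scalar or a constant splat/build vector whose elements are all powers of two; the power-of-two check is just an example predicate.
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isConstPow2OrPow2Splat(llvm::SDValue Op) {
  // The hook applies the predicate to the scalar constant, or to every
  // (non-undef) element of a constant BUILD_VECTOR / SPLAT_VECTOR.
  return llvm::ISD::matchUnaryPredicate(Op, [](llvm::ConstantSDNode *C) {
    return C && C->getAPIntValue().isPowerOf2();
  });
}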
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1606
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1427
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1593
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
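A brief sketch combining the two condition-code helpers above, as a DAG combine might when it both negates a comparison and swaps its operands; whether such a rewrite is profitable is a separate, target-specific question.
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"

static llvm::ISD::CondCode invertThenSwap(llvm::ISD::CondCode CC, llvm::EVT OpVT) {
  // First form the condition for !(X op Y)...
  llvm::ISD::CondCode Inv = llvm::ISD::getSetCCInverse(CC, OpVT);
  // ...then rewrite it for swapped operands, i.e. (Y op' X).
  return llvm::ISD::getSetCCSwappedOperands(Inv);
}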
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1568
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1535
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1515
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1574
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
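A tiny sketch of the kind of guard built from the load predicates above before folding a load into another node; the one-use requirement is a common extra condition in such folds, not part of the predicate itself.
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool isSimpleFoldableLoad(llvm::SDValue V) {
  // Non-extending, unindexed load with a single user.
  return llvm::ISD::isNormalLoad(V.getNode()) && V.hasOneUse();
}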
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1469
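A hedged sketch of Intrinsic::getDeclaration paired with an IRBuilder call; llvm.ctpop is chosen only as an example of an overloaded intrinsic that needs the explicit type list.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

static llvm::Value *emitPopCount(llvm::Module &M, llvm::IRBuilder<> &B,
                                 llvm::Value *X) {
  // Get (or insert) the declaration of llvm.ctpop specialized to X's type.
  llvm::Function *Ctpop =
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::ctpop, {X->getType()});
  return B.CreateCall(Ctpop, {X});
}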
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:524
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:664
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:972
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
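A compact sketch tying several of the PatternMatch helpers above together: recognizing "(A ^ B) & sign-mask" on IR values, with commutativity handled by the m_c_* forms; the pattern itself is only an example.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool matchesSignBitOfXor(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *A, *B;
  // m_c_And / m_c_Xor accept their operands in either order; m_SignMask
  // matches a constant (or splat) with only the sign bit set.
  return match(V, m_c_And(m_c_Xor(m_Value(A), m_Value(B)), m_SignMask()));
}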
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
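A short sketch of the libcall-selection pattern these helpers share: query by operand/result type and treat UNKNOWN_LIBCALL as "no runtime routine for this combination"; getFPTOSINT stands in for any of them here.
#include "llvm/CodeGen/ValueTypes.h"
// RTLIB::Libcall and the getFPTOSINT/getFPROUND/... queries come with the
// runtime-libcalls support header (RuntimeLibcalls.h) pulled in by the
// TargetLowering headers.

static bool hasFPToSIntLibcall(llvm::EVT SrcVT, llvm::EVT DstVT) {
  llvm::RTLIB::Libcall LC = llvm::RTLIB::getFPTOSINT(SrcVT, DstVT);
  return LC != llvm::RTLIB::UNKNOWN_LIBCALL;
}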
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:209
@ FS
Definition: X86.h:206
@ PTR64
Definition: X86.h:210
@ PTR32_SPTR
Definition: X86.h:208
@ GS
Definition: X86.h:205
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:425
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:405
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:502
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:464
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:446
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:470
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:452
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:490
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:417
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:377
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:486
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:474
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:439
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:494
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:458
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:433
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:401
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and an FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of an MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
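For illustration only (not code from this file), a minimal sketch of using the inverse-condition helper, assuming the X86::CondCode values from X86BaseInfo.h and the helper declared in X86InstrInfo.h:

  // Flip an equality test, e.g. when the two branch targets are swapped.
  X86::CondCode CC = X86::COND_E;
  X86::CondCode Inverse = X86::GetOppositeBranchCondition(CC); // X86::COND_NE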
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:31
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:109
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
@ Length
Definition: DWP.cpp:456
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:127
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with...
Definition: Utils.cpp:1539
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
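A small, hedged sketch of the range helpers listed here (all_of, any_of, enumerate); the demo function name and values are illustrative only:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/Support/Debug.h"
  using namespace llvm;

  void rangeHelpersDemo() {
    SmallVector<int, 4> V = {1, 2, 3, 4};
    bool AllPositive = all_of(V, [](int X) { return X > 0; });  // true
    bool AnyEven = any_of(V, [](int X) { return X % 2 == 0; }); // true
    for (const auto &En : enumerate(V))
      dbgs() << En.index() << " -> " << En.value() << "\n";     // 0 -> 1, ...
    (void)AllPositive;
    (void)AnyEven;
  }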
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
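As a hedged usage sketch (assuming the declaration from the X86 shuffle-decode header and SmallVector.h), decoding a BLENDI immediate into a shuffle mask could look like this:

  SmallVector<int, 8> Mask;
  // Bit i of the immediate selects element i from the second source
  // (index NumElts + i); a clear bit keeps element i of the first source.
  DecodeBLENDMask(/*NumElts=*/4, /*Imm=*/0b0101, Mask);
  // Expected mask: {4, 1, 6, 3}.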
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) immed...
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the...
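These builder helpers are usually chained; a heavily hedged sketch (MF, DL, TII, FrameIdx and DestReg are assumed to exist in the surrounding code and are not taken from this file):

  // Load a 32-bit GPR from a stack slot: addFrameReference appends the five
  // memory operands (base, scale, index, displacement, segment) for FrameIdx.
  MachineInstrBuilder MIB =
      addFrameReference(BuildMI(MF, DL, TII.get(X86::MOV32rm), DestReg),
                        FrameIdx);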
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
@ SM_SentinelUndef
@ SM_SentinelZero
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1521
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
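The integer helpers above are simple enough to show inline; a sketch with illustrative values (the demo function is hypothetical):

  #include "llvm/ADT/bit.h"
  #include "llvm/Support/MathExtras.h"
  using namespace llvm;

  void mathHelpersDemo() {
    static_assert(isPowerOf2_64(64), "64 is a power of two");
    unsigned L = Log2_64(64);       // 6
    uint64_t C = PowerOf2Ceil(33);  // 64 (next power of two)
    int TZ = countr_zero(40u);      // 3, since 40 == 0b101000
    (void)L; (void)C; (void)TZ;
  }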
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset],...
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to ...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it...
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
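A hedged sketch of what narrowing by a factor of 2 produces (the declaration lives in VectorUtils.h; the values are illustrative):

  SmallVector<int, 8> Scaled;
  narrowShuffleMaskElts(/*Scale=*/2, /*Mask=*/{0, 3}, Scaled);
  // Each wide index becomes Scale consecutive narrow indices:
  // {0, 3} -> {0, 1, 6, 7}.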
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
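In DAG combines this helper is often paired with peekThroughBitcasts (listed above); a hedged sketch assuming an SDValue N from some combine, not code taken from this file:

  SDValue Src = peekThroughBitcasts(N);
  if (ConstantSDNode *C = isConstOrConstSplat(Src, /*AllowUndefs=*/true)) {
    const APInt &Splat = C->getAPIntValue();
    if (Splat.isPowerOf2()) {
      unsigned ShAmt = Splat.logBase2(); // e.g. rewrite a multiply as a shift
      (void)ShAmt;
    }
  }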
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
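A tiny sketch of commonAlignment with illustrative values:

  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // The alignment guaranteed at (16-byte aligned base) + 8 bytes is 8.
  Align Base(16);
  Align AtOffset = commonAlignment(Base, /*Offset=*/8); // Align(8)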
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for an N-bit unsigned integer.
Definition: MathExtras.h:203
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction – th...
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) nee...
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:234
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:263
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:251
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:248
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:246
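These fltSemantics accessors select the IEEE formats; a minimal conversion sketch (the value and the demo name are illustrative only):

  #include "llvm/ADT/APFloat.h"
  using namespace llvm;

  void apfloatDemo() {
    APFloat D(APFloat::IEEEdouble(), "2.5");
    bool LosesInfo = false;
    // Round the double value to single precision, ties to even.
    D.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  }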
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
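A short sketch of the EVT queries listed above; it needs only an LLVMContext, and the names and values are illustrative:

  #include "llvm/CodeGen/ValueTypes.h"
  #include "llvm/IR/LLVMContext.h"
  #include <cstdint>
  using namespace llvm;

  void evtDemo() {
    LLVMContext Ctx;
    EVT V4F32 = EVT::getVectorVT(Ctx, MVT::f32, 4);        // <4 x float>
    bool Is128 = V4F32.is128BitVector();                   // true: 4 * 32 bits
    EVT V4I32 = V4F32.changeVectorElementTypeToInteger();  // <4 x i32>
    uint64_t EltBits = V4I32.getScalarSizeInBits();        // 32
    (void)Is128; (void)EltBits;
  }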
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:494
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:182
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
bool hasConflict() const
Returns true if there is conflicting information.
Definition: KnownBits.h:47
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:285
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:89
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:168
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:234
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:221
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:192
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:95
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:777
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:532
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:57
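A compact sketch of the KnownBits queries listed above, assuming only APInt.h and KnownBits.h (names and values are illustrative):

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  void knownBitsDemo() {
    KnownBits K = KnownBits::makeConstant(APInt(32, 0x10));
    bool IsConst = K.isConstant();            // true, every bit is known
    unsigned TZ = K.countMinTrailingZeros();  // 4, since 0x10 == 1 << 4
    KnownBits Wide = K.zext(64);              // zero extension of the tracked value
    KnownBits Nothing(32);                    // no bits known at all
    KnownBits Common = K.intersectWith(Nothing); // facts that hold for both
    (void)IsConst; (void)TZ; (void)Wide; (void)Common;
  }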
This class contains a discriminated union of information about pointers in memory operands,...
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.