X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
47#include "llvm/IR/Function.h"
48#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Intrinsics.h"
54#include "llvm/MC/MCAsmInfo.h"
55#include "llvm/MC/MCContext.h"
56#include "llvm/MC/MCExpr.h"
57#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/Debug.h"
64#include <algorithm>
65#include <bitset>
66#include <cctype>
67#include <numeric>
68using namespace llvm;
69
70#define DEBUG_TYPE "x86-isel"
71
73 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
75 "Sets the preferable loop alignment for experiments (as log2 bytes) "
76 "for innermost loops only. If specified, this option overrides "
77 "alignment set by x86-experimental-pref-loop-alignment."),
79
81 "x86-br-merging-base-cost", cl::init(2),
83 "Sets the cost threshold for when multiple conditionals will be merged "
84 "into one branch versus be split in multiple branches. Merging "
85 "conditionals saves branches at the cost of additional instructions. "
86 "This value sets the instruction cost limit, below which conditionals "
87 "will be merged, and above which conditionals will be split. Set to -1 "
88 "to never merge branches."),
90
92 "x86-br-merging-likely-bias", cl::init(0),
93 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
94 "that all conditionals will be executed. For example for merging "
95 "the conditionals (a == b && c > d), if its known that a == b is "
96 "likely, then it is likely that if the conditionals are split "
97 "both sides will be executed, so it may be desirable to increase "
98 "the instruction cost threshold. Set to -1 to never merge likely "
99 "branches."),
100 cl::Hidden);
101
103 "x86-br-merging-unlikely-bias", cl::init(-1),
104 cl::desc(
105 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
106 "that all conditionals will be executed. For example for merging "
107 "the conditionals (a == b && c > d), if its known that a == b is "
108 "unlikely, then it is unlikely that if the conditionals are split "
109 "both sides will be executed, so it may be desirable to decrease "
110 "the instruction cost threshold. Set to -1 to never merge unlikely "
111 "branches."),
112 cl::Hidden);
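// Illustrative note (not a statement of the exact cost model): for a source
// condition like `if (a == b && c > d)`, merging emits a single branch fed by
// both compares (more ALU work, fewer branches), while splitting emits two
// conditional branches. The base cost above, adjusted by the likely/unlikely
// biases, sets the instruction-count threshold that decides which form the
// branch-merging combine picks.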
113
115 "mul-constant-optimization", cl::init(true),
116 cl::desc("Replace 'mul x, Const' with more effective instructions like "
117 "SHIFT, LEA, etc."),
118 cl::Hidden);
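// Illustrative note: with this enabled, constant multiplies are rewritten into
// cheaper address arithmetic and shifts where profitable, e.g. (register
// names illustrative, exact choice depends on the subtarget's LEA/IMUL cost):
//   x * 9  -> lea rax, [rdi + rdi*8]
//   x * 40 -> lea rax, [rdi + rdi*4] ; shl rax, 3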
119
120X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
121                                     const X86Subtarget &STI)
122 : TargetLowering(TM), Subtarget(STI) {
123 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
124 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
125
126 // Set up the TargetLowering object.
127
128 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132
133 // For 64-bit, since we have so many registers, use the ILP scheduler.
134 // For 32-bit, use the register pressure specific scheduling.
135 // For Atom, always use ILP scheduling.
136 if (Subtarget.isAtom())
138 else if (Subtarget.is64Bit())
140 else
142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144
145 // Bypass expensive divides and use cheaper ones.
146 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
147 if (Subtarget.hasSlowDivide32())
148 addBypassSlowDiv(32, 8);
149 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
150 addBypassSlowDiv(64, 32);
151 }
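  // Illustrative note: addBypassSlowDiv(32, 8) lets CodeGenPrepare guard a
  // 32-bit divide with a cheap runtime check so that, when both operands fit
  // in 8 bits, the much faster 8-bit DIV is used instead (roughly):
  //   if (((a | b) & 0xffffff00) == 0)
  //     q = (uint8_t)a / (uint8_t)b;   // 8-bit divide
  //   else
  //     q = a / b;                     // full 32-bit divide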
152
153 // Setup Windows compiler runtime calls.
154 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
155 static const struct {
156 const RTLIB::Libcall Op;
157 const char * const Name;
158 const CallingConv::ID CC;
159 } LibraryCalls[] = {
160 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
161 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
162 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
163 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
164 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
165 };
166
167 for (const auto &LC : LibraryCalls) {
168 setLibcallName(LC.Op, LC.Name);
169 setLibcallCallingConv(LC.Op, LC.CC);
170 }
171 }
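  // Illustrative note: with the table above, 64-bit division on 32-bit
  // Windows is not expanded inline; e.g. an i64 sdiv becomes an X86_StdCall
  // call to the CRT helper _alldiv, and an i64 urem becomes _aullrem.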
172
173 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
174 // MSVCRT doesn't have powi; fall back to pow
175 setLibcallName(RTLIB::POWI_F32, nullptr);
176 setLibcallName(RTLIB::POWI_F64, nullptr);
177 }
178
179 if (Subtarget.canUseCMPXCHG16B())
181 else if (Subtarget.canUseCMPXCHG8B())
183 else
185
186 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
187
189
190 // Set up the register classes.
191 addRegisterClass(MVT::i8, &X86::GR8RegClass);
192 addRegisterClass(MVT::i16, &X86::GR16RegClass);
193 addRegisterClass(MVT::i32, &X86::GR32RegClass);
194 if (Subtarget.is64Bit())
195 addRegisterClass(MVT::i64, &X86::GR64RegClass);
196
197 for (MVT VT : MVT::integer_valuetypes())
199
200 // We don't accept any truncstore of integer registers.
201 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
202 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
204 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
205 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
206 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
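  // Illustrative note: "Expand" here means a truncating store such as
  // (truncstore i8 (i32 x)) does not survive legalization; it is rewritten as
  // an explicit TRUNCATE followed by a normal i8 store, which then selects to
  // a plain byte MOV.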
207
208 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
209
210 // SETOEQ and SETUNE require checking two conditions.
211 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
214 }
215
216 // Integer absolute.
217 if (Subtarget.canUseCMOV()) {
218 setOperationAction(ISD::ABS , MVT::i16 , Custom);
219 setOperationAction(ISD::ABS , MVT::i32 , Custom);
220 if (Subtarget.is64Bit())
221 setOperationAction(ISD::ABS , MVT::i64 , Custom);
222 }
223
224 // Absolute difference.
225 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
226 setOperationAction(Op , MVT::i8 , Custom);
227 setOperationAction(Op , MVT::i16 , Custom);
228 setOperationAction(Op , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(Op , MVT::i64 , Custom);
231 }
232
233 // Signed saturation subtraction.
237 if (Subtarget.is64Bit())
239
240 // Funnel shifts.
241 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
242 // For slow shld targets we only lower for code size.
243 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
244
245 setOperationAction(ShiftOp , MVT::i8 , Custom);
246 setOperationAction(ShiftOp , MVT::i16 , Custom);
247 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
248 if (Subtarget.is64Bit())
249 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
250 }
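  // Illustrative note: a Legal i32/i64 funnel shift maps onto the double-shift
  // instructions, e.g. (AT&T syntax, registers illustrative):
  //   fshl i32 %a, %b, 11  ->  shldl $11, %ebx, %eax
  // On subtargets where SHLD/SHRD is slow, the Custom lowering only forms it
  // when optimizing for size and otherwise builds the shift/or sequence.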
251
252 if (!Subtarget.useSoftFloat()) {
253 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
254 // operation.
259 // We have an algorithm for SSE2, and we turn this into a 64-bit
260 // FILD or VCVTUSI2SS/SD for other targets.
263 // We have an algorithm for SSE2->double, and we turn this into a
264 // 64-bit FILD followed by conditional FADD for other targets.
267
268 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
269 // this operation.
272 // SSE has no i16 to fp conversion, only i32. We promote in the handler
273 // to allow f80 to use i16 and f64 to use i16 with sse1 only
276 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
279 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
280 // are Legal, f80 is custom lowered.
283
284 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
285 // this operation.
287 // FIXME: This doesn't generate invalid exception when it should. PR44019.
293 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
294 // are Legal, f80 is custom lowered.
297
298 // Handle FP_TO_UINT by promoting the destination to a larger signed
299 // conversion.
301 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 // FIXME: This doesn't generate invalid exception when it should. PR44019.
310
315
316 if (!Subtarget.is64Bit()) {
319 }
320 }
321
322 if (Subtarget.hasSSE2()) {
323 // Custom lowering for saturating float to int conversions.
324 // We handle promotion to larger result types manually.
325 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
328 }
329 if (Subtarget.is64Bit()) {
332 }
333 }
334
335 // Handle address space casts between mixed sized pointers.
338
339 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
340 if (!Subtarget.hasSSE2()) {
343 if (Subtarget.is64Bit()) {
345 // Without SSE, i64->f64 goes through memory.
347 }
348 } else if (!Subtarget.is64Bit())
350
351 // Scalar integer divide and remainder are lowered to use operations that
352 // produce two results, to match the available instructions. This exposes
353 // the two-result form to trivial CSE, which is able to combine x/y and x%y
354 // into a single instruction.
355 //
356 // Scalar integer multiply-high is also lowered to use two-result
357 // operations, to match the available instructions. However, plain multiply
358 // (low) operations are left as Legal, as there are single-result
359 // instructions for this in x86. Using the two-result multiply instructions
360 // when both high and low results are needed must be arranged by dagcombine.
361 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
368 }
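  // Illustrative note: because x86's DIV/IDIV produce the quotient in EAX/RAX
  // and the remainder in EDX/RDX at the same time, lowering to the two-result
  // *DIVREM nodes lets CSE turn
  //   q = x / y;  r = x % y;
  // into a single divide instruction.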
369
370 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
372 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
373 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
376 }
377 if (Subtarget.is64Bit())
382
383 setOperationAction(ISD::FREM , MVT::f32 , Expand);
384 setOperationAction(ISD::FREM , MVT::f64 , Expand);
385 setOperationAction(ISD::FREM , MVT::f80 , Expand);
386 setOperationAction(ISD::FREM , MVT::f128 , Expand);
387
388 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
394 }
395
396 // Promote the i8 variants and force them on up to i32 which has a shorter
397 // encoding.
398 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
400 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
401 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
402 // promote that too.
403 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
405
406 if (!Subtarget.hasBMI()) {
407 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
409 if (Subtarget.is64Bit()) {
410 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
412 }
413 }
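  // Illustrative note: CTTZ is Custom here because BSF leaves its destination
  // undefined when the input is zero, so the lowering has to supply the
  // defined "input == 0" result separately (e.g. via a CMOV on ZF), whereas
  // CTTZ_ZERO_UNDEF can use BSF directly.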
414
415 if (Subtarget.hasLZCNT()) {
416 // When promoting the i8 variants, force them to i32 for a shorter
417 // encoding.
418 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
420 } else {
421 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
422 if (VT == MVT::i64 && !Subtarget.is64Bit())
423 continue;
426 }
427 }
428
431 // Special handling for half-precision floating point conversions.
432 // If we don't have F16C support, then lower half float conversions
433 // into library calls.
435 Op, MVT::f32,
436 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
437 // There's never any support for operations beyond MVT::f32.
438 setOperationAction(Op, MVT::f64, Expand);
439 setOperationAction(Op, MVT::f80, Expand);
440 setOperationAction(Op, MVT::f128, Expand);
441 }
442
443 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
446 }
447
448 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
449 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
450 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
451 setTruncStoreAction(VT, MVT::f16, Expand);
452 setTruncStoreAction(VT, MVT::bf16, Expand);
453
456 }
457
461 if (Subtarget.is64Bit())
463 if (Subtarget.hasPOPCNT()) {
464 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
465 // popcntw is longer to encode than popcntl and also has a false dependency
466 // on the dest that popcntl hasn't had since Cannon Lake.
467 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
468 } else {
473 }
474
476
477 if (!Subtarget.hasMOVBE())
479
480 // X86 wants to expand cmov itself.
481 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
486 }
487 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
488 if (VT == MVT::i64 && !Subtarget.is64Bit())
489 continue;
492 }
493
494 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
497
499 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
500 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
504 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
505 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
506
507 // Darwin ABI issue.
508 for (auto VT : { MVT::i32, MVT::i64 }) {
509 if (VT == MVT::i64 && !Subtarget.is64Bit())
510 continue;
517 }
518
519 // 64-bit shl, sra, srl (iff 32-bit x86)
520 for (auto VT : { MVT::i32, MVT::i64 }) {
521 if (VT == MVT::i64 && !Subtarget.is64Bit())
522 continue;
526 }
527
528 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
530
532
533 // Expand certain atomics
534 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
542 }
543
544 if (!Subtarget.is64Bit())
546
547 if (Subtarget.canUseCMPXCHG16B())
549
550 // FIXME - use subtarget debug flags
551 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
552 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
553 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
555 }
556
559
562
563 setOperationAction(ISD::TRAP, MVT::Other, Legal);
565 if (Subtarget.isTargetPS())
567 else
569
570 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
572 setOperationAction(ISD::VAEND , MVT::Other, Expand);
573 bool Is64Bit = Subtarget.is64Bit();
574 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
575 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
576
579
581
582 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
585
587
588 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
589 setOperationAction(ISD::FABS, VT, Action);
590 setOperationAction(ISD::FNEG, VT, Action);
592 setOperationAction(ISD::FREM, VT, Action);
593 setOperationAction(ISD::FMA, VT, Action);
594 setOperationAction(ISD::FMINNUM, VT, Action);
595 setOperationAction(ISD::FMAXNUM, VT, Action);
598 setOperationAction(ISD::FSIN, VT, Action);
599 setOperationAction(ISD::FCOS, VT, Action);
600 setOperationAction(ISD::FSINCOS, VT, Action);
601 setOperationAction(ISD::FSQRT, VT, Action);
602 setOperationAction(ISD::FPOW, VT, Action);
603 setOperationAction(ISD::FLOG, VT, Action);
604 setOperationAction(ISD::FLOG2, VT, Action);
605 setOperationAction(ISD::FLOG10, VT, Action);
606 setOperationAction(ISD::FEXP, VT, Action);
607 setOperationAction(ISD::FEXP2, VT, Action);
608 setOperationAction(ISD::FEXP10, VT, Action);
609 setOperationAction(ISD::FCEIL, VT, Action);
610 setOperationAction(ISD::FFLOOR, VT, Action);
612 setOperationAction(ISD::FRINT, VT, Action);
613 setOperationAction(ISD::BR_CC, VT, Action);
614 setOperationAction(ISD::SETCC, VT, Action);
617 setOperationAction(ISD::FROUND, VT, Action);
619 setOperationAction(ISD::FTRUNC, VT, Action);
620 setOperationAction(ISD::FLDEXP, VT, Action);
621 };
622
623 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
624 // f16, f32 and f64 use SSE.
625 // Set up the FP register classes.
626 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
627 : &X86::FR16RegClass);
628 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
629 : &X86::FR32RegClass);
630 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
631 : &X86::FR64RegClass);
632
633 // Disable f32->f64 extload as we can only generate this in one instruction
634 // under optsize. So it's easier to pattern match (fpext (load)) for that
635 // case instead of needing to emit 2 instructions for extload in the
636 // non-optsize case.
637 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
638
639 for (auto VT : { MVT::f32, MVT::f64 }) {
640 // Use ANDPD to simulate FABS.
642
643 // Use XORP to simulate FNEG.
645
646 // Use ANDPD and ORPD to simulate FCOPYSIGN.
648
649 // These might be better off as horizontal vector ops.
652
653 // We don't support sin/cos/fmod
657 }
658
659 // Half type will be promoted by default.
660 setF16Action(MVT::f16, Promote);
668
698
699 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
700 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
701
702 // Lower this to MOVMSK plus an AND.
705
706 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
707 (UseX87 || Is64Bit)) {
708 // Use SSE for f32, x87 for f64.
709 // Set up the FP register classes.
710 addRegisterClass(MVT::f32, &X86::FR32RegClass);
711 if (UseX87)
712 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
713
714 // Use ANDPS to simulate FABS.
716
717 // Use XORP to simulate FNEG.
719
720 if (UseX87)
722
723 // Use ANDPS and ORPS to simulate FCOPYSIGN.
724 if (UseX87)
727
728 // We don't support sin/cos/fmod
732
733 if (UseX87) {
734 // Always expand sin/cos functions even though x87 has an instruction.
738 }
739 } else if (UseX87) {
740 // f32 and f64 in x87.
741 // Set up the FP register classes.
742 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
743 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
744
745 for (auto VT : { MVT::f32, MVT::f64 }) {
748
749 // Always expand sin/cos functions even though x87 has an instruction.
753 }
754 }
755
756 // Expand FP32 immediates into loads from the stack, save special cases.
757 if (isTypeLegal(MVT::f32)) {
758 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
759 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
760 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
761 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
762 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
763 } else // SSE immediates.
764 addLegalFPImmediate(APFloat(+0.0f)); // xorps
765 }
766 // Expand FP64 immediates into loads from the stack, save special cases.
767 if (isTypeLegal(MVT::f64)) {
768 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
769 addLegalFPImmediate(APFloat(+0.0)); // FLD0
770 addLegalFPImmediate(APFloat(+1.0)); // FLD1
771 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
772 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
773 } else // SSE immediates.
774 addLegalFPImmediate(APFloat(+0.0)); // xorpd
775 }
776 // Support fp16 0 immediate.
777 if (isTypeLegal(MVT::f16))
778 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
779
780 // Handle constrained floating-point operations of scalar.
793
794 // We don't support FMA.
797
798 // f80 always uses X87.
799 if (UseX87) {
800 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
803 {
805 addLegalFPImmediate(TmpFlt); // FLD0
806 TmpFlt.changeSign();
807 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
808
809 bool ignored;
810 APFloat TmpFlt2(+1.0);
812 &ignored);
813 addLegalFPImmediate(TmpFlt2); // FLD1
814 TmpFlt2.changeSign();
815 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
816 }
817
818 // Always expand sin/cos functions even though x87 has an instruction.
822
834
835 // Handle constrained floating-point operations of scalar.
841 if (isTypeLegal(MVT::f16)) {
844 } else {
846 }
847 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
848 // as Custom.
850 }
851
852 // f128 uses xmm registers, but most operations require libcalls.
853 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
854 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
855 : &X86::VR128RegClass);
856
857 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
858
869
873
879 // No STRICT_FSINCOS
882
885 // We need to custom handle any FP_ROUND with an f128 input, but
886 // LegalizeDAG uses the result type to know when to run a custom handler.
887 // So we have to list all legal floating point result types here.
888 if (isTypeLegal(MVT::f32)) {
891 }
892 if (isTypeLegal(MVT::f64)) {
895 }
896 if (isTypeLegal(MVT::f80)) {
899 }
900
902
903 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
904 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
905 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
906 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
907 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
908 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
909 }
910
911 // Always use a library call for pow.
912 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
913 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
914 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
915 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
916
925
926 // Some FP actions are always expanded for vector types.
927 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
928 MVT::v4f32, MVT::v8f32, MVT::v16f32,
929 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
942 }
943
944 // First set operation action for all vector types to either promote
945 // (for widening) or expand (for scalarization). Then we will selectively
946 // turn on ones that can be effectively codegen'd.
986 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
987 setTruncStoreAction(InnerVT, VT, Expand);
988
991
992 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
993 // types, we have to deal with them whether we ask for Expansion or not.
994 // Setting Expand causes its own optimisation problems though, so leave
995 // them legal.
996 if (VT.getVectorElementType() == MVT::i1)
997 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
998
999 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1000 // split/scalarized right now.
1001 if (VT.getVectorElementType() == MVT::f16 ||
1002 VT.getVectorElementType() == MVT::bf16)
1003 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1004 }
1005 }
1006
1007 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1008 // with -msoft-float, disable use of MMX as well.
1009 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1010 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1011 // No operations on x86mmx supported, everything uses intrinsics.
1012 }
1013
1014 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1015 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1016 : &X86::VR128RegClass);
1017
1020
1021 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1022 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1029
1030 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1032
1038 }
1039
1040 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1041 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1042 : &X86::VR128RegClass);
1043
1044 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1045 // registers cannot be used even for integer operations.
1046 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1047 : &X86::VR128RegClass);
1048 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1049 : &X86::VR128RegClass);
1050 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1051 : &X86::VR128RegClass);
1052 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1053 : &X86::VR128RegClass);
1054 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1055 : &X86::VR128RegClass);
1056
1057 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1060 }
1061
1062 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1063 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1068 }
1069
1070 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1071 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1072 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1073
1074 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1075 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1076 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1077 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1078 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1079 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1080 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1081 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1082 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1083 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1086
1087 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1088 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1089 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1090
1091 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1092 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1094
1095 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1096
1097 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1098 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1099 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1100 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1101 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1102 }
1103
1104 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1105 setOperationAction(ISD::ABDS, MVT::v16i8, Custom);
1106 setOperationAction(ISD::ABDU, MVT::v8i16, Custom);
1107 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1108 setOperationAction(ISD::ABDU, MVT::v4i32, Custom);
1109 setOperationAction(ISD::ABDS, MVT::v4i32, Custom);
1110
1121
1126
1127 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1131
1132 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1133 // setcc all the way to isel and prefer SETGT in some isel patterns.
1136 }
1137
1138 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1139 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1144
1145 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1151 }
1152
1153 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1157
1158 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1159 continue;
1160
1163 }
1164 setF16Action(MVT::v8f16, Expand);
1165 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1166 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1167 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1168 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1169 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1170 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1172
1173 // Custom lower v2i64 and v2f64 selects.
1180
1187
1188 // Custom legalize these to avoid over promotion or custom promotion.
1189 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1194 }
1195
1200
1203
1206
1207 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1212
1217
1218 // We want to legalize this to an f64 load rather than an i64 load on
1219 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1220 // store.
1221 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1222 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1223 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1224 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1225 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1227
1228 // Add 32-bit vector stores to help vectorization opportunities.
1229 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1231
1235 if (!Subtarget.hasAVX512())
1237
1241
1243
1260
1261 // In the customized shift lowering, the legal v4i32/v2i64 cases
1262 // in AVX2 will be recognized.
1263 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1267 if (VT == MVT::v2i64) continue;
1272 }
1273
1279 }
1280
1281 if (Subtarget.hasGFNI()) {
1286 }
1287
1288 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1289 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1290 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1291 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1292
1293 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1296 }
1297
1298 // These might be better off as horizontal vector ops.
1303 }
1304
1305 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1306 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1309 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1313 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1319
1321 }
1322
1323 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1324 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1325 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1326 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1327 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1328 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1329 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1330 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1331
1332 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1335 }
1336
1340
1341 // FIXME: Do we need to handle scalar-to-vector here?
1342 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1343 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1344
1345 // We directly match byte blends in the backend as they match the VSELECT
1346 // condition form.
1348
1349 // SSE41 brings specific instructions for doing vector sign extend even in
1350 // cases where we don't have SRA.
1351 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1354 }
1355
1356 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1357 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1358 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1359 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1360 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1361 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1362 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1363 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1364 }
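  // Illustrative note: these Legal extending loads map directly onto the
  // SSE4.1 PMOVSX/PMOVZX forms that take a memory source; e.g. a
  // sign-extending load of v4i8 into v4i32 selects to PMOVSXBD with a 32-bit
  // memory operand.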
1365
1366 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1367 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1368 // do the pre and post work in the vector domain.
1371 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1372 // so that DAG combine doesn't try to turn it into uint_to_fp.
1375 }
1376 }
1377
1378 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1380 }
1381
1382 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1383 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1384 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1387 }
1388
1389 // XOP can efficiently perform BITREVERSE with VPPERM.
1390 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1392 }
1393
1394 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1395 bool HasInt256 = Subtarget.hasInt256();
1396
1397 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1398 : &X86::VR256RegClass);
1399 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1400 : &X86::VR256RegClass);
1401 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1402 : &X86::VR256RegClass);
1403 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1404 : &X86::VR256RegClass);
1405 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1406 : &X86::VR256RegClass);
1407 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1408 : &X86::VR256RegClass);
1409 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1410 : &X86::VR256RegClass);
1411
1412 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1425
1427
1431
1434 }
1435
1436 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1437 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1438
1439 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1440 // even though v8i16 is a legal type.
1441 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1442 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1443 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1444 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1448
1455
1467
1468 if (!Subtarget.hasAVX512())
1470
1471 // In the customized shift lowering, the legal v8i32/v4i64 cases
1472 // in AVX2 will be recognized.
1473 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 if (VT == MVT::v4i64) continue;
1484 }
1485
1486 // These types need custom splitting if their input is a 128-bit vector.
1491
1495 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1496 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1499
1500 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1504 }
1505
1510
1511 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1516
1517 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1518 // setcc all the way to isel and prefer SETGT in some isel patterns.
1521 }
1522
1523 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1524 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1529
1530 if (Subtarget.hasAnyFMA()) {
1531 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1532 MVT::v2f64, MVT::v4f64 }) {
1535 }
1536 }
1537
1538 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1540 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1541 }
1542
1543 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1544 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1545 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1546 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1547
1548 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1549 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1550 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1551 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1552 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1553 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1554 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1555 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1556
1557 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1558 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1559
1560 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1561 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1562 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1563 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1564 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1565
1566 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1567 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1568 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1569 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1570 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1571 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1572 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1573 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1578
1579 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1580 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1581 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1582 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1583 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1584 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1585 }
1586
1587 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1590 }
1591
1592 if (HasInt256) {
1593 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1594 // when we have a 256bit-wide blend with immediate.
1597
1598 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1599 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1600 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1601 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1602 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1603 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1604 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1605 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1606 }
1607 }
1608
1609 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1610 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1611 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1613 }
1614
1615 // Extract subvector is special because the value type
1616 // (result) is 128-bit but the source is 256-bit wide.
1617 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1618 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1620 }
1621
1622 // Custom lower several nodes for 256-bit types.
1623 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1624 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1634 }
1635 setF16Action(MVT::v16f16, Expand);
1636 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1637 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1639 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1640 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1641 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1642 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1643
1644 if (HasInt256) {
1646
1647 // Custom legalize 2x32 to get a little better code.
1650
1651 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1652 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1654 }
1655 }
1656
1657 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1658 Subtarget.hasF16C()) {
1659 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1662 }
1663 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1666 }
1667 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1668 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1669 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1670 }
1671 }
1672
1673 // This block controls legalization of the mask vector sizes that are
1674 // available with AVX512. 512-bit vectors are in a separate block controlled
1675 // by useAVX512Regs.
1676 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1677 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1678 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1679 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1680 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1681 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1682
1686
1687 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1688 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1689 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1690 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1691 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1692 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1693 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1694 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1699
1700 // There is no byte sized k-register load or store without AVX512DQ.
1701 if (!Subtarget.hasDQI()) {
1702 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1703 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1704 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1705 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1706
1711 }
1712
1713 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1714 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1718 }
1719
1720 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1722
1723 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1727
1734 }
1735
1736 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1738 }
1739 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1740 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1743 }
1744 }
1745
1746 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1747 // elements. 512-bits can be disabled based on prefer-vector-width and
1748 // required-vector-width function attributes.
1749 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1750 bool HasBWI = Subtarget.hasBWI();
1751
1752 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1753 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1754 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1755 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1756 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1757 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1758 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1759
1760 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1761 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1762 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1763 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1764 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1765 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1766 if (HasBWI)
1767 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1768 }
1769
1770 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1778 }
1779 setOperationAction(ISD::LRINT, MVT::v16f32,
1780 Subtarget.hasDQI() ? Legal : Custom);
1781 setOperationAction(ISD::LRINT, MVT::v8f64,
1782 Subtarget.hasDQI() ? Legal : Custom);
1783 if (Subtarget.hasDQI())
1784 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1785
1786 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1791 }
1792
1793 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1798 }
1799
1806
1818
1819 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1820 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1821 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1822 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1823 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1824 if (HasBWI)
1825 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1826
1827 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1828 // to 512-bit rather than use the AVX2 instructions so that we can use
1829 // k-masks.
1830 if (!Subtarget.hasVLX()) {
1831 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1832 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1835 }
1836 }
1837
1839 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1840 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1850
1851 if (HasBWI) {
1852 // Extends from v64i1 masks to 512-bit vectors.
1856 }
1857
1858 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1871
1873 }
1874
1875 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1878 }
1879
1880 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1881 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1882 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1883 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1884
1885 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1886 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1887 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1888 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1889
1890 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1891 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1892 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1893 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1894 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1895 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1896 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1897 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1898
1899 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1900 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1901
1902 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1912
1913 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1914 // setcc all the way to isel and prefer SETGT in some isel patterns.
1917 }
1918
1919 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1920 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1925
1926 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1933 }
1934
1935 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1936 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1937 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1939 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1940 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1941 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1942 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1947 }
1948
1949 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1950 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1951 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1952 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1953 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1954 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1955
1956 if (Subtarget.hasDQI()) {
1960 setOperationAction(Opc, MVT::v8i64, Custom);
1961 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1962 }
1963
1964 if (Subtarget.hasCDI()) {
1965 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1966 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1968 }
1969 } // Subtarget.hasCDI()
1970
1971 if (Subtarget.hasVPOPCNTDQ()) {
1972 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1974 }
1975
1976 // Extract subvector is special because the value type
1977 // (result) is 256-bit but the source is 512-bit wide.
1978 // 128-bit was made Legal under AVX1.
1979 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1980 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1982
1983 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1984 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1994 }
1995 setF16Action(MVT::v32f16, Expand);
2000 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2001 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2002
2003 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2008 }
2009 if (HasBWI) {
2010 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2013 }
2014 } else {
2015 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2016 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2017 }
2018
2019 if (Subtarget.hasVBMI2()) {
2020 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2023 }
2024
2025 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2026 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2027 }
2028 }// useAVX512Regs
2029
2030 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2031 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2032 MVT::v4i64}) {
2035 }
2036 }
2037
2038 // This block controls legalization for operations that don't have
2039 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2040 // narrower widths.
2041 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2042 // These operations are handled on non-VLX by artificially widening in
2043 // isel patterns.
2044
2048
2049 if (Subtarget.hasDQI()) {
2050 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2051 // v2f32 UINT_TO_FP is already custom under SSE2.
2054 "Unexpected operation action!");
2055 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2060 }
2061
2062 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2068 }
2069
2070 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2073 }
2074
2075 // Custom legalize 2x32 to get a little better code.
2078
2079 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2080 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2082
2083 if (Subtarget.hasDQI()) {
2087 setOperationAction(Opc, MVT::v2i64, Custom);
2088 setOperationAction(Opc, MVT::v4i64, Custom);
2089 }
2090 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2091 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2092 }
2093
2094 if (Subtarget.hasCDI()) {
2095 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2097 }
2098 } // Subtarget.hasCDI()
2099
2100 if (Subtarget.hasVPOPCNTDQ()) {
2101 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2103 }
2104 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2105 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2107 }
2108
2109 // This block controls legalization of v32i1/v64i1, which are available
2110 // with AVX512BW.
2111 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2112 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2113 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2114
2115 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2126 }
2127
2128 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2130
2131 // Extends from v32i1 masks to 256-bit vectors.
2135
2136 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2137 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2138 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2139 }
2140
2141 // These operations are handled on non-VLX by artificially widening in
2142 // isel patterns.
2143 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2144
2145 if (Subtarget.hasBITALG()) {
2146 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2148 }
2149 }
2150
2151 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2152 auto setGroup = [&] (MVT VT) {
2163
2176
2178
2181
2187
2193
2197 };
2198
2199 // AVX512_FP16 scalar operations
2200 setGroup(MVT::f16);
2214
2217
2218 if (Subtarget.useAVX512Regs()) {
2219 setGroup(MVT::v32f16);
2225 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2232
2237 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2239 MVT::v32i16);
2240 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2242 MVT::v32i16);
2243 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2245 MVT::v32i16);
2246 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2248 MVT::v32i16);
2249
2253
2254 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2255 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2256 }
2257
2258 if (Subtarget.hasVLX()) {
2259 setGroup(MVT::v8f16);
2260 setGroup(MVT::v16f16);
2261
2272
2283
2284 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2287
2291
2292 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2293 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2294 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2295 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2296
2297 // Need to custom widen these to prevent scalarization.
2298 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2299 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2300 }
2301 }
2302
2303 if (!Subtarget.useSoftFloat() &&
2304 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2305 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2306 : &X86::VR128RegClass);
2307 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2308 : &X86::VR256RegClass);
2309 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2310 // have a way to soft-promote BUILD_VECTOR and INSERT_VECTOR_ELT, so mark
2311 // them Custom here and handle them during lowering instead.
2314 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2315 setF16Action(VT, Expand);
2320 }
2321 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2322 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2323 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2324 }
2326 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2327 }
2328
2329 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2330 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2331 setF16Action(MVT::v32bf16, Expand);
2332 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2333 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2335 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2339 }
2340
2341 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2342 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2343 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2344 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2345 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2346 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2347
2348 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2349 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2350 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2351 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2352 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2353
2354 if (Subtarget.hasBWI()) {
2355 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2356 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2357 }
2358
2359 if (Subtarget.hasFP16()) {
2360 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2369 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2378 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2383 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2388 }
2389 }
2390
2391 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2392 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2393 }
2394
2395 // We want to custom lower some of our intrinsics.
2399 if (!Subtarget.is64Bit()) {
2401 }
2402
2403 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2404 // handle type legalization for these operations here.
2405 //
2406 // FIXME: We really should do custom legalization for addition and
2407 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2408 // than generic legalization for 64-bit multiplication-with-overflow, though.
2409 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2410 if (VT == MVT::i64 && !Subtarget.is64Bit())
2411 continue;
2412 // Add/Sub/Mul with overflow operations are custom lowered.
2419
2420 // Support carry in as value rather than glue.
2426 }
2427
2428 if (!Subtarget.is64Bit()) {
2429 // These libcalls are not available in 32-bit.
2430 setLibcallName(RTLIB::SHL_I128, nullptr);
2431 setLibcallName(RTLIB::SRL_I128, nullptr);
2432 setLibcallName(RTLIB::SRA_I128, nullptr);
2433 setLibcallName(RTLIB::MUL_I128, nullptr);
2434 // The MULO libcall is not part of libgcc, only compiler-rt.
2435 setLibcallName(RTLIB::MULO_I64, nullptr);
2436 }
2437 // The MULO libcall is not part of libgcc, only compiler-rt.
2438 setLibcallName(RTLIB::MULO_I128, nullptr);
2439
2440 // Combine sin / cos into _sincos_stret if it is available.
2441 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2442 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2445 }
2446
2447 if (Subtarget.isTargetWin64()) {
2448 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2449 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2450 setOperationAction(ISD::SREM, MVT::i128, Custom);
2451 setOperationAction(ISD::UREM, MVT::i128, Custom);
2460 }
2461
2462 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2463 // is. We should promote the value to 64-bits to solve this.
2464 // This is what the CRT headers do - `fmodf` is an inline header
2465 // function casting to f64 and calling `fmod`.
2466 if (Subtarget.is32Bit() &&
2467 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2468 for (ISD::NodeType Op :
2478 if (isOperationExpand(Op, MVT::f32))
2479 setOperationAction(Op, MVT::f32, Promote);
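  // Illustrative note: "Promote" here reproduces what the MSVC CRT headers do
  // for the missing f32 math functions: the f32 operands are extended to f64,
  // the f64 libcall (e.g. fmod) is made, and the result is truncated back to
  // f32.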
2480
2481 // We have target-specific dag combine patterns for the following nodes:
2492 ISD::SHL,
2493 ISD::SRA,
2494 ISD::SRL,
2495 ISD::OR,
2496 ISD::AND,
2498 ISD::ADD,
2499 ISD::FADD,
2500 ISD::FSUB,
2501 ISD::FNEG,
2502 ISD::FMA,
2506 ISD::SUB,
2507 ISD::LOAD,
2508 ISD::LRINT,
2510 ISD::MLOAD,
2511 ISD::STORE,
2525 ISD::SETCC,
2526 ISD::MUL,
2527 ISD::XOR,
2535
2537
2538 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2540 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2542 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2544
2545 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2546 // that needs to be benchmarked and balanced with the potential use of vector
2547 // load/store types (PR33329, PR33914).
2550
2551 // Default loop alignment, which can be overridden by -align-loops.
2553
2554 // An out-of-order CPU can speculatively execute past a predictable branch,
2555 // but a conditional move could be stalled by an expensive earlier operation.
2556 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2557 EnableExtLdPromotion = true;
2559
2561
2562 // Default to having -disable-strictnode-mutation on
2563 IsStrictFPEnabled = true;
2564}
2565
2566// This has so far only been implemented for 64-bit MachO.
2568 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2569}
2570
2572 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2573 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2574}
2575
2577 const SDLoc &DL) const {
2578 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2579 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2580 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2581 return SDValue(Node, 0);
2582}
2583
2586 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2587 !Subtarget.hasBWI())
2588 return TypeSplitVector;
2589
2590 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2591 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2592 return TypeSplitVector;
2593
2594 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2595 VT.getVectorElementType() != MVT::i1)
2596 return TypeWidenVector;
2597
2599}
2600
2601FastISel *
2603 const TargetLibraryInfo *libInfo) const {
2604 return X86::createFastISel(funcInfo, libInfo);
2605}
2606
2607//===----------------------------------------------------------------------===//
2608// Other Lowering Hooks
2609//===----------------------------------------------------------------------===//
2610
2612 bool AssumeSingleUse) {
2613 if (!AssumeSingleUse && !Op.hasOneUse())
2614 return false;
2615 if (!ISD::isNormalLoad(Op.getNode()))
2616 return false;
2617
2618 // If this is an unaligned vector, make sure the target supports folding it.
2619 auto *Ld = cast<LoadSDNode>(Op.getNode());
2620 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2621 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2622 return false;
2623
2624 // TODO: If this is a non-temporal load and the target has an instruction
2625 // for it, it should not be folded. See "useNonTemporalLoad()".
2626
2627 return true;
2628}
2629
2631 const X86Subtarget &Subtarget,
2632 bool AssumeSingleUse) {
2633 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2634 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2635 return false;
2636
2637 // We cannot replace a wide volatile load with a broadcast-from-memory,
2638 // because that would narrow the load, which isn't legal for volatiles.
2639 auto *Ld = cast<LoadSDNode>(Op.getNode());
2640 return !Ld->isVolatile() ||
2641 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2642}
2643
2645 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
2646}
2647
2649 if (Op.hasOneUse()) {
2650 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
2651 return (ISD::ZERO_EXTEND == Opcode);
2652 }
2653 return false;
2654}
2655
2656static bool isLogicOp(unsigned Opcode) {
2657 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2658 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2659}
2660
2661static bool isTargetShuffle(unsigned Opcode) {
2662 switch(Opcode) {
2663 default: return false;
2664 case X86ISD::BLENDI:
2665 case X86ISD::PSHUFB:
2666 case X86ISD::PSHUFD:
2667 case X86ISD::PSHUFHW:
2668 case X86ISD::PSHUFLW:
2669 case X86ISD::SHUFP:
2670 case X86ISD::INSERTPS:
2671 case X86ISD::EXTRQI:
2672 case X86ISD::INSERTQI:
2673 case X86ISD::VALIGN:
2674 case X86ISD::PALIGNR:
2675 case X86ISD::VSHLDQ:
2676 case X86ISD::VSRLDQ:
2677 case X86ISD::MOVLHPS:
2678 case X86ISD::MOVHLPS:
2679 case X86ISD::MOVSHDUP:
2680 case X86ISD::MOVSLDUP:
2681 case X86ISD::MOVDDUP:
2682 case X86ISD::MOVSS:
2683 case X86ISD::MOVSD:
2684 case X86ISD::MOVSH:
2685 case X86ISD::UNPCKL:
2686 case X86ISD::UNPCKH:
2687 case X86ISD::VBROADCAST:
2688 case X86ISD::VPERMILPI:
2689 case X86ISD::VPERMILPV:
2690 case X86ISD::VPERM2X128:
2691 case X86ISD::SHUF128:
2692 case X86ISD::VPERMIL2:
2693 case X86ISD::VPERMI:
2694 case X86ISD::VPPERM:
2695 case X86ISD::VPERMV:
2696 case X86ISD::VPERMV3:
2697 case X86ISD::VZEXT_MOVL:
2698 return true;
2699 }
2700}
2701
2702static bool isTargetShuffleVariableMask(unsigned Opcode) {
2703 switch (Opcode) {
2704 default: return false;
2705 // Target Shuffles.
2706 case X86ISD::PSHUFB:
2707 case X86ISD::VPERMILPV:
2708 case X86ISD::VPERMIL2:
2709 case X86ISD::VPPERM:
2710 case X86ISD::VPERMV:
2711 case X86ISD::VPERMV3:
2712 return true;
2713 // 'Faux' Target Shuffles.
2714 case ISD::OR:
2715 case ISD::AND:
2716 case X86ISD::ANDNP:
2717 return true;
2718 }
2719}
2720
2723 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2725 int ReturnAddrIndex = FuncInfo->getRAIndex();
2726
2727 if (ReturnAddrIndex == 0) {
2728 // Set up a frame object for the return address.
2729 unsigned SlotSize = RegInfo->getSlotSize();
2730 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2731 -(int64_t)SlotSize,
2732 false);
2733 FuncInfo->setRAIndex(ReturnAddrIndex);
2734 }
2735
2736 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2737}
2738
2740 bool HasSymbolicDisplacement) {
2741 // The offset should fit into a 32-bit immediate field.
2742 if (!isInt<32>(Offset))
2743 return false;
2744
2745 // If we don't have a symbolic displacement, we don't have any extra
2746 // restrictions.
2747 if (!HasSymbolicDisplacement)
2748 return true;
2749
2750 // We can fold large offsets in the large code model because we always use
2751 // 64-bit offsets.
2752 if (CM == CodeModel::Large)
2753 return true;
2754
2755 // For the kernel code model we know that all objects reside in the negative half
2756 // of the 32-bit address space. We must not accept negative offsets, since they
2757 // could push the address out of range, but we may accept pretty large positive ones.
2758 if (CM == CodeModel::Kernel)
2759 return Offset >= 0;
2760
2761 // For other non-large code models we assume that the last small object ends at
2762 // least 16MB before the 31-bit boundary. We may also accept pretty large negative
2763 // constants, knowing that all objects are in the positive half of the address
2764 // space.
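  // For example, with the small code model and a symbolic displacement, a positive
  // offset of 8MB is considered suitable for folding, while a 32MB offset is not.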
2765 return Offset < 16 * 1024 * 1024;
2766}
2767
2768/// Return true if the condition is a signed comparison operation.
2769static bool isX86CCSigned(unsigned X86CC) {
2770 switch (X86CC) {
2771 default:
2772 llvm_unreachable("Invalid integer condition!");
2773 case X86::COND_E:
2774 case X86::COND_NE:
2775 case X86::COND_B:
2776 case X86::COND_A:
2777 case X86::COND_BE:
2778 case X86::COND_AE:
2779 return false;
2780 case X86::COND_G:
2781 case X86::COND_GE:
2782 case X86::COND_L:
2783 case X86::COND_LE:
2784 return true;
2785 }
2786}
2787
2789 switch (SetCCOpcode) {
2790 // clang-format off
2791 default: llvm_unreachable("Invalid integer condition!");
2792 case ISD::SETEQ: return X86::COND_E;
2793 case ISD::SETGT: return X86::COND_G;
2794 case ISD::SETGE: return X86::COND_GE;
2795 case ISD::SETLT: return X86::COND_L;
2796 case ISD::SETLE: return X86::COND_LE;
2797 case ISD::SETNE: return X86::COND_NE;
2798 case ISD::SETULT: return X86::COND_B;
2799 case ISD::SETUGT: return X86::COND_A;
2800 case ISD::SETULE: return X86::COND_BE;
2801 case ISD::SETUGE: return X86::COND_AE;
2802 // clang-format on
2803 }
2804}
2805
2806/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2807/// condition code, returning the condition code and the LHS/RHS of the
2808/// comparison to make.
2810 bool isFP, SDValue &LHS, SDValue &RHS,
2811 SelectionDAG &DAG) {
2812 if (!isFP) {
2813 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2814 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2815 // X > -1 -> X == 0, jump !sign.
2816 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2817 return X86::COND_NS;
2818 }
2819 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2820 // X < 0 -> X == 0, jump on sign.
2821 return X86::COND_S;
2822 }
2823 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2824 // X >= 0 -> X == 0, jump on !sign.
2825 return X86::COND_NS;
2826 }
2827 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2828 // X < 1 -> X <= 0
2829 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2830 return X86::COND_LE;
2831 }
2832 }
2833
2834 return TranslateIntegerX86CC(SetCCOpcode);
2835 }
2836
2837 // First determine if it is required or is profitable to flip the operands.
2838
2839 // If LHS is a foldable load, but RHS is not, flip the condition.
2840 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
2841 !ISD::isNON_EXTLoad(RHS.getNode())) {
2842 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
2843 std::swap(LHS, RHS);
2844 }
2845
2846 switch (SetCCOpcode) {
2847 default: break;
2848 case ISD::SETOLT:
2849 case ISD::SETOLE:
2850 case ISD::SETUGT:
2851 case ISD::SETUGE:
2852 std::swap(LHS, RHS);
2853 break;
2854 }
2855
2856 // On a floating point condition, the flags are set as follows:
2857 // ZF PF CF op
2858 // 0 | 0 | 0 | X > Y
2859 // 0 | 0 | 1 | X < Y
2860 // 1 | 0 | 0 | X == Y
2861 // 1 | 1 | 1 | unordered
2862 switch (SetCCOpcode) {
2863 // clang-format off
2864 default: llvm_unreachable("Condcode should be pre-legalized away");
2865 case ISD::SETUEQ:
2866 case ISD::SETEQ: return X86::COND_E;
2867 case ISD::SETOLT: // flipped
2868 case ISD::SETOGT:
2869 case ISD::SETGT: return X86::COND_A;
2870 case ISD::SETOLE: // flipped
2871 case ISD::SETOGE:
2872 case ISD::SETGE: return X86::COND_AE;
2873 case ISD::SETUGT: // flipped
2874 case ISD::SETULT:
2875 case ISD::SETLT: return X86::COND_B;
2876 case ISD::SETUGE: // flipped
2877 case ISD::SETULE:
2878 case ISD::SETLE: return X86::COND_BE;
2879 case ISD::SETONE:
2880 case ISD::SETNE: return X86::COND_NE;
2881 case ISD::SETUO: return X86::COND_P;
2882 case ISD::SETO: return X86::COND_NP;
2883 case ISD::SETOEQ:
2884 case ISD::SETUNE: return X86::COND_INVALID;
2885 // clang-format on
2886 }
2887}
2888
2889/// Is there a floating point cmov for the specific X86 condition code?
2890/// The current x86 ISA includes the following FP cmov instructions:
2891/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
2892static bool hasFPCMov(unsigned X86CC) {
2893 switch (X86CC) {
2894 default:
2895 return false;
2896 case X86::COND_B:
2897 case X86::COND_BE:
2898 case X86::COND_E:
2899 case X86::COND_P:
2900 case X86::COND_A:
2901 case X86::COND_AE:
2902 case X86::COND_NE:
2903 case X86::COND_NP:
2904 return true;
2905 }
2906}
2907
2908static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
2909 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
2910 VT.is512BitVector();
2911}
2912
2914 const CallInst &I,
2915 MachineFunction &MF,
2916 unsigned Intrinsic) const {
2918 Info.offset = 0;
2919
2920 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
2921 if (!IntrData) {
2922 switch (Intrinsic) {
2923 case Intrinsic::x86_aesenc128kl:
2924 case Intrinsic::x86_aesdec128kl:
2926 Info.ptrVal = I.getArgOperand(1);
2927 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2928 Info.align = Align(1);
2930 return true;
2931 case Intrinsic::x86_aesenc256kl:
2932 case Intrinsic::x86_aesdec256kl:
2934 Info.ptrVal = I.getArgOperand(1);
2935 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2936 Info.align = Align(1);
2938 return true;
2939 case Intrinsic::x86_aesencwide128kl:
2940 case Intrinsic::x86_aesdecwide128kl:
2942 Info.ptrVal = I.getArgOperand(0);
2943 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
2944 Info.align = Align(1);
2946 return true;
2947 case Intrinsic::x86_aesencwide256kl:
2948 case Intrinsic::x86_aesdecwide256kl:
2950 Info.ptrVal = I.getArgOperand(0);
2951 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
2952 Info.align = Align(1);
2954 return true;
2955 case Intrinsic::x86_cmpccxadd32:
2956 case Intrinsic::x86_cmpccxadd64:
2957 case Intrinsic::x86_atomic_bts:
2958 case Intrinsic::x86_atomic_btc:
2959 case Intrinsic::x86_atomic_btr: {
2961 Info.ptrVal = I.getArgOperand(0);
2962 unsigned Size = I.getType()->getScalarSizeInBits();
2963 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2964 Info.align = Align(Size);
2967 return true;
2968 }
2969 case Intrinsic::x86_atomic_bts_rm:
2970 case Intrinsic::x86_atomic_btc_rm:
2971 case Intrinsic::x86_atomic_btr_rm: {
2973 Info.ptrVal = I.getArgOperand(0);
2974 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2975 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2976 Info.align = Align(Size);
2979 return true;
2980 }
2981 case Intrinsic::x86_aadd32:
2982 case Intrinsic::x86_aadd64:
2983 case Intrinsic::x86_aand32:
2984 case Intrinsic::x86_aand64:
2985 case Intrinsic::x86_aor32:
2986 case Intrinsic::x86_aor64:
2987 case Intrinsic::x86_axor32:
2988 case Intrinsic::x86_axor64:
2989 case Intrinsic::x86_atomic_add_cc:
2990 case Intrinsic::x86_atomic_sub_cc:
2991 case Intrinsic::x86_atomic_or_cc:
2992 case Intrinsic::x86_atomic_and_cc:
2993 case Intrinsic::x86_atomic_xor_cc: {
2995 Info.ptrVal = I.getArgOperand(0);
2996 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
2997 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
2998 Info.align = Align(Size);
3001 return true;
3002 }
3003 }
3004 return false;
3005 }
3006
3007 switch (IntrData->Type) {
3010 case TRUNCATE_TO_MEM_VI32: {
3012 Info.ptrVal = I.getArgOperand(0);
3013 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3015 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3016 ScalarVT = MVT::i8;
3017 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3018 ScalarVT = MVT::i16;
3019 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3020 ScalarVT = MVT::i32;
3021
3022 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3023 Info.align = Align(1);
3025 break;
3026 }
3027 case GATHER:
3028 case GATHER_AVX2: {
3030 Info.ptrVal = nullptr;
3031 MVT DataVT = MVT::getVT(I.getType());
3032 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3033 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3034 IndexVT.getVectorNumElements());
3035 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3036 Info.align = Align(1);
3038 break;
3039 }
3040 case SCATTER: {
3042 Info.ptrVal = nullptr;
3043 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3044 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3045 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3046 IndexVT.getVectorNumElements());
3047 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3048 Info.align = Align(1);
3050 break;
3051 }
3052 default:
3053 return false;
3054 }
3055
3056 return true;
3057}
3058
3059/// Returns true if the target can instruction select the
3060/// specified FP immediate natively. If false, the legalizer will
3061/// materialize the FP immediate as a load from a constant pool.
3063 bool ForCodeSize) const {
3064 for (const APFloat &FPImm : LegalFPImmediates)
3065 if (Imm.bitwiseIsEqual(FPImm))
3066 return true;
3067 return false;
3068}
3069
3071 ISD::LoadExtType ExtTy,
3072 EVT NewVT) const {
3073 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3074
3075 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
3076 // relocations target a movq or addq instruction: don't let the load shrink.
3077 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3078 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3079 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3080 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3081
3082 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3083 // those uses are extracted directly into a store, then the extract + store
3084 // can be store-folded. Therefore, it's probably not worth splitting the load.
3085 EVT VT = Load->getValueType(0);
3086 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3087 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
3088 // Skip uses of the chain value. Result 0 of the node is the load value.
3089 if (UI.getUse().getResNo() != 0)
3090 continue;
3091
3092 // If this use is not an extract + store, it's probably worth splitting.
3093 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
3094 UI->use_begin()->getOpcode() != ISD::STORE)
3095 return true;
3096 }
3097 // All non-chain uses are extract + store.
3098 return false;
3099 }
3100
3101 return true;
3102}
3103
3104/// Returns true if it is beneficial to convert a load of a constant
3105/// to just the constant itself.
3107 Type *Ty) const {
3108 assert(Ty->isIntegerTy());
3109
3110 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3111 if (BitSize == 0 || BitSize > 64)
3112 return false;
3113 return true;
3114}
3115
3117 // If we are using XMM registers in the ABI and the condition of the select is
3118 // a floating-point compare and we have blendv or conditional move, then it is
3119 // cheaper to select instead of doing a cross-register move and creating a
3120 // load that depends on the compare result.
3121 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3122 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3123}
3124
3126 // TODO: It might be a win to ease or lift this restriction, but the generic
3127 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3128 if (VT.isVector() && Subtarget.hasAVX512())
3129 return false;
3130
3131 return true;
3132}
3133
3135 SDValue C) const {
3136 // TODO: We handle scalars using custom code, but generic combining could make
3137 // that unnecessary.
3138 APInt MulC;
3139 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3140 return false;
3141
3142 // Find the type this will be legalized to. Otherwise we might prematurely
3143 // convert this to shl+add/sub and then still have to type legalize those ops.
3144 // Another choice would be to defer the decision for illegal types until
3145 // after type legalization. But constant splat vectors of i64 can't make it
3146 // through type legalization on 32-bit targets so we would need to special
3147 // case vXi64.
3148 while (getTypeAction(Context, VT) != TypeLegal)
3149 VT = getTypeToTransformTo(Context, VT);
3150
3151 // If vector multiply is legal, assume that's faster than shl + add/sub.
3152 // Multiply is a complex op with higher latency and lower throughput in
3153 // most implementations, but sub-vXi32 vector multiplies are always fast,
3154 // vXi32 must not have a slow PMULLD implementation, and anything larger (vXi64)
3155 // is always going to be slow.
3156 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3157 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3158 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3159 return false;
3160
3161 // shl+add, shl+sub, shl+add+neg
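  // e.g. mul x, 5 -> (x << 2) + x (MulC - 1 is a power of 2), and
  // mul x, 7 -> (x << 3) - x (MulC + 1 is a power of 2).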
3162 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3163 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3164}
3165
3167 unsigned Index) const {
3169 return false;
3170
3171 // Mask vectors support all subregister combinations and operations that
3172 // extract half of vector.
3173 if (ResVT.getVectorElementType() == MVT::i1)
3174 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3175 (Index == ResVT.getVectorNumElements()));
3176
3177 return (Index % ResVT.getVectorNumElements()) == 0;
3178}
3179
3181 unsigned Opc = VecOp.getOpcode();
3182
3183 // Assume target opcodes can't be scalarized.
3184 // TODO - do we have any exceptions?
3185 if (Opc >= ISD::BUILTIN_OP_END)
3186 return false;
3187
3188 // If the vector op is not supported, try to convert to scalar.
3189 EVT VecVT = VecOp.getValueType();
3190 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3191 return true;
3192
3193 // If the vector op is supported, but the scalar op is not, the transform may
3194 // not be worthwhile.
3195 EVT ScalarVT = VecVT.getScalarType();
3196 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3197}
3198
3200 bool) const {
3201 // TODO: Allow vectors?
3202 if (VT.isVector())
3203 return false;
3204 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3205}
3206
3208 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
3209 return Subtarget.hasBMI() ||
3210 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
3211}
3212
3214 // Speculate ctlz only if we can directly use LZCNT.
3215 return Subtarget.hasLZCNT();
3216}
3217
3219 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3220 // expensive than a straight movsd. On the other hand, it's important to
3221 // shrink long double fp constant since fldt is very slow.
3222 return !Subtarget.hasSSE2() || VT == MVT::f80;
3223}
3224
3226 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3227 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3228}
3229
3231 const SelectionDAG &DAG,
3232 const MachineMemOperand &MMO) const {
3233 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3234 BitcastVT.getVectorElementType() == MVT::i1)
3235 return false;
3236
3237 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3238 return false;
3239
3240 // If both types are legal vectors, it's always ok to convert them.
3241 if (LoadVT.isVector() && BitcastVT.isVector() &&
3242 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3243 return true;
3244
3245 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3246}
3247
3249 const MachineFunction &MF) const {
3250 // Do not merge to float value size (128 bits) if no implicit
3251 // float attribute is set.
3252 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3253
3254 if (NoFloat) {
3255 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3256 return (MemVT.getSizeInBits() <= MaxIntSize);
3257 }
3258 // Make sure we don't merge greater than our preferred vector
3259 // width.
3260 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3261 return false;
3262
3263 return true;
3264}
3265
3267 return Subtarget.hasFastLZCNT();
3268}
3269
3271 const Instruction &AndI) const {
3272 return true;
3273}
3274
3276 EVT VT = Y.getValueType();
3277
3278 if (VT.isVector())
3279 return false;
3280
3281 if (!Subtarget.hasBMI())
3282 return false;
3283
3284 // There are only 32-bit and 64-bit forms for 'andn'.
3285 if (VT != MVT::i32 && VT != MVT::i64)
3286 return false;
3287
3288 return !isa<ConstantSDNode>(Y);
3289}
3290
3292 EVT VT = Y.getValueType();
3293
3294 if (!VT.isVector())
3295 return hasAndNotCompare(Y);
3296
3297 // Vector.
3298
3299 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3300 return false;
3301
3302 if (VT == MVT::v4i32)
3303 return true;
3304
3305 return Subtarget.hasSSE2();
3306}
3307
3309 return X.getValueType().isScalarInteger(); // 'bt'
3310}
3311
3315 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3316 SelectionDAG &DAG) const {
3317 // Does baseline recommend not to perform the fold by default?
3319 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3320 return false;
3321 // For scalars this transform is always beneficial.
3322 if (X.getValueType().isScalarInteger())
3323 return true;
3324 // If all the shift amounts are identical, then the transform is beneficial even
3325 // with rudimentary SSE2 shifts.
3326 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3327 return true;
3328 // If we have AVX2 with its powerful shift operations, then it's also good.
3329 if (Subtarget.hasAVX2())
3330 return true;
3331 // Pre-AVX2 vector codegen for this pattern is best for the variant with 'shl'.
3332 return NewShiftOpcode == ISD::SHL;
3333}
3334
3336 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3337 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3338 if (!VT.isInteger())
3339 return ShiftOpc;
3340
3341 bool PreferRotate = false;
3342 if (VT.isVector()) {
3343 // For vectors, if we have rotate instruction support, then it's definitely
3344 // best. Otherwise it's not clear what's best, so just don't make changes.
3345 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3346 VT.getScalarType() == MVT::i64);
3347 } else {
3348 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3349 // rotate unless we have a zext mask+shr.
3350 PreferRotate = Subtarget.hasBMI2();
3351 if (!PreferRotate) {
3352 unsigned MaskBits =
3353 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3354 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3355 }
3356 }
3357
3358 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3359 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3360
3361 if (PreferRotate && MayTransformRotate)
3362 return ISD::ROTL;
3363
3364 // If this is a vector, we don't really get much benefit from swapping around
3365 // constants. Maybe in the future we could check if the DAG already has the
3366 // flipped node.
3367 if (VT.isVector())
3368 return ShiftOpc;
3369
3370 // See if it's beneficial to swap the shift type.
3371 if (ShiftOpc == ISD::SHL) {
3372 // If the current setup has an imm64 mask, then the inverse will have
3373 // at least an imm32 mask (or be a zext i32 -> i64).
3374 if (VT == MVT::i64)
3375 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3376 : ShiftOpc;
3377
3378 // We can only benefit if the mask requires at least 7 bits. We don't
3379 // want to replace shl by 1, 2 or 3, as those can be implemented with
3380 // lea/add.
3381 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3382 }
3383
3384 if (VT == MVT::i64)
3385 // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is
3386 // extremely efficient.
3387 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3388
3389 // Keep small shifts as shl so we can generate add/lea.
3390 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3391 }
3392
3393 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3394 // (PreferRotate will be set in the latter case).
3395 if (PreferRotate || VT.isVector())
3396 return ShiftOpc;
3397
3398 // Non-vector type and we have a zext mask with SRL.
3399 return ISD::SRL;
3400}
3401
3404 const Value *Lhs,
3405 const Value *Rhs) const {
3406 using namespace llvm::PatternMatch;
3407 int BaseCost = BrMergingBaseCostThresh.getValue();
3408 // a == b && a == c is a fast pattern on x86.
3410 if (BaseCost >= 0 && Opc == Instruction::And &&
3411 match(Lhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3412 Pred == ICmpInst::ICMP_EQ &&
3413 match(Rhs, m_ICmp(Pred, m_Value(), m_Value())) &&
3414 Pred == ICmpInst::ICMP_EQ)
3415 BaseCost += 1;
3416 return {BaseCost, BrMergingLikelyBias.getValue(),
3417 BrMergingUnlikelyBias.getValue()};
3418}
3419
3421 return N->getOpcode() != ISD::FP_EXTEND;
3422}
3423
3425 const SDNode *N, CombineLevel Level) const {
3426 assert(((N->getOpcode() == ISD::SHL &&
3427 N->getOperand(0).getOpcode() == ISD::SRL) ||
3428 (N->getOpcode() == ISD::SRL &&
3429 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3430 "Expected shift-shift mask");
3431 // TODO: Should we always create i64 masks? Or only folded immediates?
3432 EVT VT = N->getValueType(0);
3433 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3434 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3435 // Only fold if the shift values are equal - so it folds to AND.
3436 // TODO - we should fold if either is a non-uniform vector but we don't do
3437 // the fold for non-splats yet.
3438 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3439 }
3441}
3442
3444 EVT VT = Y.getValueType();
3445
3446 // For vectors, we don't have a preference, but we probably want a mask.
3447 if (VT.isVector())
3448 return false;
3449
3450 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3451 if (VT == MVT::i64 && !Subtarget.is64Bit())
3452 return false;
3453
3454 return true;
3455}
3456
3459 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3461 !Subtarget.isOSWindows())
3464 ExpansionFactor);
3465}
3466
3468 // Any legal vector type can be splatted more efficiently than
3469 // loading/spilling from memory.
3470 return isTypeLegal(VT);
3471}
3472
3474 MVT VT = MVT::getIntegerVT(NumBits);
3475 if (isTypeLegal(VT))
3476 return VT;
3477
3478 // PMOVMSKB can handle this.
3479 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3480 return MVT::v16i8;
3481
3482 // VPMOVMSKB can handle this.
3483 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3484 return MVT::v32i8;
3485
3486 // TODO: Allow 64-bit type for 32-bit target.
3487 // TODO: 512-bit types should be allowed, but make sure that those
3488 // cases are handled in combineVectorSizedSetCCEquality().
3489
3491}
3492
3493/// Val is the undef sentinel value or equal to the specified value.
3494static bool isUndefOrEqual(int Val, int CmpVal) {
3495 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3496}
3497
3498/// Return true if every element in Mask is the undef sentinel value or equal to
3499/// the specified value.
3500static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3501 return llvm::all_of(Mask, [CmpVal](int M) {
3502 return (M == SM_SentinelUndef) || (M == CmpVal);
3503 });
3504}
3505
3506/// Return true if every element in Mask, beginning from position Pos and ending
3507/// in Pos+Size is the undef sentinel value or equal to the specified value.
3508static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3509 unsigned Size) {
3510 return llvm::all_of(Mask.slice(Pos, Size),
3511 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3512}
3513
3514/// Val is either the undef or zero sentinel value.
3515static bool isUndefOrZero(int Val) {
3516 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3517}
3518
3519/// Return true if every element in Mask, beginning from position Pos and ending
3520/// in Pos+Size is the undef sentinel value.
3521static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3522 return llvm::all_of(Mask.slice(Pos, Size),
3523 [](int M) { return M == SM_SentinelUndef; });
3524}
3525
3526/// Return true if the mask creates a vector whose lower half is undefined.
3528 unsigned NumElts = Mask.size();
3529 return isUndefInRange(Mask, 0, NumElts / 2);
3530}
3531
3532/// Return true if the mask creates a vector whose upper half is undefined.
3534 unsigned NumElts = Mask.size();
3535 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3536}
3537
3538/// Return true if Val falls within the specified range [Low, Hi).
3539static bool isInRange(int Val, int Low, int Hi) {
3540 return (Val >= Low && Val < Hi);
3541}
3542
3543/// Return true if the value of any element in Mask falls within the specified
3544/// range [Low, Hi).
3545static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3546 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3547}
3548
3549/// Return true if the value of any element in Mask is the zero sentinel value.
3550static bool isAnyZero(ArrayRef<int> Mask) {
3551 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3552}
3553
3554/// Return true if Val is undef or if its value falls within the
3555/// specified range [Low, Hi).
3556static bool isUndefOrInRange(int Val, int Low, int Hi) {
3557 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3558}
3559
3560/// Return true if every element in Mask is undef or if its value
3561/// falls within the specified range [Low, Hi).
3562static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3563 return llvm::all_of(
3564 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3565}
3566
3567/// Return true if Val is undef, zero or if its value falls within the
3568/// specified range [Low, Hi).
3569static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3570 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3571}
3572
3573/// Return true if every element in Mask is undef, zero or if its value
3574/// falls within the specified range [Low, Hi).
3575static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3576 return llvm::all_of(
3577 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3578}
3579
3580/// Return true if every element in Mask, beginning
3581/// from position Pos and ending in Pos + Size, falls within the specified
3582/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
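/// e.g. Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4, Step = 1 returns true.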
3583static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3584 unsigned Size, int Low, int Step = 1) {
3585 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3586 if (!isUndefOrEqual(Mask[i], Low))
3587 return false;
3588 return true;
3589}
3590
3591/// Return true if every element in Mask, beginning
3592/// from position Pos and ending in Pos+Size, falls within the specified
3593/// sequential range [Low, Low + Size), or is undef or is zero.
3595 unsigned Size, int Low,
3596 int Step = 1) {
3597 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3598 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3599 return false;
3600 return true;
3601}
3602
3603/// Return true if every element in Mask, beginning
3604/// from position Pos and ending in Pos+Size is undef or is zero.
3605static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3606 unsigned Size) {
3607 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3608}
3609
3610/// Return true if every element of a single input is referenced by the shuffle
3611/// mask. i.e. it just permutes them all.
3613 unsigned NumElts = Mask.size();
3614 APInt DemandedElts = APInt::getZero(NumElts);
3615 for (int M : Mask)
3616 if (isInRange(M, 0, NumElts))
3617 DemandedElts.setBit(M);
3618 return DemandedElts.isAllOnes();
3619}
3620
3621/// Helper function to test whether a shuffle mask could be
3622/// simplified by widening the elements being shuffled.
3623///
3624/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3625/// leaves it in an unspecified state.
3626///
3627/// NOTE: This must handle normal vector shuffle masks and *target* vector
3628/// shuffle masks. The latter have the special property of a '-2' representing
3629/// a zero-ed lane of a vector.
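/// e.g. the mask <0, 1, 2, 3, -1, -1, 6, 7> widens to <0, 1, -1, 3>, and a
/// zeroable pair <-2, -2> widens to a single -2 (zero) lane.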
3631 SmallVectorImpl<int> &WidenedMask) {
3632 WidenedMask.assign(Mask.size() / 2, 0);
3633 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3634 int M0 = Mask[i];
3635 int M1 = Mask[i + 1];
3636
3637 // If both elements are undef, it's trivial.
3638 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3639 WidenedMask[i / 2] = SM_SentinelUndef;
3640 continue;
3641 }
3642
3643 // Check for an undef mask and a mask value properly aligned to fit with
3644 // a pair of values. If we find such a case, use the non-undef mask's value.
3645 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3646 WidenedMask[i / 2] = M1 / 2;
3647 continue;
3648 }
3649 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3650 WidenedMask[i / 2] = M0 / 2;
3651 continue;
3652 }
3653
3654 // When zeroing, we need to spread the zeroing across both lanes to widen.
3655 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3656 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3658 WidenedMask[i / 2] = SM_SentinelZero;
3659 continue;
3660 }
3661 return false;
3662 }
3663
3664 // Finally check if the two mask values are adjacent and aligned with
3665 // a pair.
3666 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3667 WidenedMask[i / 2] = M0 / 2;
3668 continue;
3669 }
3670
3671 // Otherwise we can't safely widen the elements used in this shuffle.
3672 return false;
3673 }
3674 assert(WidenedMask.size() == Mask.size() / 2 &&
3675 "Incorrect size of mask after widening the elements!");
3676
3677 return true;
3678}
3679
3681 const APInt &Zeroable,
3682 bool V2IsZero,
3683 SmallVectorImpl<int> &WidenedMask) {
3684 // Create an alternative mask with info about zeroable elements.
3685 // Here we do not set undef elements as zeroable.
3686 SmallVector<int, 64> ZeroableMask(Mask);
3687 if (V2IsZero) {
3688 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3689 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3690 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3691 ZeroableMask[i] = SM_SentinelZero;
3692 }
3693 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3694}
3695
3697 SmallVector<int, 32> WidenedMask;
3698 return canWidenShuffleElements(Mask, WidenedMask);
3699}
3700
3701// Attempt to narrow/widen shuffle mask until it matches the target number of
3702// elements.
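// e.g. narrowing <0, 1> to 4 elements gives <0, 1, 2, 3>, while widening
// <0, 1, 2, 3> to 2 elements gives <0, 1>.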
3703static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3704 SmallVectorImpl<int> &ScaledMask) {
3705 unsigned NumSrcElts = Mask.size();
3706 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3707 "Illegal shuffle scale factor");
3708
3709 // Narrowing is guaranteed to work.
3710 if (NumDstElts >= NumSrcElts) {
3711 int Scale = NumDstElts / NumSrcElts;
3712 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3713 return true;
3714 }
3715
3716 // We have to repeat the widening until we reach the target size, but we can
3717 // split out the first widening as it sets up ScaledMask for us.
3718 if (canWidenShuffleElements(Mask, ScaledMask)) {
3719 while (ScaledMask.size() > NumDstElts) {
3720 SmallVector<int, 16> WidenedMask;
3721 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3722 return false;
3723 ScaledMask = std::move(WidenedMask);
3724 }
3725 return true;
3726 }
3727
3728 return false;
3729}
3730
3731/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3733 return isNullConstant(Elt) || isNullFPConstant(Elt);
3734}
3735
3736// Build a vector of constants.
3737// Use an UNDEF node if MaskElt == -1.
3738// Split 64-bit constants in the 32-bit mode.
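// e.g. on a 32-bit target (no legal i64), Values = {1, -1} with IsMask set for
// MVT::v2i64 builds the v4i32 vector <1, 0, undef, undef> and bitcasts it to v2i64.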
3740 const SDLoc &dl, bool IsMask = false) {
3741
3743 bool Split = false;
3744
3745 MVT ConstVecVT = VT;
3746 unsigned NumElts = VT.getVectorNumElements();
3747 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3748 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3749 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3750 Split = true;
3751 }
3752
3753 MVT EltVT = ConstVecVT.getVectorElementType();
3754 for (unsigned i = 0; i < NumElts; ++i) {
3755 bool IsUndef = Values[i] < 0 && IsMask;
3756 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3757 DAG.getConstant(Values[i], dl, EltVT);
3758 Ops.push_back(OpNode);
3759 if (Split)
3760 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3761 DAG.getConstant(0, dl, EltVT));
3762 }
3763 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3764 if (Split)
3765 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3766 return ConstsNode;
3767}
3768
3769static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3770 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3771 assert(Bits.size() == Undefs.getBitWidth() &&
3772 "Unequal constant and undef arrays");
3774 bool Split = false;
3775
3776 MVT ConstVecVT = VT;
3777 unsigned NumElts = VT.getVectorNumElements();
3778 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3779 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3780 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3781 Split = true;
3782 }
3783
3784 MVT EltVT = ConstVecVT.getVectorElementType();
3785 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3786 if (Undefs[i]) {
3787 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3788 continue;
3789 }
3790 const APInt &V = Bits[i];
3791 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3792 if (Split) {
3793 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3794 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3795 } else if (EltVT == MVT::f32) {
3797 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3798 } else if (EltVT == MVT::f64) {
3800 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3801 } else {
3802 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3803 }
3804 }
3805
3806 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3807 return DAG.getBitcast(VT, ConstsNode);
3808}
3809
3811 SelectionDAG &DAG, const SDLoc &dl) {
3812 APInt Undefs = APInt::getZero(Bits.size());
3813 return getConstVector(Bits, Undefs, VT, DAG, dl);
3814}
3815
3816/// Returns a vector of specified type with all zero elements.
3817static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
3818 SelectionDAG &DAG, const SDLoc &dl) {
3819 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
3820 VT.getVectorElementType() == MVT::i1) &&
3821 "Unexpected vector type");
3822
3823 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
3824 // type. This ensures they get CSE'd. But if the integer type is not
3825 // available, use a floating-point +0.0 instead.
3826 SDValue Vec;
3827 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3828 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
3829 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
3830 } else if (VT.isFloatingPoint() &&
3832 Vec = DAG.getConstantFP(+0.0, dl, VT);
3833 } else if (VT.getVectorElementType() == MVT::i1) {
3834 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
3835 "Unexpected vector type");
3836 Vec = DAG.getConstant(0, dl, VT);
3837 } else {
3838 unsigned Num32BitElts = VT.getSizeInBits() / 32;
3839 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
3840 }
3841 return DAG.getBitcast(VT, Vec);
3842}
3843
3844// Helper to determine if the ops are all extracted subvectors that come from a
3845// single source. If we allow commuting, they don't have to be in order (Lo/Hi).
3846static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
3847 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3848 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
3849 LHS.getValueType() != RHS.getValueType() ||
3850 LHS.getOperand(0) != RHS.getOperand(0))
3851 return SDValue();
3852
3853 SDValue Src = LHS.getOperand(0);
3854 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
3855 return SDValue();
3856
3857 unsigned NumElts = LHS.getValueType().getVectorNumElements();
3858 if ((LHS.getConstantOperandAPInt(1) == 0 &&
3859 RHS.getConstantOperandAPInt(1) == NumElts) ||
3860 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
3861 LHS.getConstantOperandAPInt(1) == NumElts))
3862 return Src;
3863
3864 return SDValue();
3865}
3866
3867static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
3868 const SDLoc &dl, unsigned vectorWidth) {
3869 EVT VT = Vec.getValueType();
3870 EVT ElVT = VT.getVectorElementType();
3871 unsigned Factor = VT.getSizeInBits() / vectorWidth;
3872 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
3873 VT.getVectorNumElements() / Factor);
3874
3875 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
3876 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
3877 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3878
3879 // This is the index of the first element of the vectorWidth-bit chunk
3880 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
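  // e.g. extracting 128 bits from a v8i32 source gives ElemsPerChunk = 4, so an
  // IdxVal of 5 is rounded down to 4 (the upper 128-bit chunk).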
3881 IdxVal &= ~(ElemsPerChunk - 1);
3882
3883 // If the input is a buildvector just emit a smaller one.
3884 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
3885 return DAG.getBuildVector(ResultVT, dl,
3886 Vec->ops().slice(IdxVal, ElemsPerChunk));
3887
3888 // Check if we're extracting the upper undef of a widening pattern.
3889 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
3890 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
3891 isNullConstant(Vec.getOperand(2)))
3892 return DAG.getUNDEF(ResultVT);
3893
3894 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3895 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
3896}
3897
3898/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
3899/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
3900/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
3901/// instructions or a simple subregister reference. Idx is an index in the
3902/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
3903/// lowering EXTRACT_VECTOR_ELT operations easier.
3904static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
3905 SelectionDAG &DAG, const SDLoc &dl) {
3907 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
3908 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
3909}
3910
3911/// Generate a DAG to grab 256-bits from a 512-bit vector.
3912static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
3913 SelectionDAG &DAG, const SDLoc &dl) {
3914 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
3915 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
3916}
3917
3918static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3919 SelectionDAG &DAG, const SDLoc &dl,
3920 unsigned vectorWidth) {
3921 assert((vectorWidth == 128 || vectorWidth == 256) &&
3922 "Unsupported vector width");
3923 // Inserting UNDEF just returns Result.
3924 if (Vec.isUndef())
3925 return Result;
3926 EVT VT = Vec.getValueType();
3927 EVT ElVT = VT.getVectorElementType();
3928 EVT ResultVT = Result.getValueType();
3929
3930 // Insert the relevant vectorWidth bits.
3931 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
3932 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
3933
3934 // This is the index of the first element of the vectorWidth-bit chunk
3935 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
3936 IdxVal &= ~(ElemsPerChunk - 1);
3937
3938 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
3939 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
3940}
3941
3942/// Generate a DAG to put 128-bits into a vector > 128 bits. This
3943/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
3944/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
3945/// simple superregister reference. Idx is an index in the 128 bits
3946/// we want. It need not be aligned to a 128-bit boundary. That makes
3947/// lowering INSERT_VECTOR_ELT operations easier.
3948static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
3949 SelectionDAG &DAG, const SDLoc &dl) {
3950 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
3951 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
3952}
3953
3954/// Widen a vector to a larger size with the same scalar type, with the new
3955/// elements either zero or undef.
3956static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
3957 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3958 const SDLoc &dl) {
3960 Vec.getValueType().getScalarType() == VT.getScalarType() &&
3961 "Unsupported vector widening type");
3962 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
3963 : DAG.getUNDEF(VT);
3964 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
3965 DAG.getIntPtrConstant(0, dl));
3966}
3967
3968/// Widen a vector to a larger size with the same scalar type, with the new
3969/// elements either zero or undef.
3970static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
3971 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3972 const SDLoc &dl, unsigned WideSizeInBits) {
3973 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
3974 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
3975 "Unsupported vector widening type");
3976 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
3977 MVT SVT = Vec.getSimpleValueType().getScalarType();
3978 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
3979 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3980}
3981
3982/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
3983/// and bitcast with integer types.
3984static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
3985 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
3986 unsigned NumElts = VT.getVectorNumElements();
3987 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
3988 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
3989 return VT;
3990}
3991
3992/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
3993/// bitcast with integer types.
3994static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
3995 const X86Subtarget &Subtarget, SelectionDAG &DAG,
3996 const SDLoc &dl) {
3997 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
3998 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
3999}
4000
4001// Helper function to collect subvector ops that are concatenated together,
4002// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4003// The subvectors in Ops are guaranteed to be the same type.
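// e.g. (v8i32 concat_vectors(v4i32 x, v4i32 y)) fills Ops with {x, y}, as does the
// equivalent insert_subvector(insert_subvector(undef, x, 0), y, 4) chain.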
4005 SelectionDAG &DAG) {
4006 assert(Ops.empty() && "Expected an empty ops vector");
4007
4008 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4009 Ops.append(N->op_begin(), N->op_end());
4010 return true;
4011 }
4012
4013 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4014 SDValue Src = N->getOperand(0);
4015 SDValue Sub = N->getOperand(1);
4016 const APInt &Idx = N->getConstantOperandAPInt(2);
4017 EVT VT = Src.getValueType();
4018 EVT SubVT = Sub.getValueType();
4019
4020 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4021 // insert_subvector(undef, x, lo)
4022 if (Idx == 0 && Src.isUndef()) {
4023 Ops.push_back(Sub);
4024 Ops.push_back(DAG.getUNDEF(SubVT));
4025 return true;
4026 }
4027 if (Idx == (VT.getVectorNumElements() / 2)) {
4028 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4029 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4030 Src.getOperand(1).getValueType() == SubVT &&
4031 isNullConstant(Src.getOperand(2))) {
4032 // Attempt to recurse into inner (matching) concats.
4033 SDValue Lo = Src.getOperand(1);
4034 SDValue Hi = Sub;
4035 SmallVector<SDValue, 2> LoOps, HiOps;
4036 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4037 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4038 LoOps.size() == HiOps.size()) {
4039 Ops.append(LoOps);
4040 Ops.append(HiOps);
4041 return true;
4042 }
4043 Ops.push_back(Lo);
4044 Ops.push_back(Hi);
4045 return true;
4046 }
4047 // insert_subvector(x, extract_subvector(x, lo), hi)
4048 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4049 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4050 Ops.append(2, Sub);
4051 return true;
4052 }
4053 // insert_subvector(undef, x, hi)
4054 if (Src.isUndef()) {
4055 Ops.push_back(DAG.getUNDEF(SubVT));
4056 Ops.push_back(Sub);
4057 return true;
4058 }
4059 }
4060 }
4061 }
4062
4063 return false;
4064}
4065
4066// Helper to check if \p V can be split into subvectors and the upper subvectors
4067// are all undef, in which case return the lower subvector.
4069 SelectionDAG &DAG) {
4070 SmallVector<SDValue> SubOps;
4071 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4072 return SDValue();
4073
4074 unsigned NumSubOps = SubOps.size();
4075 unsigned HalfNumSubOps = NumSubOps / 2;
4076 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4077
4078 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4079 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4080 return SDValue();
4081
4082 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4083 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4084 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4085}
4086
4087// Helper to check if we can access all the constituent subvectors without any
4088// extract ops.
4091 return collectConcatOps(N, Ops, DAG);
4092}
4093
4094static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4095 const SDLoc &dl) {
4096 EVT VT = Op.getValueType();
4097 unsigned NumElems = VT.getVectorNumElements();
4098 unsigned SizeInBits = VT.getSizeInBits();
4099 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4100 "Can't split odd sized vector");
4101
4102 // If this is a splat value (with no-undefs) then use the lower subvector,
4103 // which should be a free extraction.
4104 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4105 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4106 return std::make_pair(Lo, Lo);
4107
4108 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4109 return std::make_pair(Lo, Hi);
4110}
4111
4112/// Break an operation into 2 half sized ops and then concatenate the results.
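/// e.g. a v8i32 op becomes concat_vectors(op(lo halves), op(hi halves)), with any
/// scalar operands passed through unchanged to both halves.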
4114 unsigned NumOps = Op.getNumOperands();
4115 EVT VT = Op.getValueType();
4116
4117 // Extract the Lo/Hi vectors from each operand.
4118 SmallVector<SDValue> LoOps(NumOps, SDValue());
4119 SmallVector<SDValue> HiOps(NumOps, SDValue());
4120 for (unsigned I = 0; I != NumOps; ++I) {
4121 SDValue SrcOp = Op.getOperand(I);
4122 if (!SrcOp.getValueType().isVector()) {
4123 LoOps[I] = HiOps[I] = SrcOp;
4124 continue;
4125 }
4126 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4127 }
4128
4129 EVT LoVT, HiVT;
4130 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4131 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4132 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4133 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4134}
4135
4136/// Break a unary integer operation into 2 half-sized ops and then
4137/// concatenate the result back.
4139 const SDLoc &dl) {
4140 // Make sure we only try to split 256/512-bit types to avoid creating
4141 // narrow vectors.
4142 EVT VT = Op.getValueType();
4143 (void)VT;
4144 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4145 Op.getOperand(0).getValueType().is512BitVector()) &&
4146 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4147 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4148 VT.getVectorNumElements() &&
4149 "Unexpected VTs!");
4150 return splitVectorOp(Op, DAG, dl);
4151}
4152
4153/// Break a binary integer operation into 2 half sized ops and then
4154/// concatenate the result back.
4156 const SDLoc &dl) {
4157 // Assert that all the types match.
4158 EVT VT = Op.getValueType();
4159 (void)VT;
4160 assert(Op.getOperand(0).getValueType() == VT &&
4161 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4162 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4163 return splitVectorOp(Op, DAG, dl);
4164}
4165
4166// Helper for splitting the operands of an operation to a legal target size and
4167// applying a function to each part.
4168// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4169// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4170// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4171// The argument Builder is a function that will be applied on each split part:
4172// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
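// e.g. on an AVX2 target without AVX-512, a 512-bit VT is split into two 256-bit
// parts; Builder is invoked on the extracted operands of each part and the results
// are rejoined with CONCAT_VECTORS.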
4173template <typename F>
4175 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4176 F Builder, bool CheckBWI = true) {
4177 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4178 unsigned NumSubs = 1;
4179 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4180 (!CheckBWI && Subtarget.useAVX512Regs())) {
4181 if (VT.getSizeInBits() > 512) {
4182 NumSubs = VT.getSizeInBits() / 512;
4183 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4184 }
4185 } else if (Subtarget.hasAVX2()) {
4186 if (VT.getSizeInBits() > 256) {
4187 NumSubs = VT.getSizeInBits() / 256;
4188 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4189 }
4190 } else {
4191 if (VT.getSizeInBits() > 128) {
4192 NumSubs = VT.getSizeInBits() / 128;
4193 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4194 }
4195 }
4196
4197 if (NumSubs == 1)
4198 return Builder(DAG, DL, Ops);
4199
4200 SmallVector<SDValue, 4> Subs;
4201 for (unsigned i = 0; i != NumSubs; ++i) {
4202 SmallVector<SDValue, 2> SubOps;
4203 for (SDValue Op : Ops) {
4204 EVT OpVT = Op.getValueType();
4205 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4206 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4207 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4208 }
4209 Subs.push_back(Builder(DAG, DL, SubOps));
4210 }
4211 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4212}
4213
4214// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4215// targets.
4216static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4217 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4218 const X86Subtarget &Subtarget) {
4219 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4220 MVT SVT = VT.getScalarType();
4221
4222 // If we have a 32/64-bit splatted constant, splat it to DstTy to
4223 // encourage a foldable broadcast'd operand.
4224 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4225 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4226 // AVX512 broadcasts 32/64-bit operands.
4227 // TODO: Support float once getAVX512Node is used by fp-ops.
4228 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4229 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
4230 return SDValue();
4231 // If we're not widening, don't bother if we're not bitcasting.
4232 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4233 return SDValue();
4234 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4235 APInt SplatValue, SplatUndef;
4236 unsigned SplatBitSize;
4237 bool HasAnyUndefs;
4238 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4239 HasAnyUndefs, OpEltSizeInBits) &&
4240 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4241 return DAG.getConstant(SplatValue, DL, DstVT);
4242 }
4243 return SDValue();
4244 };
4245
4246 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4247
4248 MVT DstVT = VT;
4249 if (Widen)
4250 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4251
4252 // Canonicalize src operands.
4253 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
4254 for (SDValue &Op : SrcOps) {
4255 MVT OpVT = Op.getSimpleValueType();
4256 // Just pass through scalar operands.
4257 if (!OpVT.isVector())
4258 continue;
4259 assert(OpVT == VT && "Vector type mismatch");
4260
4261 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4262 Op = BroadcastOp;
4263 continue;
4264 }
4265
4266 // Just widen the subvector by inserting into an undef wide vector.
4267 if (Widen)
4268 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4269 }
4270
4271 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4272
4273 // Perform the 512-bit op then extract the bottom subvector.
4274 if (Widen)
4275 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4276 return Res;
4277}
4278
4279/// Insert i1-subvector to i1-vector.
4280 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4281 const X86Subtarget &Subtarget) {
4282
4283 SDLoc dl(Op);
4284 SDValue Vec = Op.getOperand(0);
4285 SDValue SubVec = Op.getOperand(1);
4286 SDValue Idx = Op.getOperand(2);
4287 unsigned IdxVal = Op.getConstantOperandVal(2);
4288
4289 // Inserting undef is a nop. We can just return the original vector.
4290 if (SubVec.isUndef())
4291 return Vec;
4292
4293 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4294 return Op;
4295
4296 MVT OpVT = Op.getSimpleValueType();
4297 unsigned NumElems = OpVT.getVectorNumElements();
4298 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4299
4300 // Extend to natively supported kshift.
4301 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4302
4303 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4304 // if necessary.
4305 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4306 // May need to promote to a legal type.
4307 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4308 DAG.getConstant(0, dl, WideOpVT),
4309 SubVec, Idx);
4310 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4311 }
4312
4313 MVT SubVecVT = SubVec.getSimpleValueType();
4314 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4315 assert(IdxVal + SubVecNumElems <= NumElems &&
4316 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4317 "Unexpected index value in INSERT_SUBVECTOR");
4318
4319 SDValue Undef = DAG.getUNDEF(WideOpVT);
4320
4321 if (IdxVal == 0) {
4322 // Zero lower bits of the Vec
4323 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4324 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4325 ZeroIdx);
4326 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4327 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4328 // Merge them together, SubVec should be zero extended.
4329 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4330 DAG.getConstant(0, dl, WideOpVT),
4331 SubVec, ZeroIdx);
4332 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4333 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4334 }
4335
4336 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4337 Undef, SubVec, ZeroIdx);
4338
4339 if (Vec.isUndef()) {
4340 assert(IdxVal != 0 && "Unexpected index");
4341 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4342 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4343 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4344 }
4345
4346 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4347 assert(IdxVal != 0 && "Unexpected index");
4348 // If upper elements of Vec are known undef, then just shift into place.
4349 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4350 [](SDValue V) { return V.isUndef(); })) {
4351 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4352 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4353 } else {
4354 NumElems = WideOpVT.getVectorNumElements();
4355 unsigned ShiftLeft = NumElems - SubVecNumElems;
4356 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4357 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4358 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4359 if (ShiftRight != 0)
4360 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4361 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4362 }
4363 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4364 }
4365
4366 // Simple case when we put the subvector in the upper part.
4367 if (IdxVal + SubVecNumElems == NumElems) {
4368 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4369 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4370 if (SubVecNumElems * 2 == NumElems) {
4371 // Special case, use legal zero extending insert_subvector. This allows
4372 // isel to optimize when bits are known zero.
4373 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4374 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4375 DAG.getConstant(0, dl, WideOpVT),
4376 Vec, ZeroIdx);
4377 } else {
4378 // Otherwise use explicit shifts to zero the bits.
4379 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4380 Undef, Vec, ZeroIdx);
4381 NumElems = WideOpVT.getVectorNumElements();
4382 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4383 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4384 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4385 }
4386 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4387 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4388 }
4389
4390 // Inserting into the middle is more complicated.
4391
4392 NumElems = WideOpVT.getVectorNumElements();
4393
4394 // Widen the vector if needed.
4395 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4396
4397 unsigned ShiftLeft = NumElems - SubVecNumElems;
4398 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4399
4400 // Do an optimization for the most frequently used types.
4401 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4402 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4403 Mask0.flipAllBits();
4404 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4405 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4406 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4407 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4408 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4409 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4410 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4411 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4412
4413 // Reduce to original width if needed.
4414 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4415 }
4416
4417 // Clear the upper bits of the subvector and move it to its insert position.
4418 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4419 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4420 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4421 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4422
4423 // Isolate the bits below the insertion point.
4424 unsigned LowShift = NumElems - IdxVal;
4425 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4426 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4427 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4428 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4429
4430 // Isolate the bits after the last inserted bit.
4431 unsigned HighShift = IdxVal + SubVecNumElems;
4432 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4433 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4434 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4435 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4436
4437 // Now OR all 3 pieces together.
4438 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4439 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4440
4441 // Reduce to original width if needed.
4442 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4443}
4444
4445 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4446 const SDLoc &dl) {
4447 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4448 EVT SubVT = V1.getValueType();
4449 EVT SubSVT = SubVT.getScalarType();
4450 unsigned SubNumElts = SubVT.getVectorNumElements();
4451 unsigned SubVectorWidth = SubVT.getSizeInBits();
4452 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4453 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4454 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4455}
4456
4457/// Returns a vector of specified type with all bits set.
4458/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4459/// Then bitcast to their original type, ensuring they get CSE'd.
4460static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4461 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4462 "Expected a 128/256/512-bit vector type");
4463 unsigned NumElts = VT.getSizeInBits() / 32;
4464 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4465 return DAG.getBitcast(VT, Vec);
4466}
4467
4468static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4469 SDValue In, SelectionDAG &DAG) {
4470 EVT InVT = In.getValueType();
4471 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4472 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4473 ISD::ZERO_EXTEND == Opcode) &&
4474 "Unknown extension opcode");
4475
4476 // For 256-bit vectors, we only need the lower (128-bit) input half.
4477 // For 512-bit vectors, we only need the lower input half or quarter.
4478 if (InVT.getSizeInBits() > 128) {
4479 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4480 "Expected VTs to be the same size!");
4481 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4482 In = extractSubVector(In, 0, DAG, DL,
4483 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4484 InVT = In.getValueType();
4485 }
4486
4487 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4488 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4489
4490 return DAG.getNode(Opcode, DL, VT, In);
4491}
4492
4493// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4494static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4495 SDValue Mask, SelectionDAG &DAG) {
4496 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4497 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4498 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4499}
4500
4501 static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4502 bool Lo, bool Unary) {
4503 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4504 "Illegal vector type to unpack");
4505 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4506 int NumElts = VT.getVectorNumElements();
4507 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4508 for (int i = 0; i < NumElts; ++i) {
4509 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4510 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4511 Pos += (Unary ? 0 : NumElts * (i % 2));
4512 Pos += (Lo ? 0 : NumEltsInLane / 2);
4513 Mask.push_back(Pos);
4514 }
4515}
4516
4517/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4518/// imposed by AVX and specific to the unary pattern. Example:
4519/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4520/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4521 static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4522 bool Lo) {
4523 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4524 int NumElts = VT.getVectorNumElements();
4525 for (int i = 0; i < NumElts; ++i) {
4526 int Pos = i / 2;
4527 Pos += (Lo ? 0 : NumElts / 2);
4528 Mask.push_back(Pos);
4529 }
4530}
4531
4532// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4533static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4534 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4535 if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4536 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4537 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4538 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4539 int M = Mask[I];
4540 if (M < 0)
4541 continue;
4542 SDValue V = (M < NumElts) ? V1 : V2;
4543 if (V.isUndef())
4544 continue;
4545 Ops[I] = V.getOperand(M % NumElts);
4546 }
4547 return DAG.getBuildVector(VT, dl, Ops);
4548 }
4549
4550 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4551}
4552
4553/// Returns a vector_shuffle node for an unpackl operation.
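/// e.g. for v4i32 this uses the shuffle mask <0, 4, 1, 5>.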
4554static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4555 SDValue V1, SDValue V2) {
4556 SmallVector<int, 8> Mask;
4557 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4558 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4559}
4560
4561/// Returns a vector_shuffle node for an unpackh operation.
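/// e.g. for v4i32 this uses the shuffle mask <2, 6, 3, 7>.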
4562static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4563 SDValue V1, SDValue V2) {
4564 SmallVector<int, 8> Mask;
4565 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4566 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4567}
4568
4569/// Returns a node that packs the LHS + RHS nodes together at half width.
4570/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4571/// TODO: Add subvector splitting if/when we have a need for it.
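/// e.g. packing two v4i32 operands into a v8i16 result keeps either the low
/// (PackHiHalf == false) or high (PackHiHalf == true) 16 bits of each i32.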
4572static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4573 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4574 bool PackHiHalf = false) {
4575 MVT OpVT = LHS.getSimpleValueType();
4576 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4577 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4578 assert(OpVT == RHS.getSimpleValueType() &&
4579 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4580 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4581 "Unexpected PACK operand types");
4582 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4583 "Unexpected PACK result type");
4584
4585 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4586 if (EltSizeInBits == 32) {
4587 SmallVector<int> PackMask;
4588 int Offset = PackHiHalf ? 1 : 0;
4589 int NumElts = VT.getVectorNumElements();
4590 for (int I = 0; I != NumElts; I += 4) {
4591 PackMask.push_back(I + Offset);
4592 PackMask.push_back(I + Offset + 2);
4593 PackMask.push_back(I + Offset + NumElts);
4594 PackMask.push_back(I + Offset + NumElts + 2);
4595 }
4596 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4597 DAG.getBitcast(VT, RHS), PackMask);
4598 }
4599
4600 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4601 if (!PackHiHalf) {
4602 if (UsePackUS &&
4603 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4604 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4605 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4606
4607 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4608 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4609 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4610 }
4611
4612 // Fallback to sign/zero extending the requested half and pack.
4613 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4614 if (UsePackUS) {
4615 if (PackHiHalf) {
4616 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4617 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4618 } else {
4619 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4620 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4621 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4622 }
4623 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4624 }
4625
4626 if (!PackHiHalf) {
4627 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4628 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4629 }
4630 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4631 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4632 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4633}
4634
4635 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
4636/// This produces a shuffle where the low element of V2 is swizzled into the
4637/// zero/undef vector, landing at element Idx.
4638/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4639 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4640 bool IsZero,
4641 const X86Subtarget &Subtarget,
4642 SelectionDAG &DAG) {
4643 MVT VT = V2.getSimpleValueType();
4644 SDValue V1 = IsZero
4645 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4646 int NumElems = VT.getVectorNumElements();
4647 SmallVector<int, 16> MaskVec(NumElems);
4648 for (int i = 0; i != NumElems; ++i)
4649 // If this is the insertion idx, put the low elt of V2 here.
4650 MaskVec[i] = (i == Idx) ? NumElems : i;
4651 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4652}
4653
4654 static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4655 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4656 Ptr.getOpcode() == X86ISD::WrapperRIP)
4657 Ptr = Ptr.getOperand(0);
4658 return dyn_cast<ConstantPoolSDNode>(Ptr);
4659}
4660
4661// TODO: Add support for non-zero offsets.
4662 static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4663 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4664 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4665 return nullptr;
4666 return CNode->getConstVal();
4667}
4668
4669 static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4670 if (!Load || !ISD::isNormalLoad(Load))
4671 return nullptr;
4672 return getTargetConstantFromBasePtr(Load->getBasePtr());
4673}
4674
4675 static const Constant *getTargetConstantFromNode(SDValue Op) {
4676 Op = peekThroughBitcasts(Op);
4677 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4678}
4679
4680const Constant *
4681 X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4682 assert(LD && "Unexpected null LoadSDNode");
4683 return getTargetConstantFromNode(LD);
4684}
4685
4686// Extract raw constant bits from constant pools.
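// e.g. querying a v4i32 build vector <1, 2, 3, 4> with EltSizeInBits == 64
// repacks it (little-endian) into EltBits == { 0x0000000200000001,
// 0x0000000400000003 } with no undef elements.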
4687static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4688 APInt &UndefElts,
4689 SmallVectorImpl<APInt> &EltBits,
4690 bool AllowWholeUndefs = true,
4691 bool AllowPartialUndefs = false) {
4692 assert(EltBits.empty() && "Expected an empty EltBits vector");
4693
4694 Op = peekThroughBitcasts(Op);
4695
4696 EVT VT = Op.getValueType();
4697 unsigned SizeInBits = VT.getSizeInBits();
4698 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4699 unsigned NumElts = SizeInBits / EltSizeInBits;
4700
4701 // Bitcast a source array of element bits to the target size.
4702 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4703 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4704 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4705 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4706 "Constant bit sizes don't match");
4707
4708 // Don't split if we don't allow undef bits.
4709 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4710 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4711 return false;
4712
4713 // If we're already the right size, don't bother bitcasting.
4714 if (NumSrcElts == NumElts) {
4715 UndefElts = UndefSrcElts;
4716 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4717 return true;
4718 }
4719
4720 // Extract all the undef/constant element data and pack into single bitsets.
4721 APInt UndefBits(SizeInBits, 0);
4722 APInt MaskBits(SizeInBits, 0);
4723
4724 for (unsigned i = 0; i != NumSrcElts; ++i) {
4725 unsigned BitOffset = i * SrcEltSizeInBits;
4726 if (UndefSrcElts[i])
4727 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4728 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4729 }
4730
4731 // Split the undef/constant single bitset data into the target elements.
4732 UndefElts = APInt(NumElts, 0);
4733 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4734
4735 for (unsigned i = 0; i != NumElts; ++i) {
4736 unsigned BitOffset = i * EltSizeInBits;
4737 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4738
4739 // Only treat an element as UNDEF if all bits are UNDEF.
4740 if (UndefEltBits.isAllOnes()) {
4741 if (!AllowWholeUndefs)
4742 return false;
4743 UndefElts.setBit(i);
4744 continue;
4745 }
4746
4747 // If only some bits are UNDEF then treat them as zero (or bail if not
4748 // supported).
4749 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4750 return false;
4751
4752 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4753 }
4754 return true;
4755 };
4756
4757 // Collect constant bits and insert into mask/undef bit masks.
4758 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4759 unsigned UndefBitIndex) {
4760 if (!Cst)
4761 return false;
4762 if (isa<UndefValue>(Cst)) {
4763 Undefs.setBit(UndefBitIndex);
4764 return true;
4765 }
4766 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4767 Mask = CInt->getValue();
4768 return true;
4769 }
4770 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4771 Mask = CFP->getValueAPF().bitcastToAPInt();
4772 return true;
4773 }
4774 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4775 Type *Ty = CDS->getType();
4776 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4777 Type *EltTy = CDS->getElementType();
4778 bool IsInteger = EltTy->isIntegerTy();
4779 bool IsFP =
4780 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4781 if (!IsInteger && !IsFP)
4782 return false;
4783 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4784 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4785 if (IsInteger)
4786 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4787 else
4788 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4789 I * EltBits);
4790 return true;
4791 }
4792 return false;
4793 };
4794
4795 // Handle UNDEFs.
4796 if (Op.isUndef()) {
4797 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4798 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4799 return CastBitData(UndefSrcElts, SrcEltBits);
4800 }
4801
4802 // Extract scalar constant bits.
4803 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
4804 APInt UndefSrcElts = APInt::getZero(1);
4805 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
4806 return CastBitData(UndefSrcElts, SrcEltBits);
4807 }
4808 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
4809 APInt UndefSrcElts = APInt::getZero(1);
4810 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
4811 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
4812 return CastBitData(UndefSrcElts, SrcEltBits);
4813 }
4814
4815 // Extract constant bits from build vector.
4816 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
4817 BitVector Undefs;
4818 SmallVector<APInt> SrcEltBits;
4819 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4820 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
4821 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
4822 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
4823 if (Undefs[I])
4824 UndefSrcElts.setBit(I);
4825 return CastBitData(UndefSrcElts, SrcEltBits);
4826 }
4827 }
4828
4829 // Extract constant bits from constant pool vector.
4830 if (auto *Cst = getTargetConstantFromNode(Op)) {
4831 Type *CstTy = Cst->getType();
4832 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4833 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
4834 return false;
4835
4836 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
4837 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4838 if ((SizeInBits % SrcEltSizeInBits) != 0)
4839 return false;
4840
4841 APInt UndefSrcElts(NumSrcElts, 0);
4842 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
4843 for (unsigned i = 0; i != NumSrcElts; ++i)
4844 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
4845 UndefSrcElts, i))
4846 return false;
4847
4848 return CastBitData(UndefSrcElts, SrcEltBits);
4849 }
4850
4851 // Extract constant bits from a broadcasted constant pool scalar.
4852 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
4853 EltSizeInBits <= VT.getScalarSizeInBits()) {
4854 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4855 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
4856 return false;
4857
4858 SDValue Ptr = MemIntr->getBasePtr();
4859 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
4860 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4861 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4862
4863 APInt UndefSrcElts(NumSrcElts, 0);
4864 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
4865 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
4866 if (UndefSrcElts[0])
4867 UndefSrcElts.setBits(0, NumSrcElts);
4868 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
4869 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
4870 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
4871 return CastBitData(UndefSrcElts, SrcEltBits);
4872 }
4873 }
4874 }
4875
4876 // Extract constant bits from a subvector broadcast.
4877 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
4878 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
4879 SDValue Ptr = MemIntr->getBasePtr();
4880 // The source constant may be larger than the subvector broadcast,
4881 // ensure we extract the correct subvector constants.
4882 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
4883 Type *CstTy = Cst->getType();
4884 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
4885 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
4886 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
4887 (SizeInBits % SubVecSizeInBits) != 0)
4888 return false;
4889 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
4890 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
4891 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
4892 APInt UndefSubElts(NumSubElts, 0);
4893 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
4894 APInt(CstEltSizeInBits, 0));
4895 for (unsigned i = 0; i != NumSubElts; ++i) {
4896 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
4897 UndefSubElts, i))
4898 return false;
4899 for (unsigned j = 1; j != NumSubVecs; ++j)
4900 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
4901 }
4902 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
4903 UndefSubElts);
4904 return CastBitData(UndefSubElts, SubEltBits);
4905 }
4906 }
4907
4908 // Extract a rematerialized scalar constant insertion.
4909 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
4910 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
4911 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
4912 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4913 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
4914
4915 APInt UndefSrcElts(NumSrcElts, 0);
4916 SmallVector<APInt, 64> SrcEltBits;
4917 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
4918 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
4919 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
4920 return CastBitData(UndefSrcElts, SrcEltBits);
4921 }
4922
4923 // Insert constant bits from a base and sub vector sources.
4924 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
4925 // If we bitcast to larger elements we might lose track of undefs - don't
4926 // allow any, to be safe.
4927 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
4928 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
4929
4930 APInt UndefSrcElts, UndefSubElts;
4931 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
4932 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
4933 UndefSubElts, EltSubBits,
4934 AllowWholeUndefs && AllowUndefs,
4935 AllowPartialUndefs && AllowUndefs) &&
4936 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
4937 UndefSrcElts, EltSrcBits,
4938 AllowWholeUndefs && AllowUndefs,
4939 AllowPartialUndefs && AllowUndefs)) {
4940 unsigned BaseIdx = Op.getConstantOperandVal(2);
4941 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
4942 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
4943 EltSrcBits[BaseIdx + i] = EltSubBits[i];
4944 return CastBitData(UndefSrcElts, EltSrcBits);
4945 }
4946 }
4947
4948 // Extract constant bits from a subvector's source.
4949 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
4950 // TODO - support extract_subvector through bitcasts.
4951 if (EltSizeInBits != VT.getScalarSizeInBits())
4952 return false;
4953
4954 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4955 UndefElts, EltBits, AllowWholeUndefs,
4956 AllowPartialUndefs)) {
4957 EVT SrcVT = Op.getOperand(0).getValueType();
4958 unsigned NumSrcElts = SrcVT.getVectorNumElements();
4959 unsigned NumSubElts = VT.getVectorNumElements();
4960 unsigned BaseIdx = Op.getConstantOperandVal(1);
4961 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
4962 if ((BaseIdx + NumSubElts) != NumSrcElts)
4963 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
4964 if (BaseIdx != 0)
4965 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
4966 return true;
4967 }
4968 }
4969
4970 // Extract constant bits from shuffle node sources.
4971 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
4972 // TODO - support shuffle through bitcasts.
4973 if (EltSizeInBits != VT.getScalarSizeInBits())
4974 return false;
4975
4976 ArrayRef<int> Mask = SVN->getMask();
4977 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
4978 llvm::any_of(Mask, [](int M) { return M < 0; }))
4979 return false;
4980
4981 APInt UndefElts0, UndefElts1;
4982 SmallVector<APInt, 32> EltBits0, EltBits1;
4983 if (isAnyInRange(Mask, 0, NumElts) &&
4984 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
4985 UndefElts0, EltBits0, AllowWholeUndefs,
4986 AllowPartialUndefs))
4987 return false;
4988 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
4989 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
4990 UndefElts1, EltBits1, AllowWholeUndefs,
4991 AllowPartialUndefs))
4992 return false;
4993
4994 UndefElts = APInt::getZero(NumElts);
4995 for (int i = 0; i != (int)NumElts; ++i) {
4996 int M = Mask[i];
4997 if (M < 0) {
4998 UndefElts.setBit(i);
4999 EltBits.push_back(APInt::getZero(EltSizeInBits));
5000 } else if (M < (int)NumElts) {
5001 if (UndefElts0[M])
5002 UndefElts.setBit(i);
5003 EltBits.push_back(EltBits0[M]);
5004 } else {
5005 if (UndefElts1[M - NumElts])
5006 UndefElts.setBit(i);
5007 EltBits.push_back(EltBits1[M - NumElts]);
5008 }
5009 }
5010 return true;
5011 }
5012
5013 return false;
5014}
5015
5016namespace llvm {
5017namespace X86 {
5018bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5019 APInt UndefElts;
5020 SmallVector<APInt, 16> EltBits;
5021 if (getTargetConstantBitsFromNode(
5022 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5023 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5024 int SplatIndex = -1;
5025 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5026 if (UndefElts[i])
5027 continue;
5028 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5029 SplatIndex = -1;
5030 break;
5031 }
5032 SplatIndex = i;
5033 }
5034 if (0 <= SplatIndex) {
5035 SplatVal = EltBits[SplatIndex];
5036 return true;
5037 }
5038 }
5039
5040 return false;
5041}
5042} // namespace X86
5043} // namespace llvm
5044
5045 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5046 unsigned MaskEltSizeInBits,
5047 SmallVectorImpl<uint64_t> &RawMask,
5048 APInt &UndefElts) {
5049 // Extract the raw target constant bits.
5050 SmallVector<APInt, 64> EltBits;
5051 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5052 EltBits, /* AllowWholeUndefs */ true,
5053 /* AllowPartialUndefs */ false))
5054 return false;
5055
5056 // Insert the extracted elements into the mask.
5057 for (const APInt &Elt : EltBits)
5058 RawMask.push_back(Elt.getZExtValue());
5059
5060 return true;
5061}
5062
5063// Match not(xor X, -1) -> X.
5064// Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5065// Match not(extract_subvector(xor X, -1)) -> extract_subvector(X).
5066// Match not(concat_vectors(xor X, -1, xor Y, -1)) -> concat_vectors(X, Y).
5067 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5068 V = peekThroughBitcasts(V);
5069 if (V.getOpcode() == ISD::XOR &&
5070 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5071 isAllOnesConstant(V.getOperand(1))))
5072 return V.getOperand(0);
5073 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5074 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5075 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5076 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5077 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
5078 Not, V.getOperand(1));
5079 }
5080 }
5081 if (V.getOpcode() == X86ISD::PCMPGT &&
5082 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5083 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5084 V.getOperand(0).hasOneUse()) {
5085 APInt UndefElts;
5086 SmallVector<APInt> EltBits;
5087 if (getTargetConstantBitsFromNode(V.getOperand(0),
5088 V.getScalarValueSizeInBits(), UndefElts,
5089 EltBits)) {
5090 // Don't fold min_signed_value -> (min_signed_value - 1)
5091 bool MinSigned = false;
5092 for (APInt &Elt : EltBits) {
5093 MinSigned |= Elt.isMinSignedValue();
5094 Elt -= 1;
5095 }
5096 if (!MinSigned) {
5097 SDLoc DL(V);
5098 MVT VT = V.getSimpleValueType();
5099 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5100 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5101 }
5102 }
5103 }
5104 SmallVector<SDValue, 2> CatOps;
5105 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5106 for (SDValue &CatOp : CatOps) {
5107 SDValue NotCat = IsNOT(CatOp, DAG);
5108 if (!NotCat) return SDValue();
5109 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5110 }
5111 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
5112 }
5113 return SDValue();
5114}
5115
5116/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5117/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5118/// Note: This ignores saturation, so inputs must be checked first.
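/// e.g. a single-stage binary pack on a 128-bit v8i16 type gives the mask
/// <0, 2, 4, 6, 8, 10, 12, 14>.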
5119 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5120 bool Unary, unsigned NumStages = 1) {
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 unsigned NumElts = VT.getVectorNumElements();
5123 unsigned NumLanes = VT.getSizeInBits() / 128;
5124 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5125 unsigned Offset = Unary ? 0 : NumElts;
5126 unsigned Repetitions = 1u << (NumStages - 1);
5127 unsigned Increment = 1u << NumStages;
5128 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5129
5130 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5131 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5132 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5133 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5134 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5135 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5136 }
5137 }
5138}
5139
5140// Split the demanded elts of a PACKSS/PACKUS node between its operands.
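// e.g. for a 256-bit v32i8 PACK, result elts [0,7] map to LHS[0,7],
// [8,15] to RHS[0,7], [16,23] to LHS[8,15] and [24,31] to RHS[8,15].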
5141static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5142 APInt &DemandedLHS, APInt &DemandedRHS) {
5143 int NumLanes = VT.getSizeInBits() / 128;
5144 int NumElts = DemandedElts.getBitWidth();
5145 int NumInnerElts = NumElts / 2;
5146 int NumEltsPerLane = NumElts / NumLanes;
5147 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5148
5149 DemandedLHS = APInt::getZero(NumInnerElts);
5150 DemandedRHS = APInt::getZero(NumInnerElts);
5151
5152 // Map DemandedElts to the packed operands.
5153 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5154 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5155 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5156 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5157 if (DemandedElts[OuterIdx])
5158 DemandedLHS.setBit(InnerIdx);
5159 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5160 DemandedRHS.setBit(InnerIdx);
5161 }
5162 }
5163}
5164
5165// Split the demanded elts of a HADD/HSUB node between its operands.
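// e.g. for a v8i32 HADD, demanding result elt 0 demands LHS elts 0-1,
// elt 2 demands RHS elts 0-1, and elt 4 demands LHS elts 4-5 (upper lane).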
5166static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5167 APInt &DemandedLHS, APInt &DemandedRHS) {
5168 int NumLanes = VT.getSizeInBits() / 128;
5169 int NumElts = DemandedElts.getBitWidth();
5170 int NumEltsPerLane = NumElts / NumLanes;
5171 int HalfEltsPerLane = NumEltsPerLane / 2;
5172
5173 DemandedLHS = APInt::getZero(NumElts);
5174 DemandedRHS = APInt::getZero(NumElts);
5175
5176 // Map DemandedElts to the horizontal operands.
5177 for (int Idx = 0; Idx != NumElts; ++Idx) {
5178 if (!DemandedElts[Idx])
5179 continue;
5180 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
5181 int LocalIdx = Idx % NumEltsPerLane;
5182 if (LocalIdx < HalfEltsPerLane) {
5183 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5184 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5185 } else {
5186 LocalIdx -= HalfEltsPerLane;
5187 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
5188 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
5189 }
5190 }
5191}
5192
5193/// Calculates the shuffle mask corresponding to the target-specific opcode.
5194/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5195/// operands in \p Ops, and returns true.
5196/// Sets \p IsUnary to true if only one source is used. Note that this will set
5197/// IsUnary for shuffles which use a single input multiple times, and in those
5198/// cases it will adjust the mask to only have indices within that single input.
5199/// It is an error to call this with non-empty Mask/Ops vectors.
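/// e.g. an X86ISD::UNPCKL v4i32 whose two operands are the same node decodes
/// to Mask == <0, 0, 1, 1> with IsUnary == true.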
5200static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5201 SmallVectorImpl<SDValue> &Ops,
5202 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5203 if (!isTargetShuffle(N.getOpcode()))
5204 return false;
5205
5206 MVT VT = N.getSimpleValueType();
5207 unsigned NumElems = VT.getVectorNumElements();
5208 unsigned MaskEltSize = VT.getScalarSizeInBits();
5209 SmallVector<uint64_t, 32> RawMask;
5210 APInt RawUndefs;
5211 uint64_t ImmN;
5212
5213 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5214 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5215
5216 IsUnary = false;
5217 bool IsFakeUnary = false;
5218 switch (N.getOpcode()) {
5219 case X86ISD::BLENDI:
5220 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5221 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5222 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5223 DecodeBLENDMask(NumElems, ImmN, Mask);
5224 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5225 break;
5226 case X86ISD::SHUFP:
5227 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5228 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5229 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5230 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5231 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5232 break;
5233 case X86ISD::INSERTPS:
5234 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5235 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5236 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5237 DecodeINSERTPSMask(ImmN, Mask);
5238 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5239 break;
5240 case X86ISD::EXTRQI:
5241 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5242 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5243 isa<ConstantSDNode>(N.getOperand(2))) {
5244 int BitLen = N.getConstantOperandVal(1);
5245 int BitIdx = N.getConstantOperandVal(2);
5246 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5247 IsUnary = true;
5248 }
5249 break;
5250 case X86ISD::INSERTQI:
5251 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5252 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5253 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5254 isa<ConstantSDNode>(N.getOperand(3))) {
5255 int BitLen = N.getConstantOperandVal(2);
5256 int BitIdx = N.getConstantOperandVal(3);
5257 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5258 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5259 }
5260 break;
5261 case X86ISD::UNPCKH:
5262 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5263 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5264 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5265 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5266 break;
5267 case X86ISD::UNPCKL:
5268 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5269 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5270 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5271 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5272 break;
5273 case X86ISD::MOVHLPS:
5274 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5275 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5276 DecodeMOVHLPSMask(NumElems, Mask);
5277 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5278 break;
5279 case X86ISD::MOVLHPS:
5280 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5281 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5282 DecodeMOVLHPSMask(NumElems, Mask);
5283 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5284 break;
5285 case X86ISD::VALIGN:
5286 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5287 "Only 32-bit and 64-bit elements are supported!");
5288 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5289 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5290 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5291 DecodeVALIGNMask(NumElems, ImmN, Mask);
5292 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5293 Ops.push_back(N.getOperand(1));
5294 Ops.push_back(N.getOperand(0));
5295 break;
5296 case X86ISD::PALIGNR:
5297 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5298 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5299 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5300 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5301 DecodePALIGNRMask(NumElems, ImmN, Mask);
5302 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5303 Ops.push_back(N.getOperand(1));
5304 Ops.push_back(N.getOperand(0));
5305 break;
5306 case X86ISD::VSHLDQ:
5307 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5308 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5309 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5310 DecodePSLLDQMask(NumElems, ImmN, Mask);
5311 IsUnary = true;
5312 break;
5313 case X86ISD::VSRLDQ:
5314 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5315 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5316 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5317 DecodePSRLDQMask(NumElems, ImmN, Mask);
5318 IsUnary = true;
5319 break;
5320 case X86ISD::PSHUFD:
5321 case X86ISD::VPERMILPI:
5322 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5323 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5324 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5325 IsUnary = true;
5326 break;
5327 case X86ISD::PSHUFHW:
5328 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5329 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5330 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5331 IsUnary = true;
5332 break;
5333 case X86ISD::PSHUFLW:
5334 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5335 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5336 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5337 IsUnary = true;
5338 break;
5339 case X86ISD::VZEXT_MOVL:
5340 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5341 DecodeZeroMoveLowMask(NumElems, Mask);
5342 IsUnary = true;
5343 break;
5344 case X86ISD::VBROADCAST:
5345 // We only decode broadcasts of same-sized vectors, peeking through to
5346 // extracted subvectors is likely to cause hasOneUse issues with
5347 // SimplifyDemandedBits etc.
5348 if (N.getOperand(0).getValueType() == VT) {
5349 DecodeVectorBroadcast(NumElems, Mask);
5350 IsUnary = true;
5351 break;
5352 }
5353 return false;
5354 case X86ISD::VPERMILPV: {
5355 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5356 IsUnary = true;
5357 SDValue MaskNode = N.getOperand(1);
5358 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5359 RawUndefs)) {
5360 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5361 break;
5362 }
5363 return false;
5364 }
5365 case X86ISD::PSHUFB: {
5366 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5367 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5368 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5369 IsUnary = true;
5370 SDValue MaskNode = N.getOperand(1);
5371 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5372 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5373 break;
5374 }
5375 return false;
5376 }
5377 case X86ISD::VPERMI:
5378 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5379 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5380 DecodeVPERMMask(NumElems, ImmN, Mask);
5381 IsUnary = true;
5382 break;
5383 case X86ISD::MOVSS:
5384 case X86ISD::MOVSD:
5385 case X86ISD::MOVSH:
5386 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5387 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5388 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5389 break;
5390 case X86ISD::VPERM2X128:
5391 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5392 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5393 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5394 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5395 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5396 break;
5397 case X86ISD::SHUF128:
5398 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5399 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5400 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5401 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5402 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5403 break;
5404 case X86ISD::MOVSLDUP:
5405 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5406 DecodeMOVSLDUPMask(NumElems, Mask);
5407 IsUnary = true;
5408 break;
5409 case X86ISD::MOVSHDUP:
5410 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5411 DecodeMOVSHDUPMask(NumElems, Mask);
5412 IsUnary = true;
5413 break;
5414 case X86ISD::MOVDDUP:
5415 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5416 DecodeMOVDDUPMask(NumElems, Mask);
5417 IsUnary = true;
5418 break;
5419 case X86ISD::VPERMIL2: {
5420 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5421 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5422 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5423 SDValue MaskNode = N.getOperand(2);
5424 SDValue CtrlNode = N.getOperand(3);
5425 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5426 unsigned CtrlImm = CtrlOp->getZExtValue();
5427 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5428 RawUndefs)) {
5429 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5430 Mask);
5431 break;
5432 }
5433 }
5434 return false;
5435 }
5436 case X86ISD::VPPERM: {
5437 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5438 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5439 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5440 SDValue MaskNode = N.getOperand(2);
5441 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5442 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5443 break;
5444 }
5445 return false;
5446 }
5447 case X86ISD::VPERMV: {
5448 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5449 IsUnary = true;
5450 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5451 Ops.push_back(N.getOperand(1));
5452 SDValue MaskNode = N.getOperand(0);
5453 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5454 RawUndefs)) {
5455 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5456 break;
5457 }
5458 return false;
5459 }
5460 case X86ISD::VPERMV3: {
5461 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5462 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5463 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5464 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5465 Ops.push_back(N.getOperand(0));
5466 Ops.push_back(N.getOperand(2));
5467 SDValue MaskNode = N.getOperand(1);
5468 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5469 RawUndefs)) {
5470 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5471 break;
5472 }
5473 return false;
5474 }
5475 default:
5476 llvm_unreachable("unknown target shuffle node");
5477 }
5478
5479 // Empty mask indicates the decode failed.
5480 if (Mask.empty())
5481 return false;
5482
5483 // Check if we're getting a shuffle mask with zero'd elements.
5484 if (!AllowSentinelZero && isAnyZero(Mask))
5485 return false;
5486
5487 // If we have a fake unary shuffle, the shuffle mask is spread across two
5488 // inputs that are actually the same node. Re-map the mask to always point
5489 // into the first input.
5490 if (IsFakeUnary)
5491 for (int &M : Mask)
5492 if (M >= (int)Mask.size())
5493 M -= Mask.size();
5494
5495 // If we didn't already add operands in the opcode-specific code, default to
5496 // adding 1 or 2 operands starting at 0.
5497 if (Ops.empty()) {
5498 Ops.push_back(N.getOperand(0));
5499 if (!IsUnary || IsFakeUnary)
5500 Ops.push_back(N.getOperand(1));
5501 }
5502
5503 return true;
5504}
5505
5506 // Wrapper for getTargetShuffleMask that discards the IsUnary result.
5507static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5508 SmallVectorImpl<SDValue> &Ops,
5509 SmallVectorImpl<int> &Mask) {
5510 bool IsUnary;
5511 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5512}
5513
5514/// Compute whether each element of a shuffle is zeroable.
5515///
5516/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5517/// Either it is an undef element in the shuffle mask, the element of the input
5518/// referenced is undef, or the element of the input referenced is known to be
5519/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5520/// as many lanes with this technique as possible to simplify the remaining
5521/// shuffle.
5522 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5523 SDValue V1, SDValue V2,
5524 APInt &KnownUndef, APInt &KnownZero) {
5525 int Size = Mask.size();
5526 KnownUndef = KnownZero = APInt::getZero(Size);
5527
5528 V1 = peekThroughBitcasts(V1);
5529 V2 = peekThroughBitcasts(V2);
5530
5531 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5532 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5533
5534 int VectorSizeInBits = V1.getValueSizeInBits();
5535 int ScalarSizeInBits = VectorSizeInBits / Size;
5536 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5537
5538 for (int i = 0; i < Size; ++i) {
5539 int M = Mask[i];
5540 // Handle the easy cases.
5541 if (M < 0) {
5542 KnownUndef.setBit(i);
5543 continue;
5544 }
5545 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5546 KnownZero.setBit(i);
5547 continue;
5548 }
5549
5550 // Determine shuffle input and normalize the mask.
5551 SDValue V = M < Size ? V1 : V2;
5552 M %= Size;
5553
5554 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5555 if (V.getOpcode() != ISD::BUILD_VECTOR)
5556 continue;
5557
5558 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5559 // the (larger) source element must be UNDEF/ZERO.
5560 if ((Size % V.getNumOperands()) == 0) {
5561 int Scale = Size / V->getNumOperands();
5562 SDValue Op = V.getOperand(M / Scale);
5563 if (Op.isUndef())
5564 KnownUndef.setBit(i);
5565 if (X86::isZeroNode(Op))
5566 KnownZero.setBit(i);
5567 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5568 APInt Val = Cst->getAPIntValue();
5569 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5570 if (Val == 0)
5571 KnownZero.setBit(i);
5572 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5573 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5574 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5575 if (Val == 0)
5576 KnownZero.setBit(i);
5577 }
5578 continue;
5579 }
5580
5581 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5582 // elements must be UNDEF or ZERO.
5583 if ((V.getNumOperands() % Size) == 0) {
5584 int Scale = V->getNumOperands() / Size;
5585 bool AllUndef = true;
5586 bool AllZero = true;
5587 for (int j = 0; j < Scale; ++j) {
5588 SDValue Op = V.getOperand((M * Scale) + j);
5589 AllUndef &= Op.isUndef();
5590 AllZero &= X86::isZeroNode(Op);
5591 }
5592 if (AllUndef)
5593 KnownUndef.setBit(i);
5594 if (AllZero)
5595 KnownZero.setBit(i);
5596 continue;
5597 }
5598 }
5599}
5600
5601/// Decode a target shuffle mask and inputs and see if any values are
5602/// known to be undef or zero from their inputs.
5603/// Returns true if the target shuffle mask was decoded.
5604/// FIXME: Merge this with computeZeroableShuffleElements?
5605 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5606 SmallVectorImpl<SDValue> &Ops,
5607 APInt &KnownUndef, APInt &KnownZero) {
5608 bool IsUnary;
5609 if (!isTargetShuffle(N.getOpcode()))
5610 return false;
5611
5612 MVT VT = N.getSimpleValueType();
5613 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5614 return false;
5615
5616 int Size = Mask.size();
5617 SDValue V1 = Ops[0];
5618 SDValue V2 = IsUnary ? V1 : Ops[1];
5619 KnownUndef = KnownZero = APInt::getZero(Size);
5620
5621 V1 = peekThroughBitcasts(V1);
5622 V2 = peekThroughBitcasts(V2);
5623
5624 assert((VT.getSizeInBits() % Size) == 0 &&
5625 "Illegal split of shuffle value type");
5626 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5627
5628 // Extract known constant input data.
5629 APInt UndefSrcElts[2];
5630 SmallVector<APInt, 32> SrcEltBits[2];
5631 bool IsSrcConstant[2] = {
5632 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5633 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5634 /*AllowPartialUndefs*/ false),
5635 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5636 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5637 /*AllowPartialUndefs*/ false)};
5638
5639 for (int i = 0; i < Size; ++i) {
5640 int M = Mask[i];
5641
5642 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5643 if (M < 0) {
5644 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5645 if (SM_SentinelUndef == M)
5646 KnownUndef.setBit(i);
5647 if (SM_SentinelZero == M)
5648 KnownZero.setBit(i);
5649 continue;
5650 }
5651
5652 // Determine shuffle input and normalize the mask.
5653 unsigned SrcIdx = M / Size;
5654 SDValue V = M < Size ? V1 : V2;
5655 M %= Size;
5656
5657 // We are referencing an UNDEF input.
5658 if (V.isUndef()) {
5659 KnownUndef.setBit(i);
5660 continue;
5661 }
5662
5663 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5664 // TODO: We currently only set UNDEF for integer types - floats use the same
5665 // registers as vectors and many of the scalar folded loads rely on the
5666 // SCALAR_TO_VECTOR pattern.
5667 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5668 (Size % V.getValueType().getVectorNumElements()) == 0) {
5669 int Scale = Size / V.getValueType().getVectorNumElements();
5670 int Idx = M / Scale;
5671 if (Idx != 0 && !VT.isFloatingPoint())
5672 KnownUndef.setBit(i);
5673 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5674 KnownZero.setBit(i);
5675 continue;
5676 }
5677
5678 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5679 // base vectors.
5680 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5681 SDValue Vec = V.getOperand(0);
5682 int NumVecElts = Vec.getValueType().getVectorNumElements();
5683 if (Vec.isUndef() && Size == NumVecElts) {
5684 int Idx = V.getConstantOperandVal(2);
5685 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5686 if (M < Idx || (Idx + NumSubElts) <= M)
5687 KnownUndef.setBit(i);
5688 }
5689 continue;
5690 }
5691
5692 // Attempt to extract from the source's constant bits.
5693 if (IsSrcConstant[SrcIdx]) {
5694 if (UndefSrcElts[SrcIdx][M])
5695 KnownUndef.setBit(i);
5696 else if (SrcEltBits[SrcIdx][M] == 0)
5697 KnownZero.setBit(i);
5698 }
5699 }
5700
5701 assert(VT.getVectorNumElements() == (unsigned)Size &&
5702 "Different mask size from vector size!");
5703 return true;
5704}
5705
5706// Replace target shuffle mask elements with known undef/zero sentinels.
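// e.g. with Mask = {0, 1, 2, 3}, KnownUndef bit 1 set and KnownZero bit 3 set,
// the mask becomes {0, SM_SentinelUndef, 2, SM_SentinelZero}.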
5707 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5708 const APInt &KnownUndef,
5709 const APInt &KnownZero,
5710 bool ResolveKnownZeros = true) {
5711 unsigned NumElts = Mask.size();
5712 assert(KnownUndef.getBitWidth() == NumElts &&
5713 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5714
5715 for (unsigned i = 0; i != NumElts; ++i) {
5716 if (KnownUndef[i])
5717 Mask[i] = SM_SentinelUndef;
5718 else if (ResolveKnownZeros && KnownZero[i])
5719 Mask[i] = SM_SentinelZero;
5720 }
5721}
5722
5723// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5724 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5725 APInt &KnownUndef,
5726 APInt &KnownZero) {
5727 unsigned NumElts = Mask.size();
5728 KnownUndef = KnownZero = APInt::getZero(NumElts);
5729
5730 for (unsigned i = 0; i != NumElts; ++i) {
5731 int M = Mask[i];
5732 if (SM_SentinelUndef == M)
5733 KnownUndef.setBit(i);
5734 if (SM_SentinelZero == M)
5735 KnownZero.setBit(i);
5736 }
5737}
5738
5739// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
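// e.g. for a v4i32 VSELECT with constant condition <-1, 0, -1, 0> this builds
// the blend mask {0, 5, 2, 7}: lanes with a zero (or, for BLENDV, non-negative)
// condition element select from the second operand.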
5740 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5741 SDValue Cond, bool IsBLENDV = false) {
5742 EVT CondVT = Cond.getValueType();
5743 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5744 unsigned NumElts = CondVT.getVectorNumElements();
5745
5746 APInt UndefElts;
5747 SmallVector<APInt, 32> EltBits;
5748 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5749 /*AllowWholeUndefs*/ true,
5750 /*AllowPartialUndefs*/ false))
5751 return false;
5752
5753 Mask.resize(NumElts, SM_SentinelUndef);
5754
5755 for (int i = 0; i != (int)NumElts; ++i) {
5756 Mask[i] = i;
5757 // Arbitrarily choose from the 2nd operand if the select condition element
5758 // is undef.
5759 // TODO: Can we do better by matching patterns such as even/odd?
5760 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5761 (IsBLENDV && EltBits[i].isNonNegative()))
5762 Mask[i] += NumElts;
5763 }
5764
5765 return true;
5766}
5767
5768// Forward declaration (for getFauxShuffleMask recursive check).
5769static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5770 SmallVectorImpl<SDValue> &Inputs,
5771 SmallVectorImpl<int> &Mask,
5772 const SelectionDAG &DAG, unsigned Depth,
5773 bool ResolveKnownElts);
5774
5775// Attempt to decode ops that could be represented as a shuffle mask.
5776 // The decoded shuffle mask may contain a different number of elements from the
5777 // destination value type.
5778// TODO: Merge into getTargetShuffleInputs()
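// e.g. a v16i8 AND with the constant <255, 0, 255, 0, ...> decodes to the
// single-input byte shuffle {0, SM_SentinelZero, 2, SM_SentinelZero, ...}, and
// a whole-byte X86ISD::VSHLI becomes a byte shuffle that shifts in zero lanes.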
5779static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5780 SmallVectorImpl<int> &Mask,
5781 SmallVectorImpl<SDValue> &Ops,
5782 const SelectionDAG &DAG, unsigned Depth,
5783 bool ResolveKnownElts) {
5784 Mask.clear();
5785 Ops.clear();
5786
5787 MVT VT = N.getSimpleValueType();
5788 unsigned NumElts = VT.getVectorNumElements();
5789 unsigned NumSizeInBits = VT.getSizeInBits();
5790 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5791 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
5792 return false;
5793 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
5794 unsigned NumSizeInBytes = NumSizeInBits / 8;
5795 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5796
5797 unsigned Opcode = N.getOpcode();
5798 switch (Opcode) {
5799 case ISD::VECTOR_SHUFFLE: {
5800 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5801 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5802 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5803 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5804 Ops.push_back(N.getOperand(0));
5805 Ops.push_back(N.getOperand(1));
5806 return true;
5807 }
5808 return false;
5809 }
5810 case ISD::AND:
5811 case X86ISD::ANDNP: {
5812 // Attempt to decode as a per-byte mask.
5813 APInt UndefElts;
5814 SmallVector<APInt, 32> EltBits;
5815 SDValue N0 = N.getOperand(0);
5816 SDValue N1 = N.getOperand(1);
5817 bool IsAndN = (X86ISD::ANDNP == Opcode);
5818 uint64_t ZeroMask = IsAndN ? 255 : 0;
5819 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
5820 /*AllowWholeUndefs*/ false,
5821 /*AllowPartialUndefs*/ false))
5822 return false;
5823 // We can't assume an undef src element gives an undef dst - the other src
5824 // might be zero.
5825 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
5826 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5827 const APInt &ByteBits = EltBits[i];
5828 if (ByteBits != 0 && ByteBits != 255)
5829 return false;
5830 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5831 }
5832 Ops.push_back(IsAndN ? N1 : N0);
5833 return true;
5834 }
5835 case ISD::OR: {
5836 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
5837 // is a valid shuffle index.
5838 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
5839 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
5840 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
5841 return false;
5842
5843 SmallVector<int, 64> SrcMask0, SrcMask1;
5844 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
5845 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
5846 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
5847 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
5848 Depth + 1, true) ||
5849 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
5850 Depth + 1, true))
5851 return false;
5852
5853 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
5854 SmallVector<int, 64> Mask0, Mask1;
5855 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
5856 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
5857 for (int i = 0; i != (int)MaskSize; ++i) {
5858 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
5859 // loops converting between OR and BLEND shuffles due to
5860 // canWidenShuffleElements merging away undef elements, meaning we
5861 // fail to recognise the OR as the undef element isn't known zero.
5862 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
5863 Mask.push_back(SM_SentinelZero);
5864 else if (Mask1[i] == SM_SentinelZero)
5865 Mask.push_back(i);
5866 else if (Mask0[i] == SM_SentinelZero)
5867 Mask.push_back(i + MaskSize);
5868 else
5869 return false;
5870 }
5871 Ops.push_back(N0);
5872 Ops.push_back(N1);
5873 return true;
5874 }
5875 case ISD::INSERT_SUBVECTOR: {
5876 SDValue Src = N.getOperand(0);
5877 SDValue Sub = N.getOperand(1);
5878 EVT SubVT = Sub.getValueType();
5879 unsigned NumSubElts = SubVT.getVectorNumElements();
5880 if (!N->isOnlyUserOf(Sub.getNode()))
5881 return false;
5882 SDValue SubBC = peekThroughBitcasts(Sub);
5883 uint64_t InsertIdx = N.getConstantOperandVal(2);
5884 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
5885 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5886 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
5887 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
5888 SDValue SubBCSrc = SubBC.getOperand(0);
5889 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
5890 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
5891 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
5892 "Subvector valuetype mismatch");
5893 InsertIdx *= (MaxElts / NumElts);
5894 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
5895 NumSubElts *= (MaxElts / NumElts);
5896 bool SrcIsUndef = Src.isUndef();
5897 for (int i = 0; i != (int)MaxElts; ++i)
5898 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
5899 for (int i = 0; i != (int)NumSubElts; ++i)
5900 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
5901 if (!SrcIsUndef)
5902 Ops.push_back(Src);
5903 Ops.push_back(SubBCSrc);
5904 return true;
5905 }
5906 // Handle CONCAT(SUB0, SUB1).
5907 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
5908 // cross lane shuffles.
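// e.g. v8i64 insert_subvector(insert_subvector(undef, A, 0), B, 4) decodes to
// the concat shuffle {0, 1, 2, 3, 8, 9, 10, 11} over the inputs A and B.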
5909 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
5910 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
5911 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
5912 Src.getOperand(0).isUndef() &&
5913 Src.getOperand(1).getValueType() == SubVT &&
5914 Src.getConstantOperandVal(2) == 0) {
5915 for (int i = 0; i != (int)NumSubElts; ++i)
5916 Mask.push_back(i);
5917 for (int i = 0; i != (int)NumSubElts; ++i)
5918 Mask.push_back(i + NumElts);
5919 Ops.push_back(Src.getOperand(1));
5920 Ops.push_back(Sub);
5921 return true;
5922 }
5923 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
5924 SmallVector<int, 64> SubMask;
5925 SmallVector<SDValue, 2> SubInputs;
5926 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
5927 EVT SubSrcVT = SubSrc.getValueType();
5928 if (!SubSrcVT.isVector())
5929 return false;
5930
5931 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
5932 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
5933 Depth + 1, ResolveKnownElts))
5934 return false;
5935
5936 // Subvector shuffle inputs must not be larger than the subvector.
5937 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
5938 return SubVT.getFixedSizeInBits() <
5939 SubInput.getValueSizeInBits().getFixedValue();
5940 }))
5941 return false;
5942
5943 if (SubMask.size() != NumSubElts) {
5944 assert(((SubMask.size() % NumSubElts) == 0 ||
5945 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
5946 if ((NumSubElts % SubMask.size()) == 0) {
5947 int Scale = NumSubElts / SubMask.size();
5948 SmallVector<int,64> ScaledSubMask;
5949 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
5950 SubMask = ScaledSubMask;
5951 } else {
5952 int Scale = SubMask.size() / NumSubElts;
5953 NumSubElts = SubMask.size();
5954 NumElts *= Scale;
5955 InsertIdx *= Scale;
5956 }
5957 }
5958 Ops.push_back(Src);
5959 Ops.append(SubInputs.begin(), SubInputs.end());
5960 if (ISD::isBuildVectorAllZeros(Src.getNode()))
5961 Mask.append(NumElts, SM_SentinelZero);
5962 else
5963 for (int i = 0; i != (int)NumElts; ++i)
5964 Mask.push_back(i);
5965 for (int i = 0; i != (int)NumSubElts; ++i) {
5966 int M = SubMask[i];
5967 if (0 <= M) {
5968 int InputIdx = M / NumSubElts;
5969 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
5970 }
5971 Mask[i + InsertIdx] = M;
5972 }
5973 return true;
5974 }
5975 case X86ISD::PINSRB:
5976 case X86ISD::PINSRW:
5977 case ISD::SCALAR_TO_VECTOR:
5978 case ISD::INSERT_VECTOR_ELT: {
5979 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
5980 // vector, for matching src/dst vector types.
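// e.g. PINSRW(dst, extract_vector_elt(src, k), idx) becomes a byte-level
// blend of dst with the bytes of src element k placed at element idx, with any
// remaining (extended) bytes of the destination element marked as zero.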
5981 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
5982
5983 unsigned DstIdx = 0;
5984 if (Opcode != ISD::SCALAR_TO_VECTOR) {
5985 // Check we have an in-range constant insertion index.
5986 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
5987 N.getConstantOperandAPInt(2).uge(NumElts))
5988 return false;
5989 DstIdx = N.getConstantOperandVal(2);
5990
5991 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
5992 if (X86::isZeroNode(Scl)) {
5993 Ops.push_back(N.getOperand(0));
5994 for (unsigned i = 0; i != NumElts; ++i)
5995 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
5996 return true;
5997 }
5998 }
5999
6000 // Peek through trunc/aext/zext/bitcast.
6001 // TODO: aext shouldn't require SM_SentinelZero padding.
6002 // TODO: handle shift of scalars.
6003 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6004 while (Scl.getOpcode() == ISD::TRUNCATE ||
6005 Scl.getOpcode() == ISD::ANY_EXTEND ||
6006 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6007 (Scl.getOpcode() == ISD::BITCAST &&
6010 Scl = Scl.getOperand(0);
6011 MinBitsPerElt =
6012 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6013 }
6014 if ((MinBitsPerElt % 8) != 0)
6015 return false;
6016
6017 // Attempt to find the source vector the scalar was extracted from.
6018 SDValue SrcExtract;
6019 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6020 Scl.getOpcode() == X86ISD::PEXTRW ||
6021 Scl.getOpcode() == X86ISD::PEXTRB) &&
6022 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6023 SrcExtract = Scl;
6024 }
6025 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6026 return false;
6027
6028 SDValue SrcVec = SrcExtract.getOperand(0);
6029 EVT SrcVT = SrcVec.getValueType();
6030 if (!SrcVT.getScalarType().isByteSized())
6031 return false;
6032 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6033 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6034 unsigned DstByte = DstIdx * NumBytesPerElt;
6035 MinBitsPerElt =
6036 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6037
6038 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6039 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6040 Ops.push_back(SrcVec);
6041 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6042 } else {
6043 Ops.push_back(SrcVec);
6044 Ops.push_back(N.getOperand(0));
6045 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6046 Mask.push_back(NumSizeInBytes + i);
6047 }
6048
6049 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6050 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6051 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6052 Mask[DstByte + i] = SrcByte + i;
6053 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6054 Mask[DstByte + i] = SM_SentinelZero;
6055 return true;
6056 }
6057 case X86ISD::PACKSS:
6058 case X86ISD::PACKUS: {
6059 SDValue N0 = N.getOperand(0);
6060 SDValue N1 = N.getOperand(1);
6061 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6062 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6063 "Unexpected input value type");
6064
6065 APInt EltsLHS, EltsRHS;
6066 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6067
6068 // If we know input saturation won't happen (or we don't care about particular
6069 // lanes), we can treat this as a truncation shuffle.
6070 bool Offset0 = false, Offset1 = false;
6071 if (Opcode == X86ISD::PACKSS) {
6072 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6073 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6074 (!(N1.isUndef() || EltsRHS.isZero()) &&
6075 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6076 return false;
6077 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6078 // PACKSS then it was likely being used for sign-extension for a
6079 // truncation, so just peek through and adjust the mask accordingly.
6080 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6081 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6082 Offset0 = true;
6083 N0 = N0.getOperand(0);
6084 }
6085 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6086 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6087 Offset1 = true;
6088 N1 = N1.getOperand(0);
6089 }
6090 } else {
6091 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6092 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6093 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6094 (!(N1.isUndef() || EltsRHS.isZero()) &&
6095 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6096 return false;
6097 }
6098
6099 bool IsUnary = (N0 == N1);
6100
6101 Ops.push_back(N0);
6102 if (!IsUnary)
6103 Ops.push_back(N1);
6104
6105 createPackShuffleMask(VT, Mask, IsUnary);
6106
6107 if (Offset0 || Offset1) {
6108 for (int &M : Mask)
6109 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6110 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6111 ++M;
6112 }
6113 return true;
6114 }
6115 case ISD::VSELECT:
6116 case X86ISD::BLENDV: {
6117 SDValue Cond = N.getOperand(0);
6118 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6119 Ops.push_back(N.getOperand(1));
6120 Ops.push_back(N.getOperand(2));
6121 return true;
6122 }
6123 return false;
6124 }
6125 case X86ISD::VTRUNC: {
6126 SDValue Src = N.getOperand(0);
6127 EVT SrcVT = Src.getValueType();
6128 // Truncated source must be a simple vector.
6129 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6130 (SrcVT.getScalarSizeInBits() % 8) != 0)
6131 return false;
6132 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6133 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6134 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6135 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6136 for (unsigned i = 0; i != NumSrcElts; ++i)
6137 Mask.push_back(i * Scale);
6138 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6139 Ops.push_back(Src);
6140 return true;
6141 }
6142 case X86ISD::VSHLI:
6143 case X86ISD::VSRLI: {
6144 uint64_t ShiftVal = N.getConstantOperandVal(1);
6145 // Out of range bit shifts are guaranteed to be zero.
6146 if (NumBitsPerElt <= ShiftVal) {
6147 Mask.append(NumElts, SM_SentinelZero);
6148 return true;
6149 }
6150
6151 // We can only decode 'whole byte' bit shifts as shuffles.
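// e.g. a v2i64 VSHLI by 16 bits is the v16i8 byte shuffle
// {Z, Z, 0, 1, 2, 3, 4, 5, Z, Z, 8, 9, 10, 11, 12, 13} with Z = SM_SentinelZero.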
6152 if ((ShiftVal % 8) != 0)
6153 break;
6154
6155 uint64_t ByteShift = ShiftVal / 8;
6156 Ops.push_back(N.getOperand(0));
6157
6158 // Clear mask to all zeros and insert the shifted byte indices.
6159 Mask.append(NumSizeInBytes, SM_SentinelZero);
6160
6161 if (X86ISD::VSHLI == Opcode) {
6162 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6163 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6164 Mask[i + j] = i + j - ByteShift;
6165 } else {
6166 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6167 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6168 Mask[i + j - ByteShift] = i + j;
6169 }
6170 return true;
6171 }
6172 case X86ISD::VROTLI:
6173 case X86ISD::VROTRI: {
6174 // We can only decode 'whole byte' bit rotates as shuffles.
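// e.g. a v4i32 VROTRI by 8 rotates each 4-byte lane right by one byte, i.e.
// the byte shuffle {1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12}.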
6175 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6176 if ((RotateVal % 8) != 0)
6177 return false;
6178 Ops.push_back(N.getOperand(0));
6179 int Offset = RotateVal / 8;
6180 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6181 for (int i = 0; i != (int)NumElts; ++i) {
6182 int BaseIdx = i * NumBytesPerElt;
6183 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6184 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6185 }
6186 }
6187 return true;
6188 }
6189 case X86ISD::VBROADCAST: {
6190 SDValue Src = N.getOperand(0);
6191 if (!Src.getSimpleValueType().isVector()) {
6192 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6193 !isNullConstant(Src.getOperand(1)) ||
6194 Src.getOperand(0).getValueType().getScalarType() !=
6195 VT.getScalarType())
6196 return false;
6197 Src = Src.getOperand(0);
6198 }
6199 Ops.push_back(Src);
6200 Mask.append(NumElts, 0);
6201 return true;
6202 }
6204 SDValue Src = N.getOperand(0);
6205 EVT SrcVT = Src.getValueType();
6206 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6207
6208 // Extended source must be a simple vector.
6209 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6210 (NumBitsPerSrcElt % 8) != 0)
6211 return false;
6212
6213 // We can only handle all-signbits extensions.
6214 APInt DemandedSrcElts =
6215 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6216 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6217 return false;
6218
6219 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6220 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6221 for (unsigned I = 0; I != NumElts; ++I)
6222 Mask.append(Scale, I);
6223 Ops.push_back(Src);
6224 return true;
6225 }
6226 case ISD::ZERO_EXTEND:
6227 case ISD::ANY_EXTEND:
6228 case ISD::ZERO_EXTEND_VECTOR_INREG:
6229 case ISD::ANY_EXTEND_VECTOR_INREG: {
6230 SDValue Src = N.getOperand(0);
6231 EVT SrcVT = Src.getValueType();
6232
6233 // Extended source must be a simple vector.
6234 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6235 (SrcVT.getScalarSizeInBits() % 8) != 0)
6236 return false;
6237
6238 bool IsAnyExtend =
6239 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6240 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6241 IsAnyExtend, Mask);
6242 Ops.push_back(Src);
6243 return true;
6244 }
6245 }
6246
6247 return false;
6248}
6249
6250/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
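/// e.g. if Inputs = {A, A}, references into the second copy of A are remapped
/// into the first and the duplicate input is dropped from the list.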
6251 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6252 SmallVectorImpl<int> &Mask) {
6253 int MaskWidth = Mask.size();
6254 SmallVector<SDValue, 16> UsedInputs;
6255 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6256 int lo = UsedInputs.size() * MaskWidth;
6257 int hi = lo + MaskWidth;
6258
6259 // Strip UNDEF input usage.
6260 if (Inputs[i].isUndef())
6261 for (int &M : Mask)
6262 if ((lo <= M) && (M < hi))
6263 M = SM_SentinelUndef;
6264
6265 // Check for unused inputs.
6266 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6267 for (int &M : Mask)
6268 if (lo <= M)
6269 M -= MaskWidth;
6270 continue;
6271 }
6272
6273 // Check for repeated inputs.
6274 bool IsRepeat = false;
6275 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6276 if (UsedInputs[j] != Inputs[i])
6277 continue;
6278 for (int &M : Mask)
6279 if (lo <= M)
6280 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6281 IsRepeat = true;
6282 break;
6283 }
6284 if (IsRepeat)
6285 continue;
6286
6287 UsedInputs.push_back(Inputs[i]);
6288 }
6289 Inputs = UsedInputs;
6290}
6291
6292/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6293/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6294/// Returns true if the target shuffle mask was decoded.
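/// If the node is not a regular target shuffle, getFauxShuffleMask is tried so
/// that ops such as AND, OR, INSERT_SUBVECTOR or PACKSS/PACKUS that merely
/// behave like shuffles are decoded as well.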
6295static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6296 SmallVectorImpl<SDValue> &Inputs,
6297 SmallVectorImpl<int> &Mask,
6298 APInt &KnownUndef, APInt &KnownZero,
6299 const SelectionDAG &DAG, unsigned Depth,
6300 bool ResolveKnownElts) {
6301 if (Depth >= SelectionDAG::MaxRecursionDepth)
6302 return false; // Limit search depth.
6303
6304 EVT VT = Op.getValueType();
6305 if (!VT.isSimple() || !VT.isVector())
6306 return false;
6307
6308 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6309 if (ResolveKnownElts)
6310 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6311 return true;
6312 }
6313 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6314 ResolveKnownElts)) {
6315 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6316 return true;
6317 }
6318 return false;
6319}
6320
6321static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6322 SmallVectorImpl<SDValue> &Inputs,
6323 SmallVectorImpl<int> &Mask,
6324 const SelectionDAG &DAG, unsigned Depth,
6325 bool ResolveKnownElts) {
6326 APInt KnownUndef, KnownZero;
6327 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6328 KnownZero, DAG, Depth, ResolveKnownElts);
6329}
6330
6331 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6332 SmallVectorImpl<int> &Mask,
6333 const SelectionDAG &DAG, unsigned Depth = 0,
6334 bool ResolveKnownElts = true) {
6335 EVT VT = Op.getValueType();
6336 if (!VT.isSimple() || !VT.isVector())
6337 return false;
6338
6339 unsigned NumElts = Op.getValueType().getVectorNumElements();
6340 APInt DemandedElts = APInt::getAllOnes(NumElts);
6341 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6342 ResolveKnownElts);
6343}
6344
6345// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6346static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6347 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6348 SelectionDAG &DAG) {
6349 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6350 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6351 "Unknown broadcast load type");
6352
6353 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6354 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6355 return SDValue();
6356
6357 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6358 TypeSize::getFixed(Offset), DL);
6359 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6360 SDValue Ops[] = {Mem->getChain(), Ptr};
6361 SDValue BcstLd = DAG.getMemIntrinsicNode(
6362 Opcode, DL, Tys, Ops, MemVT,
6363 DAG.getMachineFunction().getMachineMemOperand(
6364 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6365 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6366 return BcstLd;
6367}
6368
6369/// Returns the scalar element that will make up the i'th
6370/// element of the result of the vector shuffle.
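/// e.g. for shufflevector <4 x i32> %a, <4 x i32> %b, <6, 0, 1, 7>, Index 0
/// resolves to element 2 of %b, recursing further if %b is itself a shuffle,
/// insert/extract_subvector, build_vector or scalar_to_vector.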
6371 static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6372 SelectionDAG &DAG, unsigned Depth) {
6373 if (Depth >= SelectionDAG::MaxRecursionDepth)
6374 return SDValue(); // Limit search depth.
6375
6376 EVT VT = Op.getValueType();
6377 unsigned Opcode = Op.getOpcode();
6378 unsigned NumElems = VT.getVectorNumElements();
6379
6380 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6381 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6382 int Elt = SV->getMaskElt(Index);
6383
6384 if (Elt < 0)
6385 return DAG.getUNDEF(VT.getVectorElementType());
6386
6387 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6388 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6389 }
6390
6391 // Recurse into target specific vector shuffles to find scalars.
6392 if (isTargetShuffle(Opcode)) {
6393 MVT ShufVT = VT.getSimpleVT();
6394 MVT ShufSVT = ShufVT.getVectorElementType();
6395 int NumElems = (int)ShufVT.getVectorNumElements();
6396 SmallVector<int, 16> ShuffleMask;
6397 SmallVector<SDValue, 16> ShuffleOps;
6398 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6399 return SDValue();
6400
6401 int Elt = ShuffleMask[Index];
6402 if (Elt == SM_SentinelZero)
6403 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6404 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6405 if (Elt == SM_SentinelUndef)
6406 return DAG.getUNDEF(ShufSVT);
6407
6408 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6409 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6410 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6411 }
6412
6413 // Recurse into insert_subvector base/sub vector to find scalars.
6414 if (Opcode == ISD::INSERT_SUBVECTOR) {
6415 SDValue Vec = Op.getOperand(0);
6416 SDValue Sub = Op.getOperand(1);
6417 uint64_t SubIdx = Op.getConstantOperandVal(2);
6418 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6419
6420 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6421 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6422 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6423 }
6424
6425 // Recurse into concat_vectors sub vector to find scalars.
6426 if (Opcode == ISD::CONCAT_VECTORS) {
6427 EVT SubVT = Op.getOperand(0).getValueType();
6428 unsigned NumSubElts = SubVT.getVectorNumElements();
6429 uint64_t SubIdx = Index / NumSubElts;
6430 uint64_t SubElt = Index % NumSubElts;
6431 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6432 }
6433
6434 // Recurse into extract_subvector src vector to find scalars.
6435 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6436 SDValue Src = Op.getOperand(0);
6437 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6438 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6439 }
6440
6441 // We only peek through bitcasts of the same vector width.
6442 if (Opcode == ISD::BITCAST) {
6443 SDValue Src = Op.getOperand(0);
6444 EVT SrcVT = Src.getValueType();
6445 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6446 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6447 return SDValue();
6448 }
6449
6450 // Actual nodes that may contain scalar elements
6451
6452 // For insert_vector_elt - either return the index matching scalar or recurse
6453 // into the base vector.
6454 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6455 isa<ConstantSDNode>(Op.getOperand(2))) {
6456 if (Op.getConstantOperandAPInt(2) == Index)
6457 return Op.getOperand(1);
6458 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6459 }
6460
6461 if (Opcode == ISD::SCALAR_TO_VECTOR)
6462 return (Index == 0) ? Op.getOperand(0)
6463 : DAG.getUNDEF(VT.getVectorElementType());
6464
6465 if (Opcode == ISD::BUILD_VECTOR)
6466 return Op.getOperand(Index);
6467
6468 return SDValue();
6469}
6470
6471// Use PINSRB/PINSRW/PINSRD to create a build vector.
6472 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6473 const APInt &NonZeroMask,
6474 unsigned NumNonZero, unsigned NumZero,
6475 SelectionDAG &DAG,
6476 const X86Subtarget &Subtarget) {
6477 MVT VT = Op.getSimpleValueType();
6478 unsigned NumElts = VT.getVectorNumElements();
6479 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6480 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6481 "Illegal vector insertion");
6482
6483 SDValue V;
6484 bool First = true;
6485
6486 for (unsigned i = 0; i < NumElts; ++i) {
6487 bool IsNonZero = NonZeroMask[i];
6488 if (!IsNonZero)
6489 continue;
6490
6491 // If the build vector contains zeros or our first insertion is not the
6492 // first index, then insert into a zero vector to break any register
6493 // dependency; else use SCALAR_TO_VECTOR.
6494 if (First) {
6495 First = false;
6496 if (NumZero || 0 != i)
6497 V = getZeroVector(VT, Subtarget, DAG, DL);
6498 else {
6499 assert(0 == i && "Expected insertion into zero-index");
6500 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6501 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6502 V = DAG.getBitcast(VT, V);
6503 continue;
6504 }
6505 }
6506 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6507 DAG.getIntPtrConstant(i, DL));
6508 }
6509
6510 return V;
6511}
6512
6513/// Custom lower build_vector of v16i8.
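/// Without SSE4.1, adjacent byte pairs are merged into 16-bit values and
/// inserted with PINSRW, e.g. bytes {lo, hi} become (zext lo) | (zext hi << 8).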
6514 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6515 const APInt &NonZeroMask,
6516 unsigned NumNonZero, unsigned NumZero,
6517 SelectionDAG &DAG,
6518 const X86Subtarget &Subtarget) {
6519 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6520 return SDValue();
6521
6522 // SSE4.1 - use PINSRB to insert each byte directly.
6523 if (Subtarget.hasSSE41())
6524 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6525 DAG, Subtarget);
6526
6527 SDValue V;
6528
6529 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6530 // If both of the lowest two 16-bit halves are non-zero, then convert to MOVD.
6531 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6532 !NonZeroMask.extractBits(2, 2).isZero()) {
6533 for (unsigned I = 0; I != 4; ++I) {
6534 if (!NonZeroMask[I])
6535 continue;
6536 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6537 if (I != 0)
6538 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6539 DAG.getConstant(I * 8, DL, MVT::i8));
6540 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6541 }
6542 assert(V && "Failed to fold v16i8 vector to zero");
6543 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6544 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6545 V = DAG.getBitcast(MVT::v8i16, V);
6546 }
6547 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6548 bool ThisIsNonZero = NonZeroMask[i];
6549 bool NextIsNonZero = NonZeroMask[i + 1];
6550 if (!ThisIsNonZero && !NextIsNonZero)
6551 continue;
6552
6553 SDValue Elt;
6554 if (ThisIsNonZero) {
6555 if (NumZero || NextIsNonZero)
6556 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6557 else
6558 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6559 }
6560
6561 if (NextIsNonZero) {
6562 SDValue NextElt = Op.getOperand(i + 1);
6563 if (i == 0 && NumZero)
6564 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6565 else
6566 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6567 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6568 DAG.getConstant(8, DL, MVT::i8));
6569 if (ThisIsNonZero)
6570 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6571 else
6572 Elt = NextElt;
6573 }
6574
6575 // If our first insertion is not the first index or zeros are needed, then
6576 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6577 // elements undefined).
6578 if (!V) {
6579 if (i != 0 || NumZero)
6580 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6581 else {
6582 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6583 V = DAG.getBitcast(MVT::v8i16, V);
6584 continue;
6585 }
6586 }
6587 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6588 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6589 DAG.getIntPtrConstant(i / 2, DL));
6590 }
6591
6592 return DAG.getBitcast(MVT::v16i8, V);
6593}
6594
6595/// Custom lower build_vector of v8i16.
6596 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6597 const APInt &NonZeroMask,
6598 unsigned NumNonZero, unsigned NumZero,
6599 SelectionDAG &DAG,
6600 const X86Subtarget &Subtarget) {
6601 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6602 return SDValue();
6603
6604 // Use PINSRW to insert each byte directly.
6605 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6606 Subtarget);
6607}
6608
6609/// Custom lower build_vector of v4i32 or v4f32.
6610 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6611 SelectionDAG &DAG,
6612 const X86Subtarget &Subtarget) {
6613 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6614 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6615 // Because we're creating a less complicated build vector here, we may enable
6616 // further folding of the MOVDDUP via shuffle transforms.
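// e.g. build_vector(a, b, a, b) with a != b is rebuilt as
// bitcast(MOVDDUP(bitcast(build_vector(a, b, undef, undef) to v2f64))).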
6617 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6618 Op.getOperand(0) == Op.getOperand(2) &&
6619 Op.getOperand(1) == Op.getOperand(3) &&
6620 Op.getOperand(0) != Op.getOperand(1)) {
6621 MVT VT = Op.getSimpleValueType();
6622 MVT EltVT = VT.getVectorElementType();
6623 // Create a new build vector with the first 2 elements followed by undef
6624 // padding, bitcast to v2f64, duplicate, and bitcast back.
6625 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6626 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6627 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6628 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6629 return DAG.getBitcast(VT, Dup);
6630 }
6631
6632 // Find all zeroable elements.
6633 std::bitset<4> Zeroable, Undefs;
6634 for (int i = 0; i < 4; ++i) {
6635 SDValue Elt = Op.getOperand(i);
6636 Undefs[i] = Elt.isUndef();
6637 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6638 }
6639 assert(Zeroable.size() - Zeroable.count() > 1 &&
6640 "We expect at least two non-zero elements!");
6641
6642 // We only know how to deal with build_vector nodes where elements are either
6643 // zeroable or extract_vector_elt with constant index.
6644 SDValue FirstNonZero;
6645 unsigned FirstNonZeroIdx;
6646 for (unsigned i = 0; i < 4; ++i) {
6647 if (Zeroable[i])
6648 continue;
6649 SDValue Elt = Op.getOperand(i);
6650 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6651 !isa<ConstantSDNode>(Elt.getOperand(1)))
6652 return SDValue();
6653 // Make sure that this node is extracting from a 128-bit vector.
6654 MVT VT = Elt.getOperand(0).getSimpleValueType();
6655 if (!VT.is128BitVector())
6656 return SDValue();
6657 if (!FirstNonZero.getNode()) {
6658 FirstNonZero = Elt;
6659 FirstNonZeroIdx = i;
6660 }
6661 }
6662
6663 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6664 SDValue V1 = FirstNonZero.getOperand(0);
6665 MVT VT = V1.getSimpleValueType();
6666
6667 // See if this build_vector can be lowered as a blend with zero.
6668 SDValue Elt;
6669 unsigned EltMaskIdx, EltIdx;
6670 int Mask[4];
6671 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6672 if (Zeroable[EltIdx]) {
6673 // The zero vector will be on the right hand side.
6674 Mask[EltIdx] = EltIdx+4;
6675 continue;
6676 }
6677
6678 Elt = Op->getOperand(EltIdx);
6679 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6680 EltMaskIdx = Elt.getConstantOperandVal(1);
6681 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6682 break;
6683 Mask[EltIdx] = EltIdx;
6684 }
6685
6686 if (EltIdx == 4) {
6687 // Let the shuffle legalizer deal with blend operations.
6688 SDValue VZeroOrUndef = (Zeroable == Undefs)
6689 ? DAG.getUNDEF(VT)
6690 : getZeroVector(VT, Subtarget, DAG, DL);
6691 if (V1.getSimpleValueType() != VT)
6692 V1 = DAG.getBitcast(VT, V1);
6693 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6694 }
6695
6696 // See if we can lower this build_vector to a INSERTPS.
6697 if (!Subtarget.hasSSE41())
6698 return SDValue();
6699
6700 SDValue V2 = Elt.getOperand(0);
6701 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6702 V1 = SDValue();
6703
6704 bool CanFold = true;
6705 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6706 if (Zeroable[i])
6707 continue;
6708
6709 SDValue Current = Op->getOperand(i);
6710 SDValue SrcVector = Current->getOperand(0);
6711 if (!V1.getNode())
6712 V1 = SrcVector;
6713 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6714 }
6715
6716 if (!CanFold)
6717 return SDValue();
6718
6719 assert(V1.getNode() && "Expected at least two non-zero elements!");
6720 if (V1.getSimpleValueType() != MVT::v4f32)
6721 V1 = DAG.getBitcast(MVT::v4f32, V1);
6722 if (V2.getSimpleValueType() != MVT::v4f32)
6723 V2 = DAG.getBitcast(MVT::v4f32, V2);
6724
6725 // Ok, we can emit an INSERTPS instruction.
6726 unsigned ZMask = Zeroable.to_ulong();
6727
6728 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6729 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6730 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6731 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6732 return DAG.getBitcast(VT, Result);
6733}
6734
6735/// Return a vector logical shift node.
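/// The shift is performed on a v16i8 bitcast of SrcOp using VSHLDQ/VSRLDQ
/// (PSLLDQ/PSRLDQ), so NumBits must be a multiple of 8.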
6736static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6737 SelectionDAG &DAG, const TargetLowering &TLI,
6738 const SDLoc &dl) {
6739 assert(VT.is128BitVector() && "Unknown type for VShift");
6740 MVT ShVT = MVT::v16i8;
6741 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6742 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6743 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6744 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6745 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6746}
6747
6748 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6749 SelectionDAG &DAG) {
6750
6751 // Check if the scalar load can be widened into a vector load. And if
6752 // the address is "base + cst" see if the cst can be "absorbed" into
6753 // the shuffle mask.
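// e.g. an i32 load at offset 8 into a 16-byte aligned stack object can be
// replaced by a v4i32 load of the whole object plus a splat shuffle of
// element 2.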
6754 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6755 SDValue Ptr = LD->getBasePtr();
6756 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6757 return SDValue();
6758 EVT PVT = LD->getValueType(0);
6759 if (PVT != MVT::i32 && PVT != MVT::f32)
6760 return SDValue();
6761
6762 int FI = -1;
6763 int64_t Offset = 0;
6764 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6765 FI = FINode->getIndex();
6766 Offset = 0;
6767 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6768 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6769 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6770 Offset = Ptr.getConstantOperandVal(1);
6771 Ptr = Ptr.getOperand(0);
6772 } else {
6773 return SDValue();
6774 }
6775
6776 // FIXME: 256-bit vector instructions don't require a strict alignment,
6777 // improve this code to support it better.
6778 Align RequiredAlign(VT.getSizeInBits() / 8);
6779 SDValue Chain = LD->getChain();
6780 // Make sure the stack object alignment is at least 16 or 32.
6781 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6782 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
6783 if (!InferredAlign || *InferredAlign < RequiredAlign) {
6784 if (MFI.isFixedObjectIndex(FI)) {
6785 // Can't change the alignment. FIXME: It's possible to compute
6786 // the exact stack offset and reference FI + adjust offset instead,
6787 // if someone *really* cares about this; that's the way to implement it.
6788 return SDValue();
6789 } else {
6790 MFI.setObjectAlignment(FI, RequiredAlign);
6791 }
6792 }
6793
6794 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6795 // Ptr + (Offset & ~15).
6796 if (Offset < 0)
6797 return SDValue();
6798 if ((Offset % RequiredAlign.value()) & 3)
6799 return SDValue();
6800 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
6801 if (StartOffset) {
6802 SDLoc DL(Ptr);
6803 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6804 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6805 }
6806
6807 int EltNo = (Offset - StartOffset) >> 2;
6808 unsigned NumElems = VT.getVectorNumElements();
6809
6810 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6811 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6812 LD->getPointerInfo().getWithOffset(StartOffset));
6813
6814 SmallVector<int, 8> Mask(NumElems, EltNo);
6815
6816 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6817 }
6818
6819 return SDValue();
6820}
6821
6822 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
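// e.g. (srl (load i64 %p), 32) resolves to that load with ByteOffset == 4.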
6823static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
6824 if (ISD::isNON_EXTLoad(Elt.getNode())) {
6825 auto *BaseLd = cast<LoadSDNode>(Elt);
6826 if (!BaseLd->isSimple())
6827 return false;
6828 Ld = BaseLd;
6829 ByteOffset = 0;
6830 return true;
6831 }
6832
6833 switch (Elt.getOpcode()) {
6834 case ISD::BITCAST:
6835 case ISD::TRUNCATE:
6836 case ISD::SCALAR_TO_VECTOR:
6837 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
6838 case ISD::SRL:
6839 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6840 uint64_t Amt = AmtC->getZExtValue();
6841 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
6842 ByteOffset += Amt / 8;
6843 return true;
6844 }
6845 }
6846 break;
6847 case ISD::EXTRACT_VECTOR_ELT:
6848 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
6849 SDValue Src = Elt.getOperand(0);
6850 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
6851 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
6852 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
6853 findEltLoadSrc(Src, Ld, ByteOffset)) {
6854 uint64_t Idx = IdxC->getZExtValue();
6855 ByteOffset += Idx * (SrcSizeInBits / 8);
6856 return true;
6857 }
6858 }
6859 break;
6860 }
6861
6862 return false;
6863}
6864
6865/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6866/// elements can be replaced by a single large load which has the same value as
6867/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6868///
6869/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6870 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6871 const SDLoc &DL, SelectionDAG &DAG,
6872 const X86Subtarget &Subtarget,
6873 bool IsAfterLegalize) {
6874 if ((VT.getScalarSizeInBits() % 8) != 0)
6875 return SDValue();
6876
6877 unsigned NumElems = Elts.size();
6878
6879 int LastLoadedElt = -1;
6880 APInt LoadMask = APInt::getZero(NumElems);
6881 APInt ZeroMask = APInt::getZero(NumElems);
6882 APInt UndefMask = APInt::getZero(NumElems);
6883
6884 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
6885 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
6886
6887 // For each element in the initializer, see if we've found a load, zero or an
6888 // undef.
6889 for (unsigned i = 0; i < NumElems; ++i) {
6890 SDValue Elt = peekThroughBitcasts(Elts[i]);
6891 if (!Elt.getNode())
6892 return SDValue();
6893 if (Elt.isUndef()) {
6894 UndefMask.setBit(i);
6895 continue;
6896 }
6897 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
6898 ZeroMask.setBit(i);
6899 continue;
6900 }
6901
6902 // Each loaded element must be the correct fractional portion of the
6903 // requested vector load.
6904 unsigned EltSizeInBits = Elt.getValueSizeInBits();
6905 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
6906 return SDValue();
6907
6908 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
6909 return SDValue();
6910 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
6911 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
6912 return SDValue();
6913
6914 LoadMask.setBit(i);
6915 LastLoadedElt = i;
6916 }
6917 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
6918 NumElems &&
6919 "Incomplete element masks");
6920
6921 // Handle Special Cases - all undef or undef/zero.
6922 if (UndefMask.popcount() == NumElems)
6923 return DAG.getUNDEF(VT);
6924 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
6925 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6926 : DAG.getConstantFP(0.0, DL, VT);
6927
6928 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6929 int FirstLoadedElt = LoadMask.countr_zero();
6930 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6931 EVT EltBaseVT = EltBase.getValueType();
6932 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
6933 "Register/Memory size mismatch");
6934 LoadSDNode *LDBase = Loads[FirstLoadedElt];
6935 assert(LDBase && "Did not find base load for merging consecutive loads");
6936 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
6937 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
6938 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
6939 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
6940 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
6941
6942 // TODO: Support offsetting the base load.
6943 if (ByteOffsets[FirstLoadedElt] != 0)
6944 return SDValue();
6945
6946 // Check to see if the element's load is consecutive to the base load
6947 // or offset from a previous (already checked) load.
6948 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
6949 LoadSDNode *Ld = Loads[EltIdx];
6950 int64_t ByteOffset = ByteOffsets[EltIdx];
6951 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
6952 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
6953 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
6954 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
6955 }
6956 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
6957 EltIdx - FirstLoadedElt);
6958 };
6959
6960 // Consecutive loads can contain UNDEFs but not ZERO elements.
6961 // Consecutive loads with UNDEF and ZERO elements require an
6962 // additional shuffle stage to clear the ZERO elements.
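// e.g. <load a[0], load a[1], zero, zero>, when the full vector width is known
// dereferenceable, becomes a v4i32 load of 'a' followed by a shuffle with a
// zero vector that clears elements 2 and 3.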
6963 bool IsConsecutiveLoad = true;
6964 bool IsConsecutiveLoadWithZeros = true;
6965 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6966 if (LoadMask[i]) {
6967 if (!CheckConsecutiveLoad(LDBase, i)) {
6968 IsConsecutiveLoad = false;
6969 IsConsecutiveLoadWithZeros = false;
6970 break;
6971 }
6972 } else if (ZeroMask[i]) {
6973 IsConsecutiveLoad = false;
6974 }
6975 }
6976
6977 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6978 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6979 assert(LDBase->isSimple() &&
6980 "Cannot merge volatile or atomic loads.");
6981 SDValue NewLd =
6982 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6983 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
6984 MMOFlags);
6985 for (auto *LD : Loads)
6986 if (LD)
6987 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6988 return NewLd;
6989 };
6990
6991 // Check if the base load is entirely dereferenceable.
6992 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
6993 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
6994
6995 // LOAD - all consecutive load/undefs (must start/end with a load or be
6996 // entirely dereferenceable). If we have found an entire vector of loads and
6997 // undefs, then return a large load of the entire vector width starting at the
6998 // base pointer. If the vector contains zeros, then attempt to shuffle those
6999 // elements.
7000 if (FirstLoadedElt == 0 &&
7001 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7002 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7003 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7004 return SDValue();
7005
7006 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7007 // will lower to regular temporal loads and use the cache.
7008 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7009 VT.is256BitVector() && !Subtarget.hasInt256())
7010 return SDValue();
7011
7012 if (NumElems == 1)
7013 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7014
7015 if (!ZeroMask)
7016 return CreateLoad(VT, LDBase);
7017
7018 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7019 // vector and a zero vector to clear out the zero elements.
7020 if (!IsAfterLegalize && VT.isVector()) {
7021 unsigned NumMaskElts = VT.getVectorNumElements();
7022 if ((NumMaskElts % NumElems) == 0) {
7023 unsigned Scale = NumMaskElts / NumElems;
7024 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7025 for (unsigned i = 0; i < NumElems; ++i) {
7026 if (UndefMask[i])
7027 continue;
7028 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7029 for (unsigned j = 0; j != Scale; ++j)
7030 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7031 }
7032 SDValue V = CreateLoad(VT, LDBase);
7033 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7034 : DAG.getConstantFP(0.0, DL, VT);
7035 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7036 }
7037 }
7038 }
7039
7040 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7041 if (VT.is256BitVector() || VT.is512BitVector()) {
7042 unsigned HalfNumElems = NumElems / 2;
7043 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7044 EVT HalfVT =
7045 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7046 SDValue HalfLD =
7047 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7048 DAG, Subtarget, IsAfterLegalize);
7049 if (HalfLD)
7050 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7051 HalfLD, DAG.getIntPtrConstant(0, DL));
7052 }
7053 }
7054
7055 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7056 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7057 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7058 LoadSizeInBits == 64) &&
7059 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7060 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7061 : MVT::getIntegerVT(LoadSizeInBits);
7062 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7063 // Allow v4f32 on SSE1 only targets.
7064 // FIXME: Add more isel patterns so we can just use VT directly.
7065 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7066 VecVT = MVT::v4f32;
7067 if (TLI.isTypeLegal(VecVT)) {
7068 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7069 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7070 SDValue ResNode = DAG.getMemIntrinsicNode(
7071 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7072 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7073 for (auto *LD : Loads)
7074 if (LD)
7075 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7076 return DAG.getBitcast(VT, ResNode);
7077 }
7078 }
7079
7080 // BROADCAST - match the smallest possible repetition pattern, load that
7081 // scalar/subvector element and then broadcast to the entire vector.
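// e.g. a v8i32 build_vector of loads <a[0], a[1], a[0], a[1], ...> matches a
// 64-bit repetition and can be lowered to X86ISD::VBROADCAST of a single
// 64-bit load of a[0..1].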
7082 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7083 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7084 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7085 unsigned RepeatSize = SubElems * BaseSizeInBits;
7086 unsigned ScalarSize = std::min(RepeatSize, 64u);
7087 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7088 continue;
7089
7090 // Don't attempt a 1:N subvector broadcast - it should be caught by
7091 // combineConcatVectorOps, else will cause infinite loops.
7092 if (RepeatSize > ScalarSize && SubElems == 1)
7093 continue;
7094
7095 bool Match = true;
7096 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7097 for (unsigned i = 0; i != NumElems && Match; ++i) {
7098 if (!LoadMask[i])
7099 continue;
7100 SDValue Elt = peekThroughBitcasts(Elts[i]);
7101 if (RepeatedLoads[i % SubElems].isUndef())
7102 RepeatedLoads[i % SubElems] = Elt;
7103 else
7104 Match &= (RepeatedLoads[i % SubElems] == Elt);
7105 }
7106
7107 // We must have loads at both ends of the repetition.
7108 Match &= !RepeatedLoads.front().isUndef();
7109 Match &= !RepeatedLoads.back().isUndef();
7110 if (!Match)
7111 continue;
7112
7113 EVT RepeatVT =
7114 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7115 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7116 : EVT::getFloatingPointVT(ScalarSize);
7117 if (RepeatSize > ScalarSize)
7118 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7119 RepeatSize / ScalarSize);
7120 EVT BroadcastVT =
7121 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7122 VT.getSizeInBits() / ScalarSize);
7123 if (TLI.isTypeLegal(BroadcastVT)) {
7124 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7125 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7126 SDValue Broadcast = RepeatLoad;
7127 if (RepeatSize > ScalarSize) {
7128 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7129 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7130 } else {
7131 if (!Subtarget.hasAVX2() &&
7132 !X86::mayFoldLoadIntoBroadcastFromMem(
7133 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7134 Subtarget,
7135 /*AssumeSingleUse=*/true))
7136 return SDValue();
7137 Broadcast =
7138 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7139 }
7140 return DAG.getBitcast(VT, Broadcast);
7141 }
7142 }
7143 }
7144 }
7145
7146 return SDValue();
7147}
7148
7149 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7150 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7151 // are consecutive, non-overlapping, and in the right order.
7152 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7153 SelectionDAG &DAG,
7154 const X86Subtarget &Subtarget,
7155 bool IsAfterLegalize) {
7156 SmallVector<SDValue, 64> Elts;
7157 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7158 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7159 Elts.push_back(Elt);
7160 continue;
7161 }
7162 return SDValue();
7163 }
7164 assert(Elts.size() == VT.getVectorNumElements());
7165 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7166 IsAfterLegalize);
7167}
7168
7169 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7170 const APInt &Undefs, LLVMContext &C) {
7171 unsigned ScalarSize = VT.getScalarSizeInBits();
7172 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7173
7174 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7175 if (VT.isFloatingPoint()) {
7176 if (ScalarSize == 16)
7177 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7178 if (ScalarSize == 32)
7179 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7180 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7181 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7182 }
7183 return Constant::getIntegerValue(Ty, Val);
7184 };
7185
7186 SmallVector<Constant *, 32> ConstantVec;
7187 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7188 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7189 : getConstantScalar(Bits[I]));
7190
7191 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7192}
7193
7194static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7195 unsigned SplatBitSize, LLVMContext &C) {
7196 unsigned ScalarSize = VT.getScalarSizeInBits();
7197
7198 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7199 if (VT.isFloatingPoint()) {
7200 if (ScalarSize == 16)
7201 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7202 if (ScalarSize == 32)
7203 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7204 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7205 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7206 }
7207 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7208 };
7209
7210 if (ScalarSize == SplatBitSize)
7211 return getConstantScalar(SplatValue);
7212
7213 unsigned NumElm = SplatBitSize / ScalarSize;
7214 SmallVector<Constant *, 32> ConstantVec;
7215 for (unsigned I = 0; I != NumElm; ++I) {
7216 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7217 ConstantVec.push_back(getConstantScalar(Val));
7218 }
7219 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7220}
7221
7222 static bool isFoldableUseOfShuffle(SDNode *N) {
7223 for (auto *U : N->uses()) {
7224 unsigned Opc = U->getOpcode();
7225 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7226 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7227 return false;
7228 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7229 return false;
7230 if (isTargetShuffle(Opc))
7231 return true;
7232 if (Opc == ISD::BITCAST) // Ignore bitcasts
7233 return isFoldableUseOfShuffle(U);
7234 if (N->hasOneUse()) {
7235      // TODO: There may be some general way to know if an SDNode can
7236      // be folded. We currently only know whether an MI is foldable.
7237 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7238 return false;
7239 return true;
7240 }
7241 }
7242 return false;
7243}
7244
7245/// Attempt to use the vbroadcast instruction to generate a splat value
7246/// from a splat BUILD_VECTOR which uses:
7247/// a. A single scalar load, or a constant.
7248/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7249///
7250/// The VBROADCAST node is returned when a pattern is found,
7251/// or SDValue() otherwise.
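/// For example (illustrative only; %p is a placeholder pointer):
///   t0 = load f32, %p
///   (v8f32 (build_vector t0, t0, t0, t0, t0, t0, t0, t0))
/// can become a single (v8f32 (X86ISD::VBROADCAST_LOAD %p)) on AVX.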
7252static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7253                                           const SDLoc &dl,
7254 const X86Subtarget &Subtarget,
7255 SelectionDAG &DAG) {
7256 // VBROADCAST requires AVX.
7257 // TODO: Splats could be generated for non-AVX CPUs using SSE
7258 // instructions, but there's less potential gain for only 128-bit vectors.
7259 if (!Subtarget.hasAVX())
7260 return SDValue();
7261
7262 MVT VT = BVOp->getSimpleValueType(0);
7263 unsigned NumElts = VT.getVectorNumElements();
7264 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7265 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7266 "Unsupported vector type for broadcast.");
7267
7268 // See if the build vector is a repeating sequence of scalars (inc. splat).
7269 SDValue Ld;
7270 BitVector UndefElements;
7271 SmallVector<SDValue, 16> Sequence;
7272 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7273 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7274 if (Sequence.size() == 1)
7275 Ld = Sequence[0];
7276 }
7277
7278 // Attempt to use VBROADCASTM
7279 // From this pattern:
7280 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7281 // b. t1 = (build_vector t0 t0)
7282 //
7283 // Create (VBROADCASTM v2i1 X)
7284 if (!Sequence.empty() && Subtarget.hasCDI()) {
7285 // If not a splat, are the upper sequence values zeroable?
7286 unsigned SeqLen = Sequence.size();
7287 bool UpperZeroOrUndef =
7288 SeqLen == 1 ||
7289 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
7290 return !V || V.isUndef() || isNullConstant(V);
7291 });
7292 SDValue Op0 = Sequence[0];
7293 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7294 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7295 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7296 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7297 ? Op0.getOperand(0)
7298 : Op0.getOperand(0).getOperand(0);
7299 MVT MaskVT = BOperand.getSimpleValueType();
7300 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7301 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7302 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7303 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7304 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7305 unsigned Scale = 512 / VT.getSizeInBits();
7306 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7307 }
7308 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7309 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7310 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7311 return DAG.getBitcast(VT, Bcst);
7312 }
7313 }
7314 }
7315
7316 unsigned NumUndefElts = UndefElements.count();
7317 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7318 APInt SplatValue, Undef;
7319 unsigned SplatBitSize;
7320 bool HasUndef;
7321 // Check if this is a repeated constant pattern suitable for broadcasting.
7322 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7323 SplatBitSize > VT.getScalarSizeInBits() &&
7324 SplatBitSize < VT.getSizeInBits()) {
7325 // Avoid replacing with broadcast when it's a use of a shuffle
7326 // instruction to preserve the present custom lowering of shuffles.
7327 if (isFoldableUseOfShuffle(BVOp))
7328 return SDValue();
7329      // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
7330 LLVMContext *Ctx = DAG.getContext();
7331 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7332 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7333 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7334 // Load the constant scalar/subvector and broadcast it.
7335 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7336 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7337 SDValue CP = DAG.getConstantPool(C, PVT);
7338 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7339
7340 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7341 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7342 SDValue Ops[] = {DAG.getEntryNode(), CP};
7343 MachinePointerInfo MPI =
7344            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7345        SDValue Brdcst =
7346 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7347 MPI, Alignment, MachineMemOperand::MOLoad);
7348 return DAG.getBitcast(VT, Brdcst);
7349 }
7350 if (SplatBitSize > 64) {
7351 // Load the vector of constants and broadcast it.
7352 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7353 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7354 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7355 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7356 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7357 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7358 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7359 MachinePointerInfo MPI =
7360            MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7361        return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7362                                        Ops, VVT, MPI, Alignment,
7363                                        MachineMemOperand::MOLoad);
7364      }
7365 }
7366
7367 // If we are moving a scalar into a vector (Ld must be set and all elements
7368 // but 1 are undef) and that operation is not obviously supported by
7369 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7370 // That's better than general shuffling and may eliminate a load to GPR and
7371 // move from scalar to vector register.
7372 if (!Ld || NumElts - NumUndefElts != 1)
7373 return SDValue();
7374 unsigned ScalarSize = Ld.getValueSizeInBits();
7375 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7376 return SDValue();
7377 }
7378
7379 bool ConstSplatVal =
7380 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7381 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7382
7383 // TODO: Handle broadcasts of non-constant sequences.
7384
7385 // Make sure that all of the users of a non-constant load are from the
7386 // BUILD_VECTOR node.
7387 // FIXME: Is the use count needed for non-constant, non-load case?
7388 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7389 return SDValue();
7390
7391 unsigned ScalarSize = Ld.getValueSizeInBits();
7392 bool IsGE256 = (VT.getSizeInBits() >= 256);
7393
7394 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7395 // instruction to save 8 or more bytes of constant pool data.
7396 // TODO: If multiple splats are generated to load the same constant,
7397 // it may be detrimental to overall size. There needs to be a way to detect
7398 // that condition to know if this is truly a size win.
7399 bool OptForSize = DAG.shouldOptForSize();
7400
7401 // Handle broadcasting a single constant scalar from the constant pool
7402 // into a vector.
7403 // On Sandybridge (no AVX2), it is still better to load a constant vector
7404 // from the constant pool and not to broadcast it from a scalar.
7405 // But override that restriction when optimizing for size.
7406 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7407 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7408 EVT CVT = Ld.getValueType();
7409 assert(!CVT.isVector() && "Must not broadcast a vector type");
7410
7411 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7412 // For size optimization, also splat v2f64 and v2i64, and for size opt
7413 // with AVX2, also splat i8 and i16.
7414 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7415 if (ScalarSize == 32 ||
7416 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7417 CVT == MVT::f16 ||
7418 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7419 const Constant *C = nullptr;
7420 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7421 C = CI->getConstantIntValue();
7422 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7423 C = CF->getConstantFPValue();
7424
7425 assert(C && "Invalid constant type");
7426
7427 SDValue CP =
7428          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7429      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7430
7431 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7432 SDValue Ops[] = {DAG.getEntryNode(), CP};
7433 MachinePointerInfo MPI =
7434          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7435      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7436 MPI, Alignment, MachineMemOperand::MOLoad);
7437 }
7438 }
7439
7440 // Handle AVX2 in-register broadcasts.
7441 if (!IsLoad && Subtarget.hasInt256() &&
7442 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7443 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7444
7445 // The scalar source must be a normal load.
7446 if (!IsLoad)
7447 return SDValue();
7448
7449 // Make sure the non-chain result is only used by this build vector.
7450 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7451 return SDValue();
7452
7453 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7454 (Subtarget.hasVLX() && ScalarSize == 64)) {
7455 auto *LN = cast<LoadSDNode>(Ld);
7456 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7457 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7458 SDValue BCast =
7459        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7460                                LN->getMemoryVT(), LN->getMemOperand());
7461 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7462 return BCast;
7463 }
7464
7465  // The integer check is needed for the 64-bit into 128-bit case so that it
7466  // doesn't match f64, since there is no vbroadcastsd xmm instruction.
7467 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7468 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7469 auto *LN = cast<LoadSDNode>(Ld);
7470 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7471 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7472 SDValue BCast =
7473        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7474                                LN->getMemoryVT(), LN->getMemOperand());
7475 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7476 return BCast;
7477 }
7478
7479 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7480 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7481
7482 // Unsupported broadcast.
7483 return SDValue();
7484}
7485
7486/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7487/// underlying vector and index.
7488///
7489/// Modifies \p ExtractedFromVec to the real vector and returns the real
7490/// index.
7491static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7492 SDValue ExtIdx) {
7493 int Idx = ExtIdx->getAsZExtVal();
7494 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7495 return Idx;
7496
7497 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7498 // lowered this:
7499 // (extract_vector_elt (v8f32 %1), Constant<6>)
7500 // to:
7501 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7502 // (extract_subvector (v8f32 %0), Constant<4>),
7503 // undef)
7504 // Constant<0>)
7505 // In this case the vector is the extract_subvector expression and the index
7506 // is 2, as specified by the shuffle.
7507 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7508 SDValue ShuffleVec = SVOp->getOperand(0);
7509 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7510 assert(ShuffleVecVT.getVectorElementType() ==
7511 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7512
7513 int ShuffleIdx = SVOp->getMaskElt(Idx);
7514 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7515 ExtractedFromVec = ShuffleVec;
7516 return ShuffleIdx;
7517 }
7518 return Idx;
7519}
7520
7521static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7522                                      SelectionDAG &DAG) {
7523 MVT VT = Op.getSimpleValueType();
7524
7525 // Skip if insert_vec_elt is not supported.
7526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7527  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7528    return SDValue();
7529
7530 unsigned NumElems = Op.getNumOperands();
7531 SDValue VecIn1;
7532 SDValue VecIn2;
7533 SmallVector<unsigned, 4> InsertIndices;
7534 SmallVector<int, 8> Mask(NumElems, -1);
7535
7536 for (unsigned i = 0; i != NumElems; ++i) {
7537 unsigned Opc = Op.getOperand(i).getOpcode();
7538
7539 if (Opc == ISD::UNDEF)
7540 continue;
7541
7542 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7543      // Quit if more than 1 element needs inserting.
7544 if (InsertIndices.size() > 1)
7545 return SDValue();
7546
7547 InsertIndices.push_back(i);
7548 continue;
7549 }
7550
7551 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7552 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7553
7554 // Quit if non-constant index.
7555 if (!isa<ConstantSDNode>(ExtIdx))
7556 return SDValue();
7557 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7558
7559 // Quit if extracted from vector of different type.
7560 if (ExtractedFromVec.getValueType() != VT)
7561 return SDValue();
7562
7563 if (!VecIn1.getNode())
7564 VecIn1 = ExtractedFromVec;
7565 else if (VecIn1 != ExtractedFromVec) {
7566 if (!VecIn2.getNode())
7567 VecIn2 = ExtractedFromVec;
7568 else if (VecIn2 != ExtractedFromVec)
7569 // Quit if more than 2 vectors to shuffle
7570 return SDValue();
7571 }
7572
7573 if (ExtractedFromVec == VecIn1)
7574 Mask[i] = Idx;
7575 else if (ExtractedFromVec == VecIn2)
7576 Mask[i] = Idx + NumElems;
7577 }
7578
7579 if (!VecIn1.getNode())
7580 return SDValue();
7581
7582 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7583 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7584
7585 for (unsigned Idx : InsertIndices)
7586 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7587 DAG.getIntPtrConstant(Idx, DL));
7588
7589 return NV;
7590}
7591
7592// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7593static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7594                                       const X86Subtarget &Subtarget) {
7595 MVT VT = Op.getSimpleValueType();
7596 MVT IVT =
7597 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7598  SmallVector<SDValue, 32> NewOps;
7599  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7600 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7601 Op.getOperand(I)));
7602 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7603 return DAG.getBitcast(VT, Res);
7604}
7605
7606// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7607static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7608                                     SelectionDAG &DAG,
7609 const X86Subtarget &Subtarget) {
7610
7611 MVT VT = Op.getSimpleValueType();
7612 assert((VT.getVectorElementType() == MVT::i1) &&
7613 "Unexpected type in LowerBUILD_VECTORvXi1!");
7614 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7615 ISD::isBuildVectorAllOnes(Op.getNode()))
7616 return Op;
7617
7618 uint64_t Immediate = 0;
7619 SmallVector<unsigned, 16> NonConstIdx;
7620 bool IsSplat = true;
7621 bool HasConstElts = false;
7622 int SplatIdx = -1;
7623 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7624 SDValue In = Op.getOperand(idx);
7625 if (In.isUndef())
7626 continue;
7627 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7628 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7629 HasConstElts = true;
7630 } else {
7631 NonConstIdx.push_back(idx);
7632 }
7633 if (SplatIdx < 0)
7634 SplatIdx = idx;
7635 else if (In != Op.getOperand(SplatIdx))
7636 IsSplat = false;
7637 }
7638
7639  // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
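  // Illustrative example (not from the original source): a v16i1 splat of a
  // non-constant %b can be built in the scalar domain as
  //   (v16i1 (bitcast (select i16 %b, i16 -1, i16 0)))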
7640 if (IsSplat) {
7641 // The build_vector allows the scalar element to be larger than the vector
7642 // element type. We need to mask it to use as a condition unless we know
7643 // the upper bits are zero.
7644 // FIXME: Use computeKnownBits instead of checking specific opcode?
7645 SDValue Cond = Op.getOperand(SplatIdx);
7646 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7647 if (Cond.getOpcode() != ISD::SETCC)
7648 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7649 DAG.getConstant(1, dl, MVT::i8));
7650
7651 // Perform the select in the scalar domain so we can use cmov.
7652 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7653 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7654 DAG.getAllOnesConstant(dl, MVT::i32),
7655 DAG.getConstant(0, dl, MVT::i32));
7656 Select = DAG.getBitcast(MVT::v32i1, Select);
7657 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7658 } else {
7659 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7660 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7661 DAG.getAllOnesConstant(dl, ImmVT),
7662 DAG.getConstant(0, dl, ImmVT));
7663 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7664 Select = DAG.getBitcast(VecVT, Select);
7665 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7666 DAG.getIntPtrConstant(0, dl));
7667 }
7668 }
7669
7670  // Insert the non-constant elements one by one.
7671 SDValue DstVec;
7672 if (HasConstElts) {
7673 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7674 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7675 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7676 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7677 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7678 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7679 } else {
7680 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7681 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7682 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7683 DstVec = DAG.getBitcast(VecVT, Imm);
7684 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7685 DAG.getIntPtrConstant(0, dl));
7686 }
7687 } else
7688 DstVec = DAG.getUNDEF(VT);
7689
7690 for (unsigned InsertIdx : NonConstIdx) {
7691 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7692 Op.getOperand(InsertIdx),
7693 DAG.getIntPtrConstant(InsertIdx, dl));
7694 }
7695 return DstVec;
7696}
7697
7698LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7699 switch (Opcode) {
7700 case X86ISD::PACKSS:
7701 case X86ISD::PACKUS:
7702 case X86ISD::FHADD:
7703 case X86ISD::FHSUB:
7704 case X86ISD::HADD:
7705 case X86ISD::HSUB:
7706 return true;
7707 }
7708 return false;
7709}
7710
7711/// This is a helper function of LowerToHorizontalOp().
7712/// This function checks that the input build_vector \p N implements a 128-bit
7713/// partial horizontal operation on a 256-bit vector, though that operation may
7714/// not match the layout of an x86 256-bit horizontal instruction.
7715/// In other words, if this returns true, then some extraction/insertion will
7716/// be required to produce a valid horizontal instruction.
7717///
7718/// Parameter \p Opcode defines the kind of horizontal operation to match.
7719/// For example, if \p Opcode is equal to ISD::ADD, then this function
7720/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7721/// is equal to ISD::SUB, then this function checks if this is a horizontal
7722/// arithmetic sub.
7723///
7724/// This function only analyzes elements of \p N whose indices are
7725/// in range [BaseIdx, LastIdx).
7726///
7727/// TODO: This function was originally used to match both real and fake partial
7728/// horizontal operations, but the index-matching logic is incorrect for that.
7729/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7730/// code because it is only used for partial h-op matching now?
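/// For example (illustrative only; A and B are placeholder vectors): over the
/// range [0, 4) with \p Opcode == ISD::ADD,
///   elt0 = (add (extract_elt A, 0), (extract_elt A, 1))
///   elt1 = (add (extract_elt A, 2), (extract_elt A, 3))
///   elt2 = (add (extract_elt B, 0), (extract_elt B, 1))
///   elt3 = (add (extract_elt B, 2), (extract_elt B, 3))
/// matches, returning \p V0 = A and \p V1 = B.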
7731static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7732 const SDLoc &DL, SelectionDAG &DAG,
7733 unsigned BaseIdx, unsigned LastIdx,
7734 SDValue &V0, SDValue &V1) {
7735 EVT VT = N->getValueType(0);
7736 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7737 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7738 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7739 "Invalid Vector in input!");
7740
7741 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7742 bool CanFold = true;
7743 unsigned ExpectedVExtractIdx = BaseIdx;
7744 unsigned NumElts = LastIdx - BaseIdx;
7745 V0 = DAG.getUNDEF(VT);
7746 V1 = DAG.getUNDEF(VT);
7747
7748 // Check if N implements a horizontal binop.
7749 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7750 SDValue Op = N->getOperand(i + BaseIdx);
7751
7752 // Skip UNDEFs.
7753 if (Op->isUndef()) {
7754 // Update the expected vector extract index.
7755 if (i * 2 == NumElts)
7756 ExpectedVExtractIdx = BaseIdx;
7757 ExpectedVExtractIdx += 2;
7758 continue;
7759 }
7760
7761 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7762
7763 if (!CanFold)
7764 break;
7765
7766 SDValue Op0 = Op.getOperand(0);
7767 SDValue Op1 = Op.getOperand(1);
7768
7769 // Try to match the following pattern:
7770 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7771 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7772               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7773               Op0.getOperand(0) == Op1.getOperand(0) &&
7774 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7775 isa<ConstantSDNode>(Op1.getOperand(1)));
7776 if (!CanFold)
7777 break;
7778
7779 unsigned I0 = Op0.getConstantOperandVal(1);
7780 unsigned I1 = Op1.getConstantOperandVal(1);
7781
7782 if (i * 2 < NumElts) {
7783 if (V0.isUndef()) {
7784 V0 = Op0.getOperand(0);
7785 if (V0.getValueType() != VT)
7786 return false;
7787 }
7788 } else {
7789 if (V1.isUndef()) {
7790 V1 = Op0.getOperand(0);
7791 if (V1.getValueType() != VT)
7792 return false;
7793 }
7794 if (i * 2 == NumElts)
7795 ExpectedVExtractIdx = BaseIdx;
7796 }
7797
7798 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7799 if (I0 == ExpectedVExtractIdx)
7800 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7801 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7802 // Try to match the following dag sequence:
7803 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7804 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7805 } else
7806 CanFold = false;
7807
7808 ExpectedVExtractIdx += 2;
7809 }
7810
7811 return CanFold;
7812}
7813
7814/// Emit a sequence of two 128-bit horizontal add/sub followed by
7815/// a concat_vector.
7816///
7817/// This is a helper function of LowerToHorizontalOp().
7818/// This function expects two 256-bit vectors called V0 and V1.
7819/// At first, each vector is split into two separate 128-bit vectors.
7820/// Then, the resulting 128-bit vectors are used to implement two
7821/// horizontal binary operations.
7822///
7823/// The kind of horizontal binary operation is defined by \p X86Opcode.
7824///
7825/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
7826/// the two new horizontal binops.
7827/// When Mode is set, the first horizontal binop dag node would take as input
7828/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7829/// horizontal binop dag node would take as input the lower 128-bit of V1
7830/// and the upper 128-bit of V1.
7831/// Example:
7832/// HADD V0_LO, V0_HI
7833/// HADD V1_LO, V1_HI
7834///
7835/// Otherwise, the first horizontal binop dag node takes as input the lower
7836/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7837/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7838/// Example:
7839/// HADD V0_LO, V1_LO
7840/// HADD V0_HI, V1_HI
7841///
7842/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7843/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7844/// the upper 128-bits of the result.
7845static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7846 const SDLoc &DL, SelectionDAG &DAG,
7847 unsigned X86Opcode, bool Mode,
7848 bool isUndefLO, bool isUndefHI) {
7849 MVT VT = V0.getSimpleValueType();
7850 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7851 "Invalid nodes in input!");
7852
7853 unsigned NumElts = VT.getVectorNumElements();
7854 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7855 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7856 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7857 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7858 MVT NewVT = V0_LO.getSimpleValueType();
7859
7860 SDValue LO = DAG.getUNDEF(NewVT);
7861 SDValue HI = DAG.getUNDEF(NewVT);
7862
7863 if (Mode) {
7864 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7865 if (!isUndefLO && !V0->isUndef())
7866 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7867 if (!isUndefHI && !V1->isUndef())
7868 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7869 } else {
7870 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7871 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7872 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7873
7874 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7875 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7876 }
7877
7878 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7879}
7880
7881/// Returns true iff \p BV builds a vector with the result equivalent to
7882/// the result of an ADDSUB/SUBADD operation.
7883/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7884/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7885/// \p Opnd0 and \p Opnd1.
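/// For example (illustrative only; A and B are placeholder v4f32 vectors):
///   elt0 = (fsub (extract_elt A, 0), (extract_elt B, 0))
///   elt1 = (fadd (extract_elt A, 1), (extract_elt B, 1))
///   elt2 = (fsub (extract_elt A, 2), (extract_elt B, 2))
///   elt3 = (fadd (extract_elt A, 3), (extract_elt B, 3))
/// is recognized with \p Opnd0 = A, \p Opnd1 = B and \p IsSubAdd = false.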
7886static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7887                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
7888 SDValue &Opnd0, SDValue &Opnd1,
7889 unsigned &NumExtracts,
7890 bool &IsSubAdd) {
7891
7892 MVT VT = BV->getSimpleValueType(0);
7893 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7894 return false;
7895
7896 unsigned NumElts = VT.getVectorNumElements();
7897 SDValue InVec0 = DAG.getUNDEF(VT);
7898 SDValue InVec1 = DAG.getUNDEF(VT);
7899
7900 NumExtracts = 0;
7901
7902 // Odd-numbered elements in the input build vector are obtained from
7903 // adding/subtracting two integer/float elements.
7904 // Even-numbered elements in the input build vector are obtained from
7905 // subtracting/adding two integer/float elements.
7906 unsigned Opc[2] = {0, 0};
7907 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7908 SDValue Op = BV->getOperand(i);
7909
7910 // Skip 'undef' values.
7911 unsigned Opcode = Op.getOpcode();
7912 if (Opcode == ISD::UNDEF)
7913 continue;
7914
7915 // Early exit if we found an unexpected opcode.
7916 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7917 return false;
7918
7919 SDValue Op0 = Op.getOperand(0);
7920 SDValue Op1 = Op.getOperand(1);
7921
7922 // Try to match the following pattern:
7923 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7924 // Early exit if we cannot match that sequence.
7925 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7926        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7927        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7928 Op0.getOperand(1) != Op1.getOperand(1))
7929 return false;
7930
7931 unsigned I0 = Op0.getConstantOperandVal(1);
7932 if (I0 != i)
7933 return false;
7934
7935    // We found a valid add/sub node; make sure it's the same opcode as the
7936    // previous elements for this parity.
7937 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7938 return false;
7939 Opc[i % 2] = Opcode;
7940
7941 // Update InVec0 and InVec1.
7942 if (InVec0.isUndef()) {
7943 InVec0 = Op0.getOperand(0);
7944 if (InVec0.getSimpleValueType() != VT)
7945 return false;
7946 }
7947 if (InVec1.isUndef()) {
7948 InVec1 = Op1.getOperand(0);
7949 if (InVec1.getSimpleValueType() != VT)
7950 return false;
7951 }
7952
7953    // Make sure that the input operands to each add/sub node always
7954    // come from the same pair of vectors.
7955 if (InVec0 != Op0.getOperand(0)) {
7956 if (Opcode == ISD::FSUB)
7957 return false;
7958
7959 // FADD is commutable. Try to commute the operands
7960 // and then test again.
7961 std::swap(Op0, Op1);
7962 if (InVec0 != Op0.getOperand(0))
7963 return false;
7964 }
7965
7966 if (InVec1 != Op1.getOperand(0))
7967 return false;
7968
7969 // Increment the number of extractions done.
7970 ++NumExtracts;
7971 }
7972
7973 // Ensure we have found an opcode for both parities and that they are
7974 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7975 // inputs are undef.
7976 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7977 InVec0.isUndef() || InVec1.isUndef())
7978 return false;
7979
7980 IsSubAdd = Opc[0] == ISD::FADD;
7981
7982 Opnd0 = InVec0;
7983 Opnd1 = InVec1;
7984 return true;
7985}
7986
7987/// Returns true if it is possible to fold a MUL and an idiom that has already been
7988/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7989/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7990/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7991///
7992/// Prior to calling this function it should be known that there is some
7993/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7994/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7995/// before replacement of such SDNode with ADDSUB operation. Thus the number
7996/// of \p Opnd0 uses is expected to be equal to 2.
7997/// For example, this function may be called for the following IR:
7998/// %AB = fmul fast <2 x double> %A, %B
7999/// %Sub = fsub fast <2 x double> %AB, %C
8000/// %Add = fadd fast <2 x double> %AB, %C
8001/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8002/// <2 x i32> <i32 0, i32 3>
8003/// There is a def for %Addsub here, which potentially can be replaced by
8004/// X86ISD::ADDSUB operation:
8005/// %Addsub = X86ISD::ADDSUB %AB, %C
8006/// and such ADDSUB can further be replaced with FMADDSUB:
8007/// %Addsub = FMADDSUB %A, %B, %C.
8008///
8009/// The main reason why this method is called before the replacement of the
8010/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8011/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8012/// FMADDSUB is.
8013static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8014 SelectionDAG &DAG,
8015 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8016 unsigned ExpectedUses) {
8017 if (Opnd0.getOpcode() != ISD::FMUL ||
8018 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8019 return false;
8020
8021 // FIXME: These checks must match the similar ones in
8022 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8023 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8024 // or MUL + ADDSUB to FMADDSUB.
8025 const TargetOptions &Options = DAG.getTarget().Options;
8026 bool AllowFusion =
8027 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8028 if (!AllowFusion)
8029 return false;
8030
8031 Opnd2 = Opnd1;
8032 Opnd1 = Opnd0.getOperand(1);
8033 Opnd0 = Opnd0.getOperand(0);
8034
8035 return true;
8036}
8037
8038/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8039/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB
8040/// or X86ISD::FMSUBADD node.
8041static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8042                                       const SDLoc &DL,
8043 const X86Subtarget &Subtarget,
8044 SelectionDAG &DAG) {
8045 SDValue Opnd0, Opnd1;
8046 unsigned NumExtracts;
8047 bool IsSubAdd;
8048 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8049 IsSubAdd))
8050 return SDValue();
8051
8052 MVT VT = BV->getSimpleValueType(0);
8053
8054 // Try to generate X86ISD::FMADDSUB node here.
8055 SDValue Opnd2;
8056 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8057 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8058 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8059 }
8060
8061 // We only support ADDSUB.
8062 if (IsSubAdd)
8063 return SDValue();
8064
8065 // There are no known X86 targets with 512-bit ADDSUB instructions!
8066 // Convert to blend(fsub,fadd).
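    // Illustrative example (not from the original source): for v8f64 this
    // builds the shuffle mask <0, 9, 2, 11, 4, 13, 6, 15>, taking even lanes
    // from the FSUB result and odd lanes from the FADD result.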
8067 if (VT.is512BitVector()) {
8068 SmallVector<int> Mask;
8069 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8070 Mask.push_back(I);
8071 Mask.push_back(I + E + 1);
8072 }
8073 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8074 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8075 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8076 }
8077
8078 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8079}
8080
8081static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8082                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8083 // Initialize outputs to known values.
8084 MVT VT = BV->getSimpleValueType(0);
8085 HOpcode = ISD::DELETED_NODE;
8086 V0 = DAG.getUNDEF(VT);
8087 V1 = DAG.getUNDEF(VT);
8088
8089 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8090 // half of the result is calculated independently from the 128-bit halves of
8091 // the inputs, so that makes the index-checking logic below more complicated.
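  // For example (illustrative): for a v8i32 HADD, destination element 5
  // (chunk i=1, j=1) must be (add (extract_elt V0, 6), (extract_elt V0, 7)),
  // since ExpectedIndex = 1*4 + (1%2)*2 = 6 and j < NumEltsIn64Bits selects V0.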
8092 unsigned NumElts = VT.getVectorNumElements();
8093 unsigned GenericOpcode = ISD::DELETED_NODE;
8094 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8095 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8096 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8097 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8098 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8099 // Ignore undef elements.
8100 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8101 if (Op.isUndef())
8102 continue;
8103
8104 // If there's an opcode mismatch, we're done.
8105 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8106 return false;
8107
8108 // Initialize horizontal opcode.
8109 if (HOpcode == ISD::DELETED_NODE) {
8110 GenericOpcode = Op.getOpcode();
8111 switch (GenericOpcode) {
8112 // clang-format off
8113 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8114 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8115 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8116 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8117 default: return false;
8118 // clang-format on
8119 }
8120 }
8121
8122 SDValue Op0 = Op.getOperand(0);
8123 SDValue Op1 = Op.getOperand(1);
8124 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8125          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8126          Op0.getOperand(0) != Op1.getOperand(0) ||
8127 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8128 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8129 return false;
8130
8131 // The source vector is chosen based on which 64-bit half of the
8132 // destination vector is being calculated.
8133 if (j < NumEltsIn64Bits) {
8134 if (V0.isUndef())
8135 V0 = Op0.getOperand(0);
8136 } else {
8137 if (V1.isUndef())
8138 V1 = Op0.getOperand(0);
8139 }
8140
8141 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8142 if (SourceVec != Op0.getOperand(0))
8143 return false;
8144
8145 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8146 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8147 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8148 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8149 (j % NumEltsIn64Bits) * 2;
8150 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8151 continue;
8152
8153 // If this is not a commutative op, this does not match.
8154 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8155 return false;
8156
8157 // Addition is commutative, so try swapping the extract indexes.
8158 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8159 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8160 continue;
8161
8162 // Extract indexes do not match horizontal requirement.
8163 return false;
8164 }
8165 }
8166 // We matched. Opcode and operands are returned by reference as arguments.
8167 return true;
8168}
8169
8170static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8171                                    const SDLoc &DL, SelectionDAG &DAG,
8172 unsigned HOpcode, SDValue V0, SDValue V1) {
8173 // If either input vector is not the same size as the build vector,
8174 // extract/insert the low bits to the correct size.
8175 // This is free (examples: zmm --> xmm, xmm --> ymm).
8176 MVT VT = BV->getSimpleValueType(0);
8177 unsigned Width = VT.getSizeInBits();
8178 if (V0.getValueSizeInBits() > Width)
8179 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8180 else if (V0.getValueSizeInBits() < Width)
8181 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8182
8183 if (V1.getValueSizeInBits() > Width)
8184 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8185 else if (V1.getValueSizeInBits() < Width)
8186 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8187
8188 unsigned NumElts = VT.getVectorNumElements();
8189 APInt DemandedElts = APInt::getAllOnes(NumElts);
8190 for (unsigned i = 0; i != NumElts; ++i)
8191 if (BV->getOperand(i).isUndef())
8192 DemandedElts.clearBit(i);
8193
8194  // If we don't need the upper xmm, then perform as an xmm hop.
8195 unsigned HalfNumElts = NumElts / 2;
8196 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8197 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8198 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8199 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8200 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8201 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8202 }
8203
8204 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8205}
8206
8207/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
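/// For example (illustrative only; A and B are placeholder v4i32 vectors): on
/// SSSE3, a build_vector of
///   (add a0, a1), (add a2, a3), (add b0, b1), (add b2, b3)
/// where aN/bN are lane-N extracts of A and B becomes (X86ISD::HADD A, B).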
8208static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8209                                   const X86Subtarget &Subtarget,
8210 SelectionDAG &DAG) {
8211 // We need at least 2 non-undef elements to make this worthwhile by default.
8212 unsigned NumNonUndefs =
8213 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8214 if (NumNonUndefs < 2)
8215 return SDValue();
8216
8217 // There are 4 sets of horizontal math operations distinguished by type:
8218 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8219 // subtarget feature. Try to match those "native" patterns first.
8220 MVT VT = BV->getSimpleValueType(0);
8221 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8222 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8223 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8224 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8225 unsigned HOpcode;
8226 SDValue V0, V1;
8227 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8228 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8229 }
8230
8231 // Try harder to match 256-bit ops by using extract/concat.
8232 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8233 return SDValue();
8234
8235  // Count the number of UNDEF operands in the input build_vector.
8236 unsigned NumElts = VT.getVectorNumElements();
8237 unsigned Half = NumElts / 2;
8238 unsigned NumUndefsLO = 0;
8239 unsigned NumUndefsHI = 0;
8240 for (unsigned i = 0, e = Half; i != e; ++i)
8241 if (BV->getOperand(i)->isUndef())
8242 NumUndefsLO++;
8243
8244 for (unsigned i = Half, e = NumElts; i != e; ++i)
8245 if (BV->getOperand(i)->isUndef())
8246 NumUndefsHI++;
8247
8248 SDValue InVec0, InVec1;
8249 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8250 SDValue InVec2, InVec3;
8251 unsigned X86Opcode;
8252 bool CanFold = true;
8253
8254 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8255 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8256 InVec3) &&
8257 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8258 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8259 X86Opcode = X86ISD::HADD;
8260 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8261 InVec1) &&
8262 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8263 InVec3) &&
8264 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8265 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8266 X86Opcode = X86ISD::HSUB;
8267 else
8268 CanFold = false;
8269
8270 if (CanFold) {
8271 // Do not try to expand this build_vector into a pair of horizontal
8272 // add/sub if we can emit a pair of scalar add/sub.
8273 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8274 return SDValue();
8275
8276 // Convert this build_vector into a pair of horizontal binops followed by
8277 // a concat vector. We must adjust the outputs from the partial horizontal
8278 // matching calls above to account for undefined vector halves.
8279 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8280 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8281 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8282 bool isUndefLO = NumUndefsLO == Half;
8283 bool isUndefHI = NumUndefsHI == Half;
8284 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8285 isUndefHI);
8286 }
8287 }
8288
8289 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8290 VT == MVT::v16i16) {
8291 unsigned X86Opcode;
8292 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8293 InVec1))
8294 X86Opcode = X86ISD::HADD;
8295 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8296 InVec1))
8297 X86Opcode = X86ISD::HSUB;
8298 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8299 InVec1))
8300 X86Opcode = X86ISD::FHADD;
8301 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8302 InVec1))
8303 X86Opcode = X86ISD::FHSUB;
8304 else
8305 return SDValue();
8306
8307 // Don't try to expand this build_vector into a pair of horizontal add/sub
8308 // if we can simply emit a pair of scalar add/sub.
8309 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8310 return SDValue();
8311
8312 // Convert this build_vector into two horizontal add/sub followed by
8313 // a concat vector.
8314 bool isUndefLO = NumUndefsLO == Half;
8315 bool isUndefHI = NumUndefsHI == Half;
8316 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8317 isUndefLO, isUndefHI);
8318 }
8319
8320 return SDValue();
8321}
8322
8323static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8324 SelectionDAG &DAG);
8325
8326/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8327/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8328/// just apply the bit operation to the vectors.
8329/// NOTE: It's not in our interest to start making a general-purpose vectorizer
8330/// from this, but enough scalar bit operations are created by the later
8331/// legalization + scalarization stages to need basic support.
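/// For example (illustrative only; a, b, c, d are placeholder scalars):
///   (build_vector (shl a, 2), (shl b, 2), (shl c, 2), (shl d, 2))
/// can be rebuilt as
///   (shl (build_vector a, b, c, d), (build_vector 2, 2, 2, 2))
/// with the vector shift then lowered immediately via LowerShift.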
8332static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8333                                       const X86Subtarget &Subtarget,
8334 SelectionDAG &DAG) {
8335 MVT VT = Op->getSimpleValueType(0);
8336 unsigned NumElems = VT.getVectorNumElements();
8337 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8338
8339 // Check that all elements have the same opcode.
8340 // TODO: Should we allow UNDEFS and if so how many?
8341 unsigned Opcode = Op->getOperand(0).getOpcode();
8342 for (unsigned i = 1; i < NumElems; ++i)
8343 if (Opcode != Op->getOperand(i).getOpcode())
8344 return SDValue();
8345
8346 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8347 bool IsShift = false;
8348 switch (Opcode) {
8349 default:
8350 return SDValue();
8351 case ISD::SHL:
8352 case ISD::SRL:
8353 case ISD::SRA:
8354 IsShift = true;
8355 break;
8356 case ISD::AND:
8357 case ISD::XOR:
8358 case ISD::OR:
8359 // Don't do this if the buildvector is a splat - we'd replace one
8360 // constant with an entire vector.
8361 if (Op->getSplatValue())
8362 return SDValue();
8363 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8364 return SDValue();
8365 break;
8366 }
8367
8368 SmallVector<SDValue, 4> LHSElts, RHSElts;
8369 for (SDValue Elt : Op->ops()) {
8370 SDValue LHS = Elt.getOperand(0);
8371 SDValue RHS = Elt.getOperand(1);
8372
8373 // We expect the canonicalized RHS operand to be the constant.
8374 if (!isa<ConstantSDNode>(RHS))
8375 return SDValue();
8376
8377 // Extend shift amounts.
8378 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8379 if (!IsShift)
8380 return SDValue();
8381 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8382 }
8383
8384 LHSElts.push_back(LHS);
8385 RHSElts.push_back(RHS);
8386 }
8387
8388 // Limit to shifts by uniform immediates.
8389 // TODO: Only accept vXi8/vXi64 special cases?
8390 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8391 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8392 return SDValue();
8393
8394 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8395 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8396 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8397
8398 if (!IsShift)
8399 return Res;
8400
8401 // Immediately lower the shift to ensure the constant build vector doesn't
8402 // get converted to a constant pool before the shift is lowered.
8403 return LowerShift(Res, Subtarget, DAG);
8404}
8405
8406/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8407/// functionality to do this, so it's all zeros, all ones, or some derivation
8408/// that is cheap to calculate.
8409static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8410                                         SelectionDAG &DAG,
8411 const X86Subtarget &Subtarget) {
8412 MVT VT = Op.getSimpleValueType();
8413
8414 // Vectors containing all zeros can be matched by pxor and xorps.
8415 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8416 return Op;
8417
8418 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8419 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8420 // vpcmpeqd on 256-bit vectors.
8421 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8422 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8423 return Op;
8424
8425 return getOnesVector(VT, DAG, DL);
8426 }
8427
8428 return SDValue();
8429}
8430
8431/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8432/// from a vector of source values and a vector of extraction indices.
8433/// The vectors might be manipulated to match the type of the permute op.
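/// For example (illustrative): a v4i32 variable permute on AVX becomes
///   (bitcast (X86ISD::VPERMILPV (bitcast v4f32 SrcVec), v4i32 IndicesVec))
/// while SSSE3-only targets fall back to PSHUFB on v16i8 with scaled indices.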
8434static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8435 const SDLoc &DL, SelectionDAG &DAG,
8436 const X86Subtarget &Subtarget) {
8437 MVT ShuffleVT = VT;
8438 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8439 unsigned NumElts = VT.getVectorNumElements();
8440 unsigned SizeInBits = VT.getSizeInBits();
8441
8442 // Adjust IndicesVec to match VT size.
8443 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8444 "Illegal variable permute mask size");
8445 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8446 // Narrow/widen the indices vector to the correct size.
8447 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8448 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8449 NumElts * VT.getScalarSizeInBits());
8450 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8451 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8452 SDLoc(IndicesVec), SizeInBits);
8453 // Zero-extend the index elements within the vector.
8454 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8455 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8456 IndicesVT, IndicesVec);
8457 }
8458 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8459
8460  // Handle a SrcVec whose size doesn't match VT.
8461 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8462 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8463 // Handle larger SrcVec by treating it as a larger permute.
8464 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8465 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8466 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8467 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8468 Subtarget, DAG, SDLoc(IndicesVec));
8469 SDValue NewSrcVec =
8470 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8471 if (NewSrcVec)
8472 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8473 return SDValue();
8474 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8475 // Widen smaller SrcVec to match VT.
8476 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8477 } else
8478 return SDValue();
8479 }
8480
8481 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8482 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8483 EVT SrcVT = Idx.getValueType();
8484 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8485 uint64_t IndexScale = 0;
8486 uint64_t IndexOffset = 0;
8487
8488 // If we're scaling a smaller permute op, then we need to repeat the
8489 // indices, scaling and offsetting them as well.
8490 // e.g. v4i32 -> v16i8 (Scale = 4)
8491 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8492 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8493 for (uint64_t i = 0; i != Scale; ++i) {
8494 IndexScale |= Scale << (i * NumDstBits);
8495 IndexOffset |= i << (i * NumDstBits);
8496 }
8497
8498 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8499 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8500 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8501 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8502 return Idx;
8503 };
8504
8505 unsigned Opcode = 0;
8506 switch (VT.SimpleTy) {
8507 default:
8508 break;
8509 case MVT::v16i8:
8510 if (Subtarget.hasSSSE3())
8511 Opcode = X86ISD::PSHUFB;
8512 break;
8513 case MVT::v8i16:
8514 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8515 Opcode = X86ISD::VPERMV;
8516 else if (Subtarget.hasSSSE3()) {
8517 Opcode = X86ISD::PSHUFB;
8518 ShuffleVT = MVT::v16i8;
8519 }
8520 break;
8521 case MVT::v4f32:
8522 case MVT::v4i32:
8523 if (Subtarget.hasAVX()) {
8524 Opcode = X86ISD::VPERMILPV;
8525 ShuffleVT = MVT::v4f32;
8526 } else if (Subtarget.hasSSSE3()) {
8527 Opcode = X86ISD::PSHUFB;
8528 ShuffleVT = MVT::v16i8;
8529 }
8530 break;
8531 case MVT::v2f64:
8532 case MVT::v2i64:
8533 if (Subtarget.hasAVX()) {
8534 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8535 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8536 Opcode = X86ISD::VPERMILPV;
8537 ShuffleVT = MVT::v2f64;
8538 } else if (Subtarget.hasSSE41()) {
8539 // SSE41 can compare v2i64 - select between indices 0 and 1.
8540 return DAG.getSelectCC(
8541 DL, IndicesVec,
8542 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8543 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8544 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8545          ISD::CondCode::SETEQ);
8546    }
8547 break;
8548 case MVT::v32i8:
8549 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8550 Opcode = X86ISD::VPERMV;
8551 else if (Subtarget.hasXOP()) {
8552 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8553 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8554 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8555 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8556 return DAG.getNode(
8557          ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8558          DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8559 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8560 } else if (Subtarget.hasAVX()) {
8561 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8562 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8563 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8564 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8565 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8566 ArrayRef<SDValue> Ops) {
8567 // Permute Lo and Hi and then select based on index range.
8568        // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8569        // care about bit[7] as it's just an index vector.
8570 SDValue Idx = Ops[2];
8571 EVT VT = Idx.getValueType();
8572 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8573 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8574 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8575                               ISD::CondCode::SETGT);
8576      };
8577 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8578 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8579 PSHUFBBuilder);
8580 }
8581 break;
8582 case MVT::v16i16:
8583 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8584 Opcode = X86ISD::VPERMV;
8585 else if (Subtarget.hasAVX()) {
8586 // Scale to v32i8 and perform as v32i8.
8587 IndicesVec = ScaleIndices(IndicesVec, 2);
8588 return DAG.getBitcast(
8589          VT, createVariablePermute(
8590                  MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8591 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8592 }
8593 break;
8594 case MVT::v8f32:
8595 case MVT::v8i32:
8596 if (Subtarget.hasAVX2())
8597 Opcode = X86ISD::VPERMV;
8598 else if (Subtarget.hasAVX()) {
8599 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8600 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8601 {0, 1, 2, 3, 0, 1, 2, 3});
8602 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8603 {4, 5, 6, 7, 4, 5, 6, 7});
8604 if (Subtarget.hasXOP())
8605 return DAG.getBitcast(
8606 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8607 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8608 // Permute Lo and Hi and then select based on index range.
8609 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8610 SDValue Res = DAG.getSelectCC(
8611 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8612 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8613 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8614          ISD::CondCode::SETGT);
8615      return DAG.getBitcast(VT, Res);
8616 }
8617 break;
8618 case MVT::v4i64:
8619 case MVT::v4f64:
8620 if (Subtarget.hasAVX512()) {
8621 if (!Subtarget.hasVLX()) {
8622 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8623 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8624 SDLoc(SrcVec));
8625 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8626 DAG, SDLoc(IndicesVec));
8627 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8628 DAG, Subtarget);
8629 return extract256BitVector(Res, 0, DAG, DL);
8630 }
8631 Opcode = X86ISD::VPERMV;
8632 } else if (Subtarget.hasAVX()) {
8633 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8634 SDValue LoLo =
8635 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8636 SDValue HiHi =
8637 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8638 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8639 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8640 if (Subtarget.hasXOP())
8641 return DAG.getBitcast(
8642 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8643 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8644 // Permute Lo and Hi and then select based on index range.
8645 // This works as VPERMILPD only uses index bit[1] to permute elements.
8646 SDValue Res = DAG.getSelectCC(
8647 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8648 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8649 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8650          ISD::CondCode::SETGT);
8651      return DAG.getBitcast(VT, Res);
8652 }
8653 break;
8654 case MVT::v64i8:
8655 if (Subtarget.hasVBMI())
8656 Opcode = X86ISD::VPERMV;
8657 break;
8658 case MVT::v32i16:
8659 if (Subtarget.hasBWI())
8660 Opcode = X86ISD::VPERMV;
8661 break;
8662 case MVT::v16f32:
8663 case MVT::v16i32:
8664 case MVT::v8f64:
8665 case MVT::v8i64:
8666 if (Subtarget.hasAVX512())
8667 Opcode = X86ISD::VPERMV;
8668 break;
8669 }
8670 if (!Opcode)
8671 return SDValue();
8672
8673 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8674 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8675 "Illegal variable permute shuffle type");
8676
8677 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8678 if (Scale > 1)
8679 IndicesVec = ScaleIndices(IndicesVec, Scale);
8680
8681 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8682 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8683
8684 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8685 SDValue Res = Opcode == X86ISD::VPERMV
8686 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8687 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8688 return DAG.getBitcast(VT, Res);
8689}
8690
8691// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8692// reasoned to be a permutation of a vector by indices in a non-constant vector.
8693// (build_vector (extract_elt V, (extract_elt I, 0)),
8694// (extract_elt V, (extract_elt I, 1)),
8695// ...
8696// ->
8697// (vpermv I, V)
8698//
8699// TODO: Handle undefs
8700// TODO: Utilize pshufb and zero mask blending to support more efficient
8701// construction of vectors with constant-0 elements.
8702static SDValue
8703LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8704                                   SelectionDAG &DAG,
8705 const X86Subtarget &Subtarget) {
8706 SDValue SrcVec, IndicesVec;
8707 // Check for a match of the permute source vector and permute index elements.
8708 // This is done by checking that the i-th build_vector operand is of the form:
8709 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8710 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8711 SDValue Op = V.getOperand(Idx);
8712 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8713 return SDValue();
8714
8715 // If this is the first extract encountered in V, set the source vector,
8716 // otherwise verify the extract is from the previously defined source
8717 // vector.
8718 if (!SrcVec)
8719 SrcVec = Op.getOperand(0);
8720 else if (SrcVec != Op.getOperand(0))
8721 return SDValue();
8722 SDValue ExtractedIndex = Op->getOperand(1);
8723 // Peek through extends.
8724 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8725 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8726 ExtractedIndex = ExtractedIndex.getOperand(0);
8727 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8728 return SDValue();
8729
8730 // If this is the first extract from the index vector candidate, set the
8731 // indices vector, otherwise verify the extract is from the previously
8732 // defined indices vector.
8733 if (!IndicesVec)
8734 IndicesVec = ExtractedIndex.getOperand(0);
8735 else if (IndicesVec != ExtractedIndex.getOperand(0))
8736 return SDValue();
8737
8738 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8739 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8740 return SDValue();
8741 }
8742
8743 MVT VT = V.getSimpleValueType();
8744 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8745}
8746
8747SDValue
8748X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8749 SDLoc dl(Op);
8750
8751 MVT VT = Op.getSimpleValueType();
8752 MVT EltVT = VT.getVectorElementType();
8753 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8754 unsigned NumElems = Op.getNumOperands();
8755
8756 // Generate vectors for predicate vectors.
8757 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8758 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8759
8760 if (VT.getVectorElementType() == MVT::bf16 &&
8761 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8762 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8763
8764 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
8765 return VectorCst;
8766
8767 unsigned EVTBits = EltVT.getSizeInBits();
8768 APInt UndefMask = APInt::getZero(NumElems);
8769 APInt FrozenUndefMask = APInt::getZero(NumElems);
8770 APInt ZeroMask = APInt::getZero(NumElems);
8771 APInt NonZeroMask = APInt::getZero(NumElems);
8772 bool IsAllConstants = true;
8773 bool OneUseFrozenUndefs = true;
8774 SmallSet<SDValue, 8> Values;
8775 unsigned NumConstants = NumElems;
8776 for (unsigned i = 0; i < NumElems; ++i) {
8777 SDValue Elt = Op.getOperand(i);
8778 if (Elt.isUndef()) {
8779 UndefMask.setBit(i);
8780 continue;
8781 }
8782 if (ISD::isFreezeUndef(Elt.getNode())) {
8783 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
8784 FrozenUndefMask.setBit(i);
8785 continue;
8786 }
8787 Values.insert(Elt);
8788 if (!isIntOrFPConstant(Elt)) {
8789 IsAllConstants = false;
8790 NumConstants--;
8791 }
8792 if (X86::isZeroNode(Elt)) {
8793 ZeroMask.setBit(i);
8794 } else {
8795 NonZeroMask.setBit(i);
8796 }
8797 }
8798
8799 // All undef vector. Return an UNDEF.
8800 if (UndefMask.isAllOnes())
8801 return DAG.getUNDEF(VT);
8802
8803 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
8804 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
8805 return DAG.getFreeze(DAG.getUNDEF(VT));
8806
8807 // All undef/freeze(undef)/zero vector. Return a zero vector.
8808 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
8809 return getZeroVector(VT, Subtarget, DAG, dl);
8810
8811 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
8812 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
8813 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
8814 // and blend the FREEZE-UNDEF operands back in.
8815 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
8816 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
8817 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
8818 SmallVector<int, 16> BlendMask(NumElems, -1);
8819 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
8820 for (unsigned i = 0; i < NumElems; ++i) {
8821 if (UndefMask[i]) {
8822 BlendMask[i] = -1;
8823 continue;
8824 }
8825 BlendMask[i] = i;
8826 if (!FrozenUndefMask[i])
8827 Elts[i] = Op.getOperand(i);
8828 else
8829 BlendMask[i] += NumElems;
8830 }
8831 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
8832 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
8833 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
8834 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
8835 }
8836
8837 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8838
8839 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
8840 // be better off lowering to a smaller build vector and padding with
8841 // undef/zero.
8842 if ((VT.is256BitVector() || VT.is512BitVector()) &&
8843 !isFoldableUseOfShuffle(BV)) {
8844 unsigned UpperElems = NumElems / 2;
8845 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
8846 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
8847 if (NumUpperUndefsOrZeros >= UpperElems) {
8848 if (VT.is512BitVector() &&
8849 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
8850 UpperElems = NumElems - (NumElems / 4);
8851 // If freeze(undef) is in any upper elements, force to zero.
8852 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
8853 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
8854 SDValue NewBV =
8855 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
8856 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
8857 }
8858 }
8859
8860 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
8861 return AddSub;
8862 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
8863 return HorizontalOp;
8864 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
8865 return Broadcast;
8866 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
8867 return BitOp;
8868
8869 unsigned NumZero = ZeroMask.popcount();
8870 unsigned NumNonZero = NonZeroMask.popcount();
8871
8872 // If we are inserting one variable into a vector of non-zero constants, try
8873 // to avoid loading each constant element as a scalar. Load the constants as a
8874 // vector and then insert the variable scalar element. If insertion is not
8875 // supported, fall back to a shuffle to get the scalar blended with the
8876 // constants. Insertion into a zero vector is handled as a special-case
8877 // somewhere below here.
8878 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8879 FrozenUndefMask.isZero() &&
8880 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8881 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8882 // Create an all-constant vector. The variable element in the old
8883 // build vector is replaced by undef in the constant vector. Save the
8884 // variable scalar element and its index for use in the insertelement.
8885 LLVMContext &Context = *DAG.getContext();
8886 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8887 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8888 SDValue VarElt;
8889 SDValue InsIndex;
8890 for (unsigned i = 0; i != NumElems; ++i) {
8891 SDValue Elt = Op.getOperand(i);
8892 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8893 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8894 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8895 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8896 else if (!Elt.isUndef()) {
8897 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8898 "Expected one variable element in this vector");
8899 VarElt = Elt;
8900 InsIndex = DAG.getVectorIdxConstant(i, dl);
8901 }
8902 }
8903 Constant *CV = ConstantVector::get(ConstVecOps);
8904 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8905
8906 // The constants we just created may not be legal (eg, floating point). We
8907 // must lower the vector right here because we can not guarantee that we'll
8908 // legalize it before loading it. This is also why we could not just create
8909 // a new build vector here. If the build vector contains illegal constants,
8910 // it could get split back up into a series of insert elements.
8911 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8912 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8913 MachineFunction &MF = DAG.getMachineFunction();
8914 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8915 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8916 unsigned InsertC = InsIndex->getAsZExtVal();
8917 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
8918 if (InsertC < NumEltsInLow128Bits)
8919 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8920
8921 // There's no good way to insert into the high elements of a >128-bit
8922 // vector, so use shuffles to avoid an extract/insert sequence.
8923 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
8924 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
8925 SmallVector<int, 8> ShuffleMask;
8926 unsigned NumElts = VT.getVectorNumElements();
8927 for (unsigned i = 0; i != NumElts; ++i)
8928 ShuffleMask.push_back(i == InsertC ? NumElts : i);
8929 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
8930 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
8931 }
8932
8933 // Special case for single non-zero, non-undef, element.
8934 if (NumNonZero == 1) {
8935 unsigned Idx = NonZeroMask.countr_zero();
8936 SDValue Item = Op.getOperand(Idx);
8937
8938 // If we have a constant or non-constant insertion into the low element of
8939 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8940 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8941 // depending on what the source datatype is.
8942 if (Idx == 0) {
8943 if (NumZero == 0)
8944 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8945
8946 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
8947 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
8948 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
8949 assert((VT.is128BitVector() || VT.is256BitVector() ||
8950 VT.is512BitVector()) &&
8951 "Expected an SSE value type!");
8952 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8953 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
8954 // zero vector.
8955 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8956 }
8957
8958 // We can't directly insert an i8 or i16 into a vector, so zero extend
8959 // it to i32 first.
8960 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8961 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8962 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
8963 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8964 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8965 return DAG.getBitcast(VT, Item);
8966 }
8967 }
8968
8969 // Is it a vector logical left shift?
8970 if (NumElems == 2 && Idx == 1 &&
8971 X86::isZeroNode(Op.getOperand(0)) &&
8972 !X86::isZeroNode(Op.getOperand(1))) {
8973 unsigned NumBits = VT.getSizeInBits();
8974 return getVShift(true, VT,
8975 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8976 VT, Op.getOperand(1)),
8977 NumBits/2, DAG, *this, dl);
8978 }
8979
8980 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8981 return SDValue();
8982
8983 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8984 // is a non-constant being inserted into an element other than the low one,
8985 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8986 // movd/movss) to move this into the low element, then shuffle it into
8987 // place.
8988 if (EVTBits == 32) {
8989 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8990 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8991 }
8992 }
8993
8994 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8995 if (Values.size() == 1) {
8996 if (EVTBits == 32) {
8997 // Instead of a shuffle like this:
8998 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8999 // Check if it's possible to issue this instead.
9000 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9001 unsigned Idx = NonZeroMask.countr_zero();
9002 SDValue Item = Op.getOperand(Idx);
9003 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9004 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9005 }
9006 return SDValue();
9007 }
9008
9009 // A vector full of immediates; various special cases are already
9010 // handled, so this is best done with a single constant-pool load.
9011 if (IsAllConstants)
9012 return SDValue();
9013
9014 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9015 return V;
9016
9017 // See if we can use a vector load to get all of the elements.
9018 {
9019 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9020 if (SDValue LD =
9021 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9022 return LD;
9023 }
9024
9025 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9026 // build_vector and broadcast it.
9027 // TODO: We could probably generalize this more.
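 // E.g. a v8i32 build_vector <a,b,a,b,a,b,a,b> becomes a v4i32 build_vector
 // <a,b,u,u>, which is bitcast to v2i64, broadcast to v4i64 and bitcast back
 // to v8i32.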
9028 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9029 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9030 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9031 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9032 // Make sure all the even/odd operands match.
9033 for (unsigned i = 2; i != NumElems; ++i)
9034 if (Ops[i % 2] != Op.getOperand(i))
9035 return false;
9036 return true;
9037 };
9038 if (CanSplat(Op, NumElems, Ops)) {
9039 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9040 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9041 // Create a new build vector and cast to v2i64/v2f64.
9042 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9043 DAG.getBuildVector(NarrowVT, dl, Ops));
9044 // Broadcast from v2i64/v2f64 and cast to final VT.
9045 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9046 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9047 NewBV));
9048 }
9049 }
9050
9051 // For AVX-length vectors, build the individual 128-bit pieces and use
9052 // shuffles to put them in place.
9053 if (VT.getSizeInBits() > 128) {
9054 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9055
9056 // Build both the lower and upper subvector.
9057 SDValue Lower =
9058 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9059 SDValue Upper = DAG.getBuildVector(
9060 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9061
9062 // Recreate the wider vector with the lower and upper part.
9063 return concatSubVectors(Lower, Upper, DAG, dl);
9064 }
9065
9066 // Let legalizer expand 2-wide build_vectors.
9067 if (EVTBits == 64) {
9068 if (NumNonZero == 1) {
9069 // One half is zero or undef.
9070 unsigned Idx = NonZeroMask.countr_zero();
9071 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9072 Op.getOperand(Idx));
9073 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9074 }
9075 return SDValue();
9076 }
9077
9078 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9079 if (EVTBits == 8 && NumElems == 16)
9080 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9081 NumZero, DAG, Subtarget))
9082 return V;
9083
9084 if (EltVT == MVT::i16 && NumElems == 8)
9085 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9086 NumZero, DAG, Subtarget))
9087 return V;
9088
9089 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9090 if (EVTBits == 32 && NumElems == 4)
9091 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9092 return V;
9093
9094 // If element VT is == 32 bits, turn it into a number of shuffles.
9095 if (NumElems == 4 && NumZero > 0) {
9096 SmallVector<SDValue, 8> Ops(NumElems);
9097 for (unsigned i = 0; i < 4; ++i) {
9098 bool isZero = !NonZeroMask[i];
9099 if (isZero)
9100 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9101 else
9102 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9103 }
9104
9105 for (unsigned i = 0; i < 2; ++i) {
9106 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9107 default: llvm_unreachable("Unexpected NonZero count");
9108 case 0:
9109 Ops[i] = Ops[i*2]; // Must be a zero vector.
9110 break;
9111 case 1:
9112 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9113 break;
9114 case 2:
9115 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9116 break;
9117 case 3:
9118 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9119 break;
9120 }
9121 }
9122
9123 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9124 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9125 int MaskVec[] = {
9126 Reverse1 ? 1 : 0,
9127 Reverse1 ? 0 : 1,
9128 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9129 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9130 };
9131 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9132 }
9133
9134 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9135
9136 // Check for a build vector from mostly shuffle plus few inserting.
9137 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9138 return Sh;
9139
9140 // For SSE 4.1, use insertps to put the high elements into the low element.
9141 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9142 SDValue Result;
9143 if (!Op.getOperand(0).isUndef())
9144 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9145 else
9146 Result = DAG.getUNDEF(VT);
9147
9148 for (unsigned i = 1; i < NumElems; ++i) {
9149 if (Op.getOperand(i).isUndef()) continue;
9150 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9151 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9152 }
9153 return Result;
9154 }
9155
9156 // Otherwise, expand into a number of unpckl*, start by extending each of
9157 // our (non-undef) elements to the full vector width with the element in the
9158 // bottom slot of the vector (which generates no code for SSE).
9159 SmallVector<SDValue, 8> Ops(NumElems);
9160 for (unsigned i = 0; i < NumElems; ++i) {
9161 if (!Op.getOperand(i).isUndef())
9162 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9163 else
9164 Ops[i] = DAG.getUNDEF(VT);
9165 }
9166
9167 // Next, we iteratively mix elements, e.g. for v4f32:
9168 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9169 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9170 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9171 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9172 // Generate scaled UNPCKL shuffle mask.
9173 SmallVector<int, 16> Mask;
9174 for(unsigned i = 0; i != Scale; ++i)
9175 Mask.push_back(i);
9176 for (unsigned i = 0; i != Scale; ++i)
9177 Mask.push_back(NumElems+i);
9178 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9179
9180 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9181 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9182 }
9183 return Ops[0];
9184}
9185
9186// 256-bit AVX can use the vinsertf128 instruction
9187// to create 256-bit vectors from two other 128-bit ones.
9188// TODO: Detect subvector broadcast here instead of DAG combine?
9189static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9190 const X86Subtarget &Subtarget) {
9191 SDLoc dl(Op);
9192 MVT ResVT = Op.getSimpleValueType();
9193
9194 assert((ResVT.is256BitVector() ||
9195 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9196
9197 unsigned NumOperands = Op.getNumOperands();
9198 unsigned NumFreezeUndef = 0;
9199 unsigned NumZero = 0;
9200 unsigned NumNonZero = 0;
9201 unsigned NonZeros = 0;
9202 for (unsigned i = 0; i != NumOperands; ++i) {
9203 SDValue SubVec = Op.getOperand(i);
9204 if (SubVec.isUndef())
9205 continue;
9206 if (ISD::isFreezeUndef(SubVec.getNode())) {
9207 // If the freeze(undef) has multiple uses then we must fold to zero.
9208 if (SubVec.hasOneUse())
9209 ++NumFreezeUndef;
9210 else
9211 ++NumZero;
9212 }
9213 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9214 ++NumZero;
9215 else {
9216 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9217 NonZeros |= 1 << i;
9218 ++NumNonZero;
9219 }
9220 }
9221
9222 // If we have more than 2 non-zeros, build each half separately.
9223 if (NumNonZero > 2) {
9224 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9225 ArrayRef<SDUse> Ops = Op->ops();
9226 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9227 Ops.slice(0, NumOperands/2));
9228 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9229 Ops.slice(NumOperands/2));
9230 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9231 }
9232
9233 // Otherwise, build it up through insert_subvectors.
9234 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9235 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9236 : DAG.getUNDEF(ResVT));
9237
9238 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9239 unsigned NumSubElems = SubVT.getVectorNumElements();
9240 for (unsigned i = 0; i != NumOperands; ++i) {
9241 if ((NonZeros & (1 << i)) == 0)
9242 continue;
9243
9244 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9245 Op.getOperand(i),
9246 DAG.getIntPtrConstant(i * NumSubElems, dl));
9247 }
9248
9249 return Vec;
9250}
9251
9252// Returns true if the given node is a type promotion (by concatenating i1
9253// zeros) of the result of a node that already zeros all upper bits of
9254// k-register.
9255// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9256static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9257 const X86Subtarget &Subtarget,
9258 SelectionDAG & DAG) {
9259 SDLoc dl(Op);
9260 MVT ResVT = Op.getSimpleValueType();
9261 unsigned NumOperands = Op.getNumOperands();
9262
9263 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9264 "Unexpected number of operands in CONCAT_VECTORS");
9265
9266 uint64_t Zeros = 0;
9267 uint64_t NonZeros = 0;
9268 for (unsigned i = 0; i != NumOperands; ++i) {
9269 SDValue SubVec = Op.getOperand(i);
9270 if (SubVec.isUndef())
9271 continue;
9272 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9273 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9274 Zeros |= (uint64_t)1 << i;
9275 else
9276 NonZeros |= (uint64_t)1 << i;
9277 }
9278
9279 unsigned NumElems = ResVT.getVectorNumElements();
9280
9281 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9282 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9283 // insert_subvector will give us two kshifts.
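 // E.g. for concat(zero v2i1, X v2i1, undef v2i1, undef v2i1), X is widened
 // and shifted left by its bit offset (1 * 2 = 2) with a single KSHIFTL.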
9284 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9285 Log2_64(NonZeros) != NumOperands - 1) {
9286 unsigned Idx = Log2_64(NonZeros);
9287 SDValue SubVec = Op.getOperand(Idx);
9288 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9289 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9290 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9291 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9292 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9293 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9294 DAG.getIntPtrConstant(0, dl));
9295 }
9296
9297 // If there are zero or one non-zeros we can handle this very simply.
9298 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9299 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9300 if (!NonZeros)
9301 return Vec;
9302 unsigned Idx = Log2_64(NonZeros);
9303 SDValue SubVec = Op.getOperand(Idx);
9304 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9305 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9306 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9307 }
9308
9309 if (NumOperands > 2) {
9310 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9311 ArrayRef<SDUse> Ops = Op->ops();
9312 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9313 Ops.slice(0, NumOperands/2));
9314 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9315 Ops.slice(NumOperands/2));
9316 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9317 }
9318
9319 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9320
9321 if (ResVT.getVectorNumElements() >= 16)
9322 return Op; // The operation is legal with KUNPCK
9323
9324 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9325 DAG.getUNDEF(ResVT), Op.getOperand(0),
9326 DAG.getIntPtrConstant(0, dl));
9327 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9328 DAG.getIntPtrConstant(NumElems/2, dl));
9329}
9330
9331static SDValue LowerCONCAT_VECTORS(SDValue Op,
9332 const X86Subtarget &Subtarget,
9333 SelectionDAG &DAG) {
9334 MVT VT = Op.getSimpleValueType();
9335 if (VT.getVectorElementType() == MVT::i1)
9336 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9337
9338 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9339 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9340 Op.getNumOperands() == 4)));
9341
9342 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9343 // from two other 128-bit ones.
9344
9345 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9346 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9347}
9348
9349//===----------------------------------------------------------------------===//
9350// Vector shuffle lowering
9351//
9352// This is an experimental code path for lowering vector shuffles on x86. It is
9353// designed to handle arbitrary vector shuffles and blends, gracefully
9354// degrading performance as necessary. It works hard to recognize idiomatic
9355// shuffles and lower them to optimal instruction patterns without leaving
9356// a framework that allows reasonably efficient handling of all vector shuffle
9357// patterns.
9358//===----------------------------------------------------------------------===//
9359
9360/// Tiny helper function to identify a no-op mask.
9361///
9362/// This is a somewhat boring predicate function. It checks whether the mask
9363/// array input, which is assumed to be a single-input shuffle mask of the kind
9364/// used by the X86 shuffle instructions (not a fully general
9365/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9366/// in-place shuffle are 'no-op's.
9367static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9368 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9369 assert(Mask[i] >= -1 && "Out of bound mask element!");
9370 if (Mask[i] >= 0 && Mask[i] != i)
9371 return false;
9372 }
9373 return true;
9374}
9375
9376/// Test whether there are elements crossing LaneSizeInBits lanes in this
9377/// shuffle mask.
9378///
9379/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9380/// and we routinely test for these.
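/// For example, with 128-bit lanes and 32-bit scalars (4 elements per lane),
/// the v8 mask <4,5,6,7,0,1,2,3> is lane-crossing, while <1,0,3,2,5,4,7,6>
/// keeps every element within its own lane.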
9381static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9382 unsigned ScalarSizeInBits,
9383 ArrayRef<int> Mask) {
9384 assert(LaneSizeInBits && ScalarSizeInBits &&
9385 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9386 "Illegal shuffle lane size");
9387 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9388 int Size = Mask.size();
9389 for (int i = 0; i < Size; ++i)
9390 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9391 return true;
9392 return false;
9393}
9394
9395/// Test whether there are elements crossing 128-bit lanes in this
9396/// shuffle mask.
9397static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9398 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9399}
9400
9401/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9402/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9403/// better support 'repeated mask + lane permute' style shuffles.
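/// For example, the v8i32 mask <4,5,6,7,0,1,2,3> crosses 128-bit lanes but is
/// not multi-lane: each destination lane reads from a single source lane, so
/// it can be lowered as a lane permute of a repeated in-lane mask.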
9404static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9405 unsigned ScalarSizeInBits,
9406 ArrayRef<int> Mask) {
9407 assert(LaneSizeInBits && ScalarSizeInBits &&
9408 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9409 "Illegal shuffle lane size");
9410 int NumElts = Mask.size();
9411 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9412 int NumLanes = NumElts / NumEltsPerLane;
9413 if (NumLanes > 1) {
9414 for (int i = 0; i != NumLanes; ++i) {
9415 int SrcLane = -1;
9416 for (int j = 0; j != NumEltsPerLane; ++j) {
9417 int M = Mask[(i * NumEltsPerLane) + j];
9418 if (M < 0)
9419 continue;
9420 int Lane = (M % NumElts) / NumEltsPerLane;
9421 if (SrcLane >= 0 && SrcLane != Lane)
9422 return true;
9423 SrcLane = Lane;
9424 }
9425 }
9426 }
9427 return false;
9428}
9429
9430/// Test whether a shuffle mask is equivalent within each sub-lane.
9431///
9432/// This checks a shuffle mask to see if it is performing the same
9433/// lane-relative shuffle in each sub-lane. This trivially implies
9434/// that it is also not lane-crossing. It may however involve a blend from the
9435/// same lane of a second vector.
9436///
9437/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9438/// non-trivial to compute in the face of undef lanes. The representation is
9439/// suitable for use with existing 128-bit shuffles as entries from the second
9440/// vector have been remapped to [LaneSize, 2*LaneSize).
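/// For example, the v8f32 mask <0,1,8,9,4,5,12,13> repeats within each 128-bit
/// lane and produces the RepeatedMask <0,1,4,5>, with second-vector elements
/// remapped into [4, 8).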
9441static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9442 ArrayRef<int> Mask,
9443 SmallVectorImpl<int> &RepeatedMask) {
9444 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9445 RepeatedMask.assign(LaneSize, -1);
9446 int Size = Mask.size();
9447 for (int i = 0; i < Size; ++i) {
9448 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9449 if (Mask[i] < 0)
9450 continue;
9451 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9452 // This entry crosses lanes, so there is no way to model this shuffle.
9453 return false;
9454
9455 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9456 // Adjust second vector indices to start at LaneSize instead of Size.
9457 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9458 : Mask[i] % LaneSize + LaneSize;
9459 if (RepeatedMask[i % LaneSize] < 0)
9460 // This is the first non-undef entry in this slot of a 128-bit lane.
9461 RepeatedMask[i % LaneSize] = LocalM;
9462 else if (RepeatedMask[i % LaneSize] != LocalM)
9463 // Found a mismatch with the repeated mask.
9464 return false;
9465 }
9466 return true;
9467}
9468
9469/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9470static bool
9471is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9472 SmallVectorImpl<int> &RepeatedMask) {
9473 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9474}
9475
9476static bool
9477is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9478 SmallVector<int, 32> RepeatedMask;
9479 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9480}
9481
9482/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9483static bool
9484is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9485 SmallVectorImpl<int> &RepeatedMask) {
9486 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9487}
9488
9489/// Test whether a target shuffle mask is equivalent within each sub-lane.
9490/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9491static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9492 unsigned EltSizeInBits,
9493 ArrayRef<int> Mask,
9494 SmallVectorImpl<int> &RepeatedMask) {
9495 int LaneSize = LaneSizeInBits / EltSizeInBits;
9496 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9497 int Size = Mask.size();
9498 for (int i = 0; i < Size; ++i) {
9499 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9500 if (Mask[i] == SM_SentinelUndef)
9501 continue;
9502 if (Mask[i] == SM_SentinelZero) {
9503 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9504 return false;
9505 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9506 continue;
9507 }
9508 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9509 // This entry crosses lanes, so there is no way to model this shuffle.
9510 return false;
9511
9512 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9513 // later vector indices to start at multiples of LaneSize instead of Size.
9514 int LaneM = Mask[i] / Size;
9515 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9516 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9517 // This is the first non-undef entry in this slot of a 128-bit lane.
9518 RepeatedMask[i % LaneSize] = LocalM;
9519 else if (RepeatedMask[i % LaneSize] != LocalM)
9520 // Found a mismatch with the repeated mask.
9521 return false;
9522 }
9523 return true;
9524}
9525
9526/// Test whether a target shuffle mask is equivalent within each sub-lane.
9527/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9528static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9529 ArrayRef<int> Mask,
9530 SmallVectorImpl<int> &RepeatedMask) {
9531 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9532 Mask, RepeatedMask);
9533}
9534
9535/// Checks whether the vector elements referenced by two shuffle masks are
9536/// equivalent.
9537static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9538 int Idx, int ExpectedIdx) {
9539 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9540 ExpectedIdx < MaskSize && "Out of range element index");
9541 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9542 return false;
9543
9544 switch (Op.getOpcode()) {
9545 case ISD::BUILD_VECTOR:
9546 // If the values are build vectors, we can look through them to find
9547 // equivalent inputs that make the shuffles equivalent.
9548 // TODO: Handle MaskSize != Op.getNumOperands()?
9549 if (MaskSize == (int)Op.getNumOperands() &&
9550 MaskSize == (int)ExpectedOp.getNumOperands())
9551 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9552 break;
9553 case X86ISD::VBROADCAST:
9554 case X86ISD::VBROADCAST_LOAD:
9555 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9556 return (Op == ExpectedOp &&
9557 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9558 case X86ISD::HADD:
9559 case X86ISD::HSUB:
9560 case X86ISD::FHADD:
9561 case X86ISD::FHSUB:
9562 case X86ISD::PACKSS:
9563 case X86ISD::PACKUS:
9564 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9565 // TODO: Handle MaskSize != NumElts?
9566 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9567 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9568 MVT VT = Op.getSimpleValueType();
9569 int NumElts = VT.getVectorNumElements();
9570 if (MaskSize == NumElts) {
9571 int NumLanes = VT.getSizeInBits() / 128;
9572 int NumEltsPerLane = NumElts / NumLanes;
9573 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9574 bool SameLane =
9575 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9576 bool SameElt =
9577 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9578 return SameLane && SameElt;
9579 }
9580 }
9581 break;
9582 }
9583
9584 return false;
9585}
9586
9587/// Checks whether a shuffle mask is equivalent to an explicit list of
9588/// arguments.
9589///
9590/// This is a fast way to test a shuffle mask against a fixed pattern:
9591///
9592/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9593///
9594/// It returns true if the mask is exactly as wide as the argument list, and
9595/// each element of the mask is either -1 (signifying undef) or the value given
9596/// in the argument.
9597static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9598 SDValue V1 = SDValue(),
9599 SDValue V2 = SDValue()) {
9600 int Size = Mask.size();
9601 if (Size != (int)ExpectedMask.size())
9602 return false;
9603
9604 for (int i = 0; i < Size; ++i) {
9605 assert(Mask[i] >= -1 && "Out of bound mask element!");
9606 int MaskIdx = Mask[i];
9607 int ExpectedIdx = ExpectedMask[i];
9608 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9609 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9610 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9611 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9612 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9613 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9614 return false;
9615 }
9616 }
9617 return true;
9618}
9619
9620/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9621///
9622/// The masks must be exactly the same width.
9623///
9624/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9625/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9626///
9627/// SM_SentinelZero is accepted as a valid negative index but must match in
9628/// both, or via a known bits test.
9630 ArrayRef<int> ExpectedMask,
9631 const SelectionDAG &DAG,
9632 SDValue V1 = SDValue(),
9633 SDValue V2 = SDValue()) {
9634 int Size = Mask.size();
9635 if (Size != (int)ExpectedMask.size())
9636 return false;
9637 assert(llvm::all_of(ExpectedMask,
9638 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9639 "Illegal target shuffle mask");
9640
9641 // Check for out-of-range target shuffle mask indices.
9642 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9643 return false;
9644
9645 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9646 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9647 !V1.getValueType().isVector()))
9648 V1 = SDValue();
9649 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9650 !V2.getValueType().isVector()))
9651 V2 = SDValue();
9652
9653 APInt ZeroV1 = APInt::getZero(Size);
9654 APInt ZeroV2 = APInt::getZero(Size);
9655
9656 for (int i = 0; i < Size; ++i) {
9657 int MaskIdx = Mask[i];
9658 int ExpectedIdx = ExpectedMask[i];
9659 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9660 continue;
9661 if (MaskIdx == SM_SentinelZero) {
9662 // If we need this expected index to be a zero element, then update the
9663 // relevant zero mask and perform the known bits at the end to minimize
9664 // repeated computes.
9665 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9666 if (ExpectedV &&
9667 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9668 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9669 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9670 ZeroMask.setBit(BitIdx);
9671 continue;
9672 }
9673 }
9674 if (MaskIdx >= 0) {
9675 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9676 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9677 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9678 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9679 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9680 continue;
9681 }
9682 return false;
9683 }
9684 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9685 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9686}
9687
9688// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9689// instructions.
9690static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9691 const SelectionDAG &DAG) {
9692 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9693 return false;
9694
9695 SmallVector<int, 8> Unpcklwd;
9696 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9697 /* Unary = */ false);
9698 SmallVector<int, 8> Unpckhwd;
9699 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9700 /* Unary = */ false);
9701 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9702 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9703 return IsUnpackwdMask;
9704}
9705
9706static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9707 const SelectionDAG &DAG) {
9708 // Create 128-bit vector type based on mask size.
9709 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9710 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9711
9712 // We can't assume a canonical shuffle mask, so try the commuted version too.
9713 SmallVector<int, 4> CommutedMask(Mask);
9714 ShuffleVectorSDNode::commuteMask(CommutedMask);
9715
9716 // Match any of unary/binary or low/high.
9717 for (unsigned i = 0; i != 4; ++i) {
9718 SmallVector<int, 16> UnpackMask;
9719 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9720 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9721 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9722 return true;
9723 }
9724 return false;
9725}
9726
9727/// Return true if a shuffle mask chooses elements identically in its top and
9728/// bottom halves. For example, any splat mask has the same top and bottom
9729/// halves. If an element is undefined in only one half of the mask, the halves
9730/// are not considered identical.
9731static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9732 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9733 unsigned HalfSize = Mask.size() / 2;
9734 for (unsigned i = 0; i != HalfSize; ++i) {
9735 if (Mask[i] != Mask[i + HalfSize])
9736 return false;
9737 }
9738 return true;
9739}
9740
9741/// Get a 4-lane 8-bit shuffle immediate for a mask.
9742///
9743/// This helper function produces an 8-bit shuffle immediate corresponding to
9744/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9745/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9746/// example.
9747///
9748/// NB: We rely heavily on "undef" masks preserving the input lane.
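/// For example, the mask <1,0,3,2> encodes as 0b10110001 (0xB1): bits [1:0]
/// select element 1, bits [3:2] element 0, bits [5:4] element 3 and
/// bits [7:6] element 2.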
9749static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9750 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9751 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9752 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9753 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9754 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9755
9756 // If the mask only uses one non-undef element, then fully 'splat' it to
9757 // improve later broadcast matching.
9758 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9759 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9760
9761 int FirstElt = Mask[FirstIndex];
9762 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9763 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
9764
9765 unsigned Imm = 0;
9766 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9767 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9768 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9769 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9770 return Imm;
9771}
9772
9773static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9774 SelectionDAG &DAG) {
9775 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9776}
9777
9778// The shuffle result is as follows:
9779// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
9780// Each element of Zeroable corresponds to a particular Mask element,
9781// as described in the computeZeroableShuffleElements function.
9782//
9783// The function looks for a sub-mask in which the nonzero elements are in
9784// increasing order. If such a sub-mask exists, the function returns true.
9785static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9786 ArrayRef<int> Mask, const EVT &VectorType,
9787 bool &IsZeroSideLeft) {
9788 int NextElement = -1;
9789 // Check if the Mask's nonzero elements are in increasing order.
9790 for (int i = 0, e = Mask.size(); i < e; i++) {
9791 // Checks if the mask's zeros elements are built from only zeros.
9792 assert(Mask[i] >= -1 && "Out of bound mask element!");
9793 if (Mask[i] < 0)
9794 return false;
9795 if (Zeroable[i])
9796 continue;
9797 // Find the lowest non zero element
9798 if (NextElement < 0) {
9799 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9800 IsZeroSideLeft = NextElement != 0;
9801 }
9802 // Exit if the mask's non zero elements are not in increasing order.
9803 if (NextElement != Mask[i])
9804 return false;
9805 NextElement++;
9806 }
9807 return true;
9808}
9809
9810/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
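/// The mask is scaled to byte granularity, e.g. a v4i32 mask element of 1
/// becomes the byte indices <4,5,6,7>; a byte with its sign bit set (0x80)
/// zeroes that byte of the result.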
9811static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9812 ArrayRef<int> Mask, SDValue V1,
9813 SDValue V2, const APInt &Zeroable,
9814 const X86Subtarget &Subtarget,
9815 SelectionDAG &DAG) {
9816 int Size = Mask.size();
9817 int LaneSize = 128 / VT.getScalarSizeInBits();
9818 const int NumBytes = VT.getSizeInBits() / 8;
9819 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9820
9821 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9822 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9823 (Subtarget.hasBWI() && VT.is512BitVector()));
9824
9825 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9826 // Sign bit set in i8 mask means zero element.
9827 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9828
9829 SDValue V;
9830 for (int i = 0; i < NumBytes; ++i) {
9831 int M = Mask[i / NumEltBytes];
9832 if (M < 0) {
9833 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9834 continue;
9835 }
9836 if (Zeroable[i / NumEltBytes]) {
9837 PSHUFBMask[i] = ZeroMask;
9838 continue;
9839 }
9840
9841 // We can only use a single input of V1 or V2.
9842 SDValue SrcV = (M >= Size ? V2 : V1);
9843 if (V && V != SrcV)
9844 return SDValue();
9845 V = SrcV;
9846 M %= Size;
9847
9848 // PSHUFB can't cross lanes, ensure this doesn't happen.
9849 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9850 return SDValue();
9851
9852 M = M % LaneSize;
9853 M = M * NumEltBytes + (i % NumEltBytes);
9854 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9855 }
9856 assert(V && "Failed to find a source input");
9857
9858 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9859 return DAG.getBitcast(
9860 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9861 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9862}
9863
9864static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9865 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9866 const SDLoc &dl);
9867
9868// X86 has dedicated shuffle that can be lowered to VEXPAND
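// E.g. with elements 0 and 2 zeroable, a v4 mask <Z,0,Z,1> expands the first
// two source elements into positions 1 and 3 and zeroes the rest
// (VEXPANDMask = 0b1010).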
9869static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
9870 const APInt &Zeroable,
9871 ArrayRef<int> Mask, SDValue &V1,
9872 SDValue &V2, SelectionDAG &DAG,
9873 const X86Subtarget &Subtarget) {
9874 bool IsLeftZeroSide = true;
9875 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9876 IsLeftZeroSide))
9877 return SDValue();
9878 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9879 MVT IntegerType =
9880 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9881 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9882 unsigned NumElts = VT.getVectorNumElements();
9883 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9884 "Unexpected number of vector elements");
9885 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9886 Subtarget, DAG, DL);
9887 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9888 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9889 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
9890}
9891
9892static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9893 unsigned &UnpackOpcode, bool IsUnary,
9894 ArrayRef<int> TargetMask, const SDLoc &DL,
9895 SelectionDAG &DAG,
9896 const X86Subtarget &Subtarget) {
9897 int NumElts = VT.getVectorNumElements();
9898
9899 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9900 for (int i = 0; i != NumElts; i += 2) {
9901 int M1 = TargetMask[i + 0];
9902 int M2 = TargetMask[i + 1];
9903 Undef1 &= (SM_SentinelUndef == M1);
9904 Undef2 &= (SM_SentinelUndef == M2);
9905 Zero1 &= isUndefOrZero(M1);
9906 Zero2 &= isUndefOrZero(M2);
9907 }
9908 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9909 "Zeroable shuffle detected");
9910
9911 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9912 SmallVector<int, 64> Unpckl, Unpckh;
9913 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9914 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
9915 (IsUnary ? V1 : V2))) {
9916 UnpackOpcode = X86ISD::UNPCKL;
9917 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9918 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9919 return true;
9920 }
9921
9922 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9923 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
9924 (IsUnary ? V1 : V2))) {
9925 UnpackOpcode = X86ISD::UNPCKH;
9926 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9927 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9928 return true;
9929 }
9930
9931 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9932 if (IsUnary && (Zero1 || Zero2)) {
9933 // Don't bother if we can blend instead.
9934 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9935 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9936 return false;
9937
9938 bool MatchLo = true, MatchHi = true;
9939 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9940 int M = TargetMask[i];
9941
9942 // Ignore if the input is known to be zero or the index is undef.
9943 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9944 (M == SM_SentinelUndef))
9945 continue;
9946
9947 MatchLo &= (M == Unpckl[i]);
9948 MatchHi &= (M == Unpckh[i]);
9949 }
9950
9951 if (MatchLo || MatchHi) {
9952 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9953 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9954 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9955 return true;
9956 }
9957 }
9958
9959 // If a binary shuffle, commute and try again.
9960 if (!IsUnary) {
9961 ShuffleVectorSDNode::commuteMask(Unpckl);
9962 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
9963 UnpackOpcode = X86ISD::UNPCKL;
9964 std::swap(V1, V2);
9965 return true;
9966 }
9967
9968 ShuffleVectorSDNode::commuteMask(Unpckh);
9969 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
9970 UnpackOpcode = X86ISD::UNPCKH;
9971 std::swap(V1, V2);
9972 return true;
9973 }
9974 }
9975
9976 return false;
9977}
9978
9979// X86 has dedicated unpack instructions that can handle specific blend
9980// operations: UNPCKH and UNPCKL.
9981static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9982 ArrayRef<int> Mask, SDValue V1, SDValue V2,
9983 SelectionDAG &DAG) {
9984 SmallVector<int, 8> Unpckl;
9985 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9986 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9987 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9988
9989 SmallVector<int, 8> Unpckh;
9990 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9991 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
9992 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9993
9994 // Commute and try again.
9995 ShuffleVectorSDNode::commuteMask(Unpckl);
9996 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
9997 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9998
9999 ShuffleVectorSDNode::commuteMask(Unpckh);
10000 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10001 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10002
10003 return SDValue();
10004}
10005
10006/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10007/// followed by unpack 256-bit.
10008static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
10009 ArrayRef<int> Mask, SDValue V1,
10010 SDValue V2, SelectionDAG &DAG) {
10011 SmallVector<int, 32> Unpckl, Unpckh;
10012 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10013 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10014
10015 unsigned UnpackOpcode;
10016 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10017 UnpackOpcode = X86ISD::UNPCKL;
10018 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10019 UnpackOpcode = X86ISD::UNPCKH;
10020 else
10021 return SDValue();
10022
10023 // This is a "natural" unpack operation (rather than the 128-bit sectored
10024 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10025 // input in order to use the x86 instruction.
10026 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10027 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10028 V1 = DAG.getBitcast(VT, V1);
10029 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10030}
10031
10032// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10033// source into the lower elements and zeroing the upper elements.
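// E.g. a v8i16 mask <0,2,4,6,Z,Z,Z,Z> with the upper half zeroable matches a
// truncation of a v4i32 source into the low four elements.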
10034static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10035 ArrayRef<int> Mask, const APInt &Zeroable,
10036 const X86Subtarget &Subtarget) {
10037 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10038 return false;
10039
10040 unsigned NumElts = Mask.size();
10041 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10042 unsigned MaxScale = 64 / EltSizeInBits;
10043
10044 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10045 unsigned SrcEltBits = EltSizeInBits * Scale;
10046 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10047 continue;
10048 unsigned NumSrcElts = NumElts / Scale;
10049 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10050 continue;
10051 unsigned UpperElts = NumElts - NumSrcElts;
10052 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10053 continue;
10054 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10055 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10056 DstVT = MVT::getIntegerVT(EltSizeInBits);
10057 if ((NumSrcElts * EltSizeInBits) >= 128) {
10058 // ISD::TRUNCATE
10059 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10060 } else {
10061 // X86ISD::VTRUNC
10062 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10063 }
10064 return true;
10065 }
10066
10067 return false;
10068}
10069
10070// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10071// element padding to the final DstVT.
10072static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10073 const X86Subtarget &Subtarget,
10074 SelectionDAG &DAG, bool ZeroUppers) {
10075 MVT SrcVT = Src.getSimpleValueType();
10076 MVT DstSVT = DstVT.getScalarType();
10077 unsigned NumDstElts = DstVT.getVectorNumElements();
10078 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10079 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10080
10081 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10082 return SDValue();
10083
10084 // Perform a direct ISD::TRUNCATE if possible.
10085 if (NumSrcElts == NumDstElts)
10086 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10087
10088 if (NumSrcElts > NumDstElts) {
10089 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10090 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10091 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10092 }
10093
10094 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10095 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10096 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10097 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10098 DstVT.getSizeInBits());
10099 }
10100
10101 // Non-VLX targets must truncate from a 512-bit type, so we need to
10102 // widen, truncate and then possibly extract the original subvector.
10103 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10104 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10105 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10106 }
10107
10108 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10109 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10110 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10111 if (DstVT != TruncVT)
10112 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10113 DstVT.getSizeInBits());
10114 return Trunc;
10115}
10116
10117// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10118//
10119// An example is the following:
10120//
10121// t0: ch = EntryToken
10122// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10123// t25: v4i32 = truncate t2
10124// t41: v8i16 = bitcast t25
10125// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10126// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10127// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10128// t18: v2i64 = bitcast t51
10129//
10130// One can just use a single vpmovdw instruction; without avx512vl we need to
10131// use the zmm variant and extract the lower subvector, padding with zeroes.
10132// TODO: Merge with lowerShuffleAsVTRUNC.
10133static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10134 SDValue V2, ArrayRef<int> Mask,
10135 const APInt &Zeroable,
10136 const X86Subtarget &Subtarget,
10137 SelectionDAG &DAG) {
10138 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10139 if (!Subtarget.hasAVX512())
10140 return SDValue();
10141
10142 unsigned NumElts = VT.getVectorNumElements();
10143 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10144 unsigned MaxScale = 64 / EltSizeInBits;
10145 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10146 unsigned SrcEltBits = EltSizeInBits * Scale;
10147 unsigned NumSrcElts = NumElts / Scale;
10148 unsigned UpperElts = NumElts - NumSrcElts;
10149 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10150 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10151 continue;
10152
10153 // Attempt to find a matching source truncation, but as a fall back VLX
10154 // cases can use the VPMOV directly.
10155 SDValue Src = peekThroughBitcasts(V1);
10156 if (Src.getOpcode() == ISD::TRUNCATE &&
10157 Src.getScalarValueSizeInBits() == SrcEltBits) {
10158 Src = Src.getOperand(0);
10159 } else if (Subtarget.hasVLX()) {
10160 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10161 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10162 Src = DAG.getBitcast(SrcVT, Src);
10163 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10164 if (Scale == 2 &&
10165 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10166 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10167 return SDValue();
10168 } else
10169 return SDValue();
10170
10171 // VPMOVWB is only available with avx512bw.
10172 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10173 return SDValue();
10174
10175 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10176 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10177 }
10178
10179 return SDValue();
10180}
10181
10182// Attempt to match binary shuffle patterns as a truncate.
10183 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10184                                     SDValue V2, ArrayRef<int> Mask,
10185 const APInt &Zeroable,
10186 const X86Subtarget &Subtarget,
10187 SelectionDAG &DAG) {
10188 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10189 "Unexpected VTRUNC type");
10190 if (!Subtarget.hasAVX512())
10191 return SDValue();
10192
10193 unsigned NumElts = VT.getVectorNumElements();
10194 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10195 unsigned MaxScale = 64 / EltSizeInBits;
10196 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10197 // TODO: Support non-BWI VPMOVWB truncations?
10198 unsigned SrcEltBits = EltSizeInBits * Scale;
10199 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10200 continue;
10201
10202 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10203 // Bail if the V2 elements are undef.
10204 unsigned NumHalfSrcElts = NumElts / Scale;
10205 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10206 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10207 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10208 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10209 continue;
10210
10211 // The elements beyond the truncation must be undef/zero.
10212 unsigned UpperElts = NumElts - NumSrcElts;
10213 if (UpperElts > 0 &&
10214 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10215 continue;
10216 bool UndefUppers =
10217 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10218
10219 // For offset truncations, ensure that the concat is cheap.
10220 if (Offset) {
10221 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10222 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10223 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10224 return Lo.getOperand(0) == Hi.getOperand(0);
10225 if (ISD::isNormalLoad(Lo.getNode()) &&
10226 ISD::isNormalLoad(Hi.getNode())) {
10227 auto *LDLo = cast<LoadSDNode>(Lo);
10228 auto *LDHi = cast<LoadSDNode>(Hi);
10229           return DAG.areNonVolatileConsecutiveLoads(
10230               LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10231 }
10232 return false;
10233 };
10234 if (!IsCheapConcat(V1, V2))
10235 continue;
10236 }
10237
10238 // As we're using both sources then we need to concat them together
10239 // and truncate from the double-sized src.
10240 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10241 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10242
10243 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10244 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10245 Src = DAG.getBitcast(SrcVT, Src);
10246
10247 // Shift the offset'd elements into place for the truncation.
10248 // TODO: Use getTargetVShiftByConstNode.
10249 if (Offset)
10250 Src = DAG.getNode(
10251 X86ISD::VSRLI, DL, SrcVT, Src,
10252 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10253
10254 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10255 }
10256 }
10257
10258 return SDValue();
10259}
10260
10261/// Check whether a compaction lowering can be done by dropping even/odd
10262/// elements and compute how many times even/odd elements must be dropped.
10263///
10264/// This handles shuffles which take every Nth element where N is a power of
10265/// two. Example shuffle masks:
10266///
10267/// (even)
10268/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10269/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10270/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10271/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10272/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10273/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10274///
10275/// (odd)
10276/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10277/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10278///
10279/// Any of these lanes can of course be undef.
10280///
10281/// This routine only supports N <= 3.
10282/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10283/// for larger N.
10284///
10285/// \returns N above, or the number of times even/odd elements must be dropped
10286/// if there is such a number. Otherwise returns zero.
10287static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10288 bool IsSingleInput) {
10289 // The modulus for the shuffle vector entries is based on whether this is
10290 // a single input or not.
10291 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10292 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10293 "We should only be called with masks with a power-of-2 size!");
10294
10295 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10296 int Offset = MatchEven ? 0 : 1;
10297
10298 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10299 // and 2^3 simultaneously. This is because we may have ambiguity with
10300 // partially undef inputs.
10301 bool ViableForN[3] = {true, true, true};
10302
10303 for (int i = 0, e = Mask.size(); i < e; ++i) {
10304 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10305 // want.
10306 if (Mask[i] < 0)
10307 continue;
10308
10309 bool IsAnyViable = false;
10310 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10311 if (ViableForN[j]) {
10312 uint64_t N = j + 1;
10313
10314 // The shuffle mask must be equal to (i * 2^N) % M.
10315 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10316 IsAnyViable = true;
10317 else
10318 ViableForN[j] = false;
10319 }
10320 // Early exit if we exhaust the possible powers of two.
10321 if (!IsAnyViable)
10322 break;
10323 }
10324
10325 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10326 if (ViableForN[j])
10327 return j + 1;
10328
10329 // Return 0 as there is no viable power of two.
10330 return 0;
10331}
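// Illustrative sketch (hypothetical helper, not part of this file): the same
// even/odd drop check written against a plain int array so it can be tried
// standalone. Returns N if the mask takes every 2^N'th element (offset 0 for
// even, 1 for odd), or 0 if no N <= 3 works.
static int sketchCanDropElements(const int *Mask, int NumElts, bool MatchEven,
                                 bool IsSingleInput) {
  int ShuffleModulus = NumElts * (IsSingleInput ? 1 : 2);
  unsigned long long ModMask = (unsigned long long)ShuffleModulus - 1;
  int Offset = MatchEven ? 0 : 1;
  bool ViableForN[3] = {true, true, true};
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes can match any stride
    bool IsAnyViable = false;
    for (int j = 0; j != 3; ++j)
      if (ViableForN[j]) {
        unsigned long long N = j + 1;
        if ((unsigned long long)(Mask[i] - Offset) ==
            (((unsigned long long)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    if (!IsAnyViable)
      break;
  }
  for (int j = 0; j != 3; ++j)
    if (ViableForN[j])
      return j + 1;
  return 0;
}
// e.g. the N = 1 (even) mask shown above,
// {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30}, returns 1.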
10332
10333// X86 has dedicated pack instructions that can handle specific truncation
10334// operations: PACKSS and PACKUS.
10335// Checks for compaction shuffle masks if MaxStages > 1.
10336// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10337static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10338 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10339 const SelectionDAG &DAG,
10340 const X86Subtarget &Subtarget,
10341 unsigned MaxStages = 1) {
10342 unsigned NumElts = VT.getVectorNumElements();
10343 unsigned BitSize = VT.getScalarSizeInBits();
10344 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10345 "Illegal maximum compaction");
10346
10347 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10348 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10349 unsigned NumPackedBits = NumSrcBits - BitSize;
10350 N1 = peekThroughBitcasts(N1);
10351 N2 = peekThroughBitcasts(N2);
10352 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10353 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10354 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10355 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10356 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10357 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10358 return false;
10359 if (Subtarget.hasSSE41() || BitSize == 8) {
10360 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10361 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10362 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10363 V1 = N1;
10364 V2 = N2;
10365 SrcVT = PackVT;
10366 PackOpcode = X86ISD::PACKUS;
10367 return true;
10368 }
10369 }
10370 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10371 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10372 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10373 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10374 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10375 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10376 V1 = N1;
10377 V2 = N2;
10378 SrcVT = PackVT;
10379 PackOpcode = X86ISD::PACKSS;
10380 return true;
10381 }
10382 return false;
10383 };
10384
10385 // Attempt to match against wider and wider compaction patterns.
10386 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10387 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10388 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10389
10390 // Try binary shuffle.
10391 SmallVector<int, 32> BinaryMask;
10392 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10393 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10394 if (MatchPACK(V1, V2, PackVT))
10395 return true;
10396
10397 // Try unary shuffle.
10398 SmallVector<int, 32> UnaryMask;
10399 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10400 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10401 if (MatchPACK(V1, V1, PackVT))
10402 return true;
10403 }
10404
10405 return false;
10406}
10407
10408 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
10409                                     SDValue V1, SDValue V2, SelectionDAG &DAG,
10410 const X86Subtarget &Subtarget) {
10411 MVT PackVT;
10412 unsigned PackOpcode;
10413 unsigned SizeBits = VT.getSizeInBits();
10414 unsigned EltBits = VT.getScalarSizeInBits();
10415 unsigned MaxStages = Log2_32(64 / EltBits);
10416 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10417 Subtarget, MaxStages))
10418 return SDValue();
10419
10420 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10421 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10422
10423 // Don't lower multi-stage packs on AVX512, truncation is better.
10424 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10425 return SDValue();
10426
10427 // Pack to the largest type possible:
10428 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10429 unsigned MaxPackBits = 16;
10430 if (CurrentEltBits > 16 &&
10431 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10432 MaxPackBits = 32;
10433
10434 // Repeatedly pack down to the target size.
10435 SDValue Res;
10436 for (unsigned i = 0; i != NumStages; ++i) {
10437 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10438 unsigned NumSrcElts = SizeBits / SrcEltBits;
10439 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10440 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10441 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10442 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10443 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10444 DAG.getBitcast(SrcVT, V2));
10445 V1 = V2 = Res;
10446 CurrentEltBits /= 2;
10447 }
10448 assert(Res && Res.getValueType() == VT &&
10449 "Failed to lower compaction shuffle");
10450 return Res;
10451}
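// Worked example of the staged packing above (an illustration, assuming the
// matcher found PackVT = v4i32 and PACKSS for a v16i8 result): the first
// iteration packs the two v4i32 inputs to v8i16 with PACKSSDW, the second
// packs that result with itself to v16i8 with PACKSSWB, halving the element
// width each time around the loop.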
10452
10453/// Try to emit a bitmask instruction for a shuffle.
10454///
10455/// This handles cases where we can model a blend exactly as a bitmask due to
10456/// one of the inputs being zeroable.
10457 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10458                                      SDValue V2, ArrayRef<int> Mask,
10459 const APInt &Zeroable,
10460 const X86Subtarget &Subtarget,
10461 SelectionDAG &DAG) {
10462 MVT MaskVT = VT;
10463 MVT EltVT = VT.getVectorElementType();
10464 SDValue Zero, AllOnes;
10465 // Use f64 if i64 isn't legal.
10466 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10467 EltVT = MVT::f64;
10468 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10469 }
10470
10471 MVT LogicVT = VT;
10472 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10473 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10474 APFloat AllOnesValue =
10475         APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
10476     AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10477 LogicVT =
10478 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10479 } else {
10480 Zero = DAG.getConstant(0, DL, EltVT);
10481 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10482 }
10483
10484 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10485 SDValue V;
10486 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10487 if (Zeroable[i])
10488 continue;
10489 if (Mask[i] % Size != i)
10490 return SDValue(); // Not a blend.
10491 if (!V)
10492 V = Mask[i] < Size ? V1 : V2;
10493 else if (V != (Mask[i] < Size ? V1 : V2))
10494 return SDValue(); // Can only let one input through the mask.
10495
10496 VMaskOps[i] = AllOnes;
10497 }
10498 if (!V)
10499 return SDValue(); // No non-zeroable elements!
10500
10501 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10502 VMask = DAG.getBitcast(LogicVT, VMask);
10503 V = DAG.getBitcast(LogicVT, V);
10504 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10505 return DAG.getBitcast(VT, And);
10506}
10507
10508/// Try to emit a blend instruction for a shuffle using bit math.
10509///
10510/// This is used as a fallback approach when first class blend instructions are
10511/// unavailable. Currently it is only suitable for integer vectors, but could
10512/// be generalized for floating point vectors if desirable.
10513 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10514                                       SDValue V2, ArrayRef<int> Mask,
10515 SelectionDAG &DAG) {
10516 assert(VT.isInteger() && "Only supports integer vector types!");
10517 MVT EltVT = VT.getVectorElementType();
10518 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10519 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10520   SmallVector<SDValue, 16> MaskOps;
10521   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10522 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10523 return SDValue(); // Shuffled input!
10524 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10525 }
10526
10527 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10528 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10529}
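// Scalar sketch of the bit-select identity that the getBitSelect helper used
// above is assumed to expand to (illustration only, a plain 64-bit integer
// standing in for the vector lanes): take V1 where the mask bit is set and
// V2 where it is clear.
static unsigned long long sketchBitSelect(unsigned long long V1,
                                          unsigned long long V2,
                                          unsigned long long V1Mask) {
  return (V1 & V1Mask) | (V2 & ~V1Mask);
}
// With per-element masks of all-ones/all-zeros this is exactly an
// element-wise blend, e.g. V1Mask = 0x00000000FFFFFFFFULL keeps the low
// 32-bit element from V1 and the high one from V2.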
10530
10531 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10532                                     SDValue PreservedSrc,
10533 const X86Subtarget &Subtarget,
10534 SelectionDAG &DAG);
10535
10536 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10537                                 MutableArrayRef<int> Mask,
10538                                 const APInt &Zeroable, bool &ForceV1Zero,
10539 bool &ForceV2Zero, uint64_t &BlendMask) {
10540 bool V1IsZeroOrUndef =
10541       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10542   bool V2IsZeroOrUndef =
10543 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10544
10545 BlendMask = 0;
10546 ForceV1Zero = false, ForceV2Zero = false;
10547 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10548
10549 int NumElts = Mask.size();
10550 int NumLanes = VT.getSizeInBits() / 128;
10551 int NumEltsPerLane = NumElts / NumLanes;
10552 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10553
10554 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10555 // then ensure the blend mask part for that lane just references that input.
10556 bool ForceWholeLaneMasks =
10557 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10558
10559 // Attempt to generate the binary blend mask. If an input is zero then
10560 // we can use any lane.
10561 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10562 // Keep track of the inputs used per lane.
10563 bool LaneV1InUse = false;
10564 bool LaneV2InUse = false;
10565 uint64_t LaneBlendMask = 0;
10566 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10567 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10568 int M = Mask[Elt];
10569 if (M == SM_SentinelUndef)
10570 continue;
10571 if (M == Elt || (0 <= M && M < NumElts &&
10572 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10573 Mask[Elt] = Elt;
10574 LaneV1InUse = true;
10575 continue;
10576 }
10577 if (M == (Elt + NumElts) ||
10578 (NumElts <= M &&
10579 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10580 LaneBlendMask |= 1ull << LaneElt;
10581 Mask[Elt] = Elt + NumElts;
10582 LaneV2InUse = true;
10583 continue;
10584 }
10585 if (Zeroable[Elt]) {
10586 if (V1IsZeroOrUndef) {
10587 ForceV1Zero = true;
10588 Mask[Elt] = Elt;
10589 LaneV1InUse = true;
10590 continue;
10591 }
10592 if (V2IsZeroOrUndef) {
10593 ForceV2Zero = true;
10594 LaneBlendMask |= 1ull << LaneElt;
10595 Mask[Elt] = Elt + NumElts;
10596 LaneV2InUse = true;
10597 continue;
10598 }
10599 }
10600 return false;
10601 }
10602
10603 // If we only used V2 then splat the lane blend mask to avoid any demanded
10604 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10605 // blend mask bit).
10606 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10607 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10608
10609 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10610 }
10611 return true;
10612}
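// Minimal sketch of how the blend immediate above is formed, with the
// per-lane and zeroable handling stripped out (hypothetical helper, plain
// int array instead of LLVM mask types): bit i of the immediate selects
// element i from V2, a clear bit keeps element i from V1.
static bool sketchBlendImmediate(const int *Mask, int NumElts,
                                 unsigned long long &BlendMask) {
  BlendMask = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // undef may come from either input
    if (M == i)
      continue; // element i from V1 - leave bit i clear
    if (M == i + NumElts) {
      BlendMask |= 1ull << i; // element i from V2 - set bit i
      continue;
    }
    return false; // element moved position - not a blend
  }
  return true;
}
// e.g. the v4i32 mask {0, 5, 2, 7} yields BlendMask = 0b1010.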
10613
10614/// Try to emit a blend instruction for a shuffle.
10615///
10616/// This doesn't do any checks for the availability of instructions for blending
10617/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10618/// be matched in the backend with the type given. What it does check for is
10619/// that the shuffle mask is a blend, or convertible into a blend with zero.
10620 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10621                                    SDValue V2, ArrayRef<int> Original,
10622 const APInt &Zeroable,
10623 const X86Subtarget &Subtarget,
10624 SelectionDAG &DAG) {
10625 uint64_t BlendMask = 0;
10626 bool ForceV1Zero = false, ForceV2Zero = false;
10627 SmallVector<int, 64> Mask(Original);
10628 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10629 BlendMask))
10630 return SDValue();
10631
10632 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10633 if (ForceV1Zero)
10634 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10635 if (ForceV2Zero)
10636 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10637
10638 unsigned NumElts = VT.getVectorNumElements();
10639
10640 switch (VT.SimpleTy) {
10641 case MVT::v4i64:
10642 case MVT::v8i32:
10643 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10644 [[fallthrough]];
10645 case MVT::v4f64:
10646 case MVT::v8f32:
10647 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10648 [[fallthrough]];
10649 case MVT::v2f64:
10650 case MVT::v2i64:
10651 case MVT::v4f32:
10652 case MVT::v4i32:
10653 case MVT::v8i16:
10654 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10655 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10656 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10657 case MVT::v16i16: {
10658 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10659 SmallVector<int, 8> RepeatedMask;
10660 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10661 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10662 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10663 BlendMask = 0;
10664 for (int i = 0; i < 8; ++i)
10665 if (RepeatedMask[i] >= 8)
10666 BlendMask |= 1ull << i;
10667 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10668 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10669 }
10670 // Use PBLENDW for lower/upper lanes and then blend lanes.
10671 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10672 // merge to VSELECT where useful.
10673 uint64_t LoMask = BlendMask & 0xFF;
10674 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10675 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10676 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10677 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10678 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10679 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10680 return DAG.getVectorShuffle(
10681 MVT::v16i16, DL, Lo, Hi,
10682 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10683 }
10684 [[fallthrough]];
10685 }
10686 case MVT::v32i8:
10687 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10688 [[fallthrough]];
10689 case MVT::v16i8: {
10690 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10691
10692 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10693 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10694 Subtarget, DAG))
10695 return Masked;
10696
10697 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10698 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10699 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10700 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10701 }
10702
10703 // If we have VPTERNLOG, we can use that as a bit blend.
10704 if (Subtarget.hasVLX())
10705 if (SDValue BitBlend =
10706 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10707 return BitBlend;
10708
10709 // Scale the blend by the number of bytes per element.
10710 int Scale = VT.getScalarSizeInBits() / 8;
10711
10712 // This form of blend is always done on bytes. Compute the byte vector
10713 // type.
10714 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10715
10716 // x86 allows load folding with blendvb from the 2nd source operand. But
10717 // we are still using LLVM select here (see comment below), so that's V1.
10718 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10719 // allow that load-folding possibility.
10720 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10721       ShuffleVectorSDNode::commuteMask(Mask);
10722       std::swap(V1, V2);
10723 }
10724
10725 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10726 // mix of LLVM's code generator and the x86 backend. We tell the code
10727 // generator that boolean values in the elements of an x86 vector register
10728 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
10729 // mapping a select to operand #1, and 'false' mapping to operand #2. The
10730 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
10731 // of the element (the remaining are ignored) and 0 in that high bit would
10732 // mean operand #1 while 1 in the high bit would mean operand #2. So while
10733 // the LLVM model for boolean values in vector elements gets the relevant
10734 // bit set, it is set backwards and over constrained relative to x86's
10735 // actual model.
10736 SmallVector<SDValue, 32> VSELECTMask;
10737 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10738 for (int j = 0; j < Scale; ++j)
10739 VSELECTMask.push_back(
10740 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
10741 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
10742 MVT::i8));
10743
10744 V1 = DAG.getBitcast(BlendVT, V1);
10745 V2 = DAG.getBitcast(BlendVT, V2);
10746 return DAG.getBitcast(
10747 VT,
10748 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
10749 V1, V2));
10750 }
10751 case MVT::v16f32:
10752 case MVT::v8f64:
10753 case MVT::v8i64:
10754 case MVT::v16i32:
10755 case MVT::v32i16:
10756 case MVT::v64i8: {
10757 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
10758 bool OptForSize = DAG.shouldOptForSize();
10759 if (!OptForSize) {
10760 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10761 Subtarget, DAG))
10762 return Masked;
10763 }
10764
10765 // Otherwise load an immediate into a GPR, cast to k-register, and use a
10766 // masked move.
10767 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10768 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10769 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10770 }
10771 default:
10772 llvm_unreachable("Not a supported integer vector type!");
10773 }
10774}
10775
10776/// Try to lower as a blend of elements from two inputs followed by
10777/// a single-input permutation.
10778///
10779/// This matches the pattern where we can blend elements from two inputs and
10780/// then reduce the shuffle to a single-input permutation.
10781 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
10782                                              SDValue V1, SDValue V2,
10783 ArrayRef<int> Mask,
10784 SelectionDAG &DAG,
10785 bool ImmBlends = false) {
10786 // We build up the blend mask while checking whether a blend is a viable way
10787 // to reduce the shuffle.
10788 SmallVector<int, 32> BlendMask(Mask.size(), -1);
10789 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
10790
10791 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10792 if (Mask[i] < 0)
10793 continue;
10794
10795 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
10796
10797 if (BlendMask[Mask[i] % Size] < 0)
10798 BlendMask[Mask[i] % Size] = Mask[i];
10799 else if (BlendMask[Mask[i] % Size] != Mask[i])
10800 return SDValue(); // Can't blend in the needed input!
10801
10802 PermuteMask[i] = Mask[i] % Size;
10803 }
10804
10805 // If only immediate blends, then bail if the blend mask can't be widened to
10806 // i16.
10807 unsigned EltSize = VT.getScalarSizeInBits();
10808 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
10809 return SDValue();
10810
10811 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
10812 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
10813}
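// Standalone sketch of the decomposition above (hypothetical helper, plain
// int arrays, no DAG nodes): first blend so that slot (M % Size) of the
// intermediate holds source element M, then permute those slots into their
// final positions.
static bool sketchBlendThenPermute(const int *Mask, int Size, int *BlendMask,
                                   int *PermuteMask) {
  for (int i = 0; i != Size; ++i)
    BlendMask[i] = PermuteMask[i] = -1;
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (BlendMask[Slot] >= 0 && BlendMask[Slot] != Mask[i])
      return false; // both inputs compete for the same blend slot
    BlendMask[Slot] = Mask[i];
    PermuteMask[i] = Slot;
  }
  return true;
}
// e.g. the v4i32 mask {6, 0, 3, 1} decomposes into
//   BlendMask   = {0, 1, 6, 3}  (a legal X86ISD::BLENDI pattern) and
//   PermuteMask = {2, 0, 3, 1}  (a single-input shuffle of that blend).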
10814
10815/// Try to lower as an unpack of elements from two inputs followed by
10816/// a single-input permutation.
10817///
10818/// This matches the pattern where we can unpack elements from two inputs and
10819/// then reduce the shuffle to a single-input (wider) permutation.
10820 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
10821                                              SDValue V1, SDValue V2,
10822 ArrayRef<int> Mask,
10823 SelectionDAG &DAG) {
10824 int NumElts = Mask.size();
10825 int NumLanes = VT.getSizeInBits() / 128;
10826 int NumLaneElts = NumElts / NumLanes;
10827 int NumHalfLaneElts = NumLaneElts / 2;
10828
10829 bool MatchLo = true, MatchHi = true;
10830 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
10831
10832 // Determine UNPCKL/UNPCKH type and operand order.
10833 for (int Elt = 0; Elt != NumElts; ++Elt) {
10834 int M = Mask[Elt];
10835 if (M < 0)
10836 continue;
10837
10838 // Normalize the mask value depending on whether it's V1 or V2.
10839 int NormM = M;
10840 SDValue &Op = Ops[Elt & 1];
10841 if (M < NumElts && (Op.isUndef() || Op == V1))
10842 Op = V1;
10843 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
10844 Op = V2;
10845 NormM -= NumElts;
10846 } else
10847 return SDValue();
10848
10849 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
10850 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
10851 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
10852 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
10853 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
10854 if (MatchLoAnyLane || MatchHiAnyLane) {
10855 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
10856 "Failed to match UNPCKLO/UNPCKHI");
10857 break;
10858 }
10859 }
10860 MatchLo &= MatchLoAnyLane;
10861 MatchHi &= MatchHiAnyLane;
10862 if (!MatchLo && !MatchHi)
10863 return SDValue();
10864 }
10865 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
10866
10867 // Element indices have changed after unpacking. Calculate permute mask
10868 // so that they will be put back to the position as dictated by the
10869 // original shuffle mask indices.
10870 SmallVector<int, 32> PermuteMask(NumElts, -1);
10871 for (int Elt = 0; Elt != NumElts; ++Elt) {
10872 int M = Mask[Elt];
10873 if (M < 0)
10874 continue;
10875 int NormM = M;
10876 if (NumElts <= M)
10877 NormM -= NumElts;
10878 bool IsFirstOp = M < NumElts;
10879 int BaseMaskElt =
10880 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
10881 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
10882 PermuteMask[Elt] = BaseMaskElt;
10883 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
10884 PermuteMask[Elt] = BaseMaskElt + 1;
10885 assert(PermuteMask[Elt] != -1 &&
10886 "Input mask element is defined but failed to assign permute mask");
10887 }
10888
10889 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10890 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
10891 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
10892}
10893
10894/// Try to lower a shuffle as a permute of the inputs followed by an
10895/// UNPCK instruction.
10896///
10897/// This specifically targets cases where we end up with alternating between
10898/// the two inputs, and so can permute them into something that feeds a single
10899/// UNPCK instruction. Note that this routine only targets integer vectors
10900/// because for floating point vectors we have a generalized SHUFPS lowering
10901/// strategy that handles everything that doesn't *exactly* match an unpack,
10902/// making this clever lowering unnecessary.
10903 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10904                                               SDValue V1, SDValue V2,
10905 ArrayRef<int> Mask,
10906 const X86Subtarget &Subtarget,
10907 SelectionDAG &DAG) {
10908 int Size = Mask.size();
10909 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10910
10911 // This routine only supports 128-bit integer dual input vectors.
10912 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
10913 return SDValue();
10914
10915 int NumLoInputs =
10916 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10917 int NumHiInputs =
10918 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10919
10920 bool UnpackLo = NumLoInputs >= NumHiInputs;
10921
10922 auto TryUnpack = [&](int ScalarSize, int Scale) {
10923 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10924 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10925
10926 for (int i = 0; i < Size; ++i) {
10927 if (Mask[i] < 0)
10928 continue;
10929
10930 // Each element of the unpack contains Scale elements from this mask.
10931 int UnpackIdx = i / Scale;
10932
10933 // We only handle the case where V1 feeds the first slots of the unpack.
10934 // We rely on canonicalization to ensure this is the case.
10935 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10936 return SDValue();
10937
10938 // Setup the mask for this input. The indexing is tricky as we have to
10939 // handle the unpack stride.
10940 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10941 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10942 Mask[i] % Size;
10943 }
10944
10945 // If we will have to shuffle both inputs to use the unpack, check whether
10946 // we can just unpack first and shuffle the result. If so, skip this unpack.
10947 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10948 !isNoopShuffleMask(V2Mask))
10949 return SDValue();
10950
10951 // Shuffle the inputs into place.
10952 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10953 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10954
10955 // Cast the inputs to the type we will use to unpack them.
10956 MVT UnpackVT =
10957 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10958 V1 = DAG.getBitcast(UnpackVT, V1);
10959 V2 = DAG.getBitcast(UnpackVT, V2);
10960
10961 // Unpack the inputs and cast the result back to the desired type.
10962 return DAG.getBitcast(
10963 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10964 UnpackVT, V1, V2));
10965 };
10966
10967 // We try each unpack from the largest to the smallest to try and find one
10968 // that fits this mask.
10969 int OrigScalarSize = VT.getScalarSizeInBits();
10970 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10971 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10972 return Unpack;
10973
10974 // If we're shuffling with a zero vector then we're better off not doing
10975 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
10976   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
10977       ISD::isBuildVectorAllZeros(V2.getNode()))
10978 return SDValue();
10979
10980 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10981 // initial unpack.
10982 if (NumLoInputs == 0 || NumHiInputs == 0) {
10983 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10984 "We have to have *some* inputs!");
10985 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10986
10987 // FIXME: We could consider the total complexity of the permute of each
10988 // possible unpacking. Or at the least we should consider how many
10989 // half-crossings are created.
10990 // FIXME: We could consider commuting the unpacks.
10991
10992 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10993 for (int i = 0; i < Size; ++i) {
10994 if (Mask[i] < 0)
10995 continue;
10996
10997 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10998
10999 PermMask[i] =
11000 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11001 }
11002 return DAG.getVectorShuffle(
11003 VT, DL,
11004 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11005 V1, V2),
11006 DAG.getUNDEF(VT), PermMask);
11007 }
11008
11009 return SDValue();
11010}
11011
11012/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11013/// permuting the elements of the result in place.
11014 static SDValue lowerShuffleAsByteRotateAndPermute(
11015     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11016 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11017 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11018 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11019 (VT.is512BitVector() && !Subtarget.hasBWI()))
11020 return SDValue();
11021
11022 // We don't currently support lane crossing permutes.
11023 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11024 return SDValue();
11025
11026 int Scale = VT.getScalarSizeInBits() / 8;
11027 int NumLanes = VT.getSizeInBits() / 128;
11028 int NumElts = VT.getVectorNumElements();
11029 int NumEltsPerLane = NumElts / NumLanes;
11030
11031 // Determine range of mask elts.
11032 bool Blend1 = true;
11033 bool Blend2 = true;
11034 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11035 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11036 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11037 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11038 int M = Mask[Lane + Elt];
11039 if (M < 0)
11040 continue;
11041 if (M < NumElts) {
11042 Blend1 &= (M == (Lane + Elt));
11043 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11044 M = M % NumEltsPerLane;
11045 Range1.first = std::min(Range1.first, M);
11046 Range1.second = std::max(Range1.second, M);
11047 } else {
11048 M -= NumElts;
11049 Blend2 &= (M == (Lane + Elt));
11050 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11051 M = M % NumEltsPerLane;
11052 Range2.first = std::min(Range2.first, M);
11053 Range2.second = std::max(Range2.second, M);
11054 }
11055 }
11056 }
11057
11058 // Bail if we don't need both elements.
11059 // TODO - it might be worth doing this for unary shuffles if the permute
11060 // can be widened.
11061 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11062 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11063 return SDValue();
11064
11065 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11066 return SDValue();
11067
11068 // Rotate the 2 ops so we can access both ranges, then permute the result.
11069 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11070 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11071 SDValue Rotate = DAG.getBitcast(
11072 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11073 DAG.getBitcast(ByteVT, Lo),
11074 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11075 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11076 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11077 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11078 int M = Mask[Lane + Elt];
11079 if (M < 0)
11080 continue;
11081 if (M < NumElts)
11082 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11083 else
11084 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11085 }
11086 }
11087 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11088 };
11089
11090 // Check if the ranges are small enough to rotate from either direction.
11091 if (Range2.second < Range1.first)
11092 return RotateAndPermute(V1, V2, Range1.first, 0);
11093 if (Range1.second < Range2.first)
11094 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11095 return SDValue();
11096}
11097
11098 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11099   return isUndefOrEqual(Mask, 0);
11100}
11101
11102 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11103   return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11104}
11105
11106/// Check if the Mask consists of the same element repeated multiple times.
11107 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11108   size_t NumUndefs = 0;
11109 std::optional<int> UniqueElt;
11110 for (int Elt : Mask) {
11111 if (Elt == SM_SentinelUndef) {
11112 NumUndefs++;
11113 continue;
11114 }
11115 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11116 return false;
11117 UniqueElt = Elt;
11118 }
11119 // Make sure the element is repeated enough times by checking the number of
11120 // undefs is small.
11121 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11122}
11123
11124/// Generic routine to decompose a shuffle and blend into independent
11125/// blends and permutes.
11126///
11127/// This matches the extremely common pattern for handling combined
11128/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11129/// operations. It will try to pick the best arrangement of shuffles and
11130/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11131 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11132     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11133 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11134 int NumElts = Mask.size();
11135 int NumLanes = VT.getSizeInBits() / 128;
11136 int NumEltsPerLane = NumElts / NumLanes;
11137
11138 // Shuffle the input elements into the desired positions in V1 and V2 and
11139 // unpack/blend them together.
11140 bool IsAlternating = true;
11141 SmallVector<int, 32> V1Mask(NumElts, -1);
11142 SmallVector<int, 32> V2Mask(NumElts, -1);
11143 SmallVector<int, 32> FinalMask(NumElts, -1);
11144 for (int i = 0; i < NumElts; ++i) {
11145 int M = Mask[i];
11146 if (M >= 0 && M < NumElts) {
11147 V1Mask[i] = M;
11148 FinalMask[i] = i;
11149 IsAlternating &= (i & 1) == 0;
11150 } else if (M >= NumElts) {
11151 V2Mask[i] = M - NumElts;
11152 FinalMask[i] = i + NumElts;
11153 IsAlternating &= (i & 1) == 1;
11154 }
11155 }
11156
11157   // If we effectively only demand the 0'th element of \p Input (though not
11158   // necessarily only in the 0'th position), then broadcast said input
11159   // and change \p InputMask to be a no-op (identity) mask.
11160 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11161 &DAG](SDValue &Input,
11162 MutableArrayRef<int> InputMask) {
11163 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11164 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11165 !X86::mayFoldLoad(Input, Subtarget)))
11166 return;
11167 if (isNoopShuffleMask(InputMask))
11168 return;
11169 assert(isBroadcastShuffleMask(InputMask) &&
11170 "Expected to demand only the 0'th element.");
11171 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11172 for (auto I : enumerate(InputMask)) {
11173 int &InputMaskElt = I.value();
11174 if (InputMaskElt >= 0)
11175 InputMaskElt = I.index();
11176 }
11177 };
11178
11179 // Currently, we may need to produce one shuffle per input, and blend results.
11180 // It is possible that the shuffle for one of the inputs is already a no-op.
11181 // See if we can simplify non-no-op shuffles into broadcasts,
11182 // which we consider to be strictly better than an arbitrary shuffle.
11183 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11184       isNoopOrBroadcastShuffleMask(V2Mask)) {
11185     canonicalizeBroadcastableInput(V1, V1Mask);
11186 canonicalizeBroadcastableInput(V2, V2Mask);
11187 }
11188
11189 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11190 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11191 // the shuffle may be able to fold with a load or other benefit. However, when
11192 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11193 // pre-shuffle first is a better strategy.
11194 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11195 // Only prefer immediate blends to unpack/rotate.
11196 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11197 DAG, true))
11198 return BlendPerm;
11199 // If either input vector provides only a single element which is repeated
11200 // multiple times, unpacking from both input vectors would generate worse
11201 // code. e.g. for
11202 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11203 // it is better to process t4 first to create a vector of t4[0], then unpack
11204 // that vector with t2.
11205 if (!isSingleElementRepeatedMask(V1Mask) &&
11206         !isSingleElementRepeatedMask(V2Mask))
11207       if (SDValue UnpackPerm =
11208 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11209 return UnpackPerm;
11210     if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11211             DL, VT, V1, V2, Mask, Subtarget, DAG))
11212 return RotatePerm;
11213 // Unpack/rotate failed - try again with variable blends.
11214 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11215 DAG))
11216 return BlendPerm;
11217 if (VT.getScalarSizeInBits() >= 32)
11218 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11219 DL, VT, V1, V2, Mask, Subtarget, DAG))
11220 return PermUnpack;
11221 }
11222
11223 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11224 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11225 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11226 // than half the elements coming from each source.
11227 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11228 V1Mask.assign(NumElts, -1);
11229 V2Mask.assign(NumElts, -1);
11230 FinalMask.assign(NumElts, -1);
11231 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11232 for (int j = 0; j != NumEltsPerLane; ++j) {
11233 int M = Mask[i + j];
11234 if (M >= 0 && M < NumElts) {
11235 V1Mask[i + (j / 2)] = M;
11236 FinalMask[i + j] = i + (j / 2);
11237 } else if (M >= NumElts) {
11238 V2Mask[i + (j / 2)] = M - NumElts;
11239 FinalMask[i + j] = i + (j / 2) + NumElts;
11240 }
11241 }
11242 }
11243
11244 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11245 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11246 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11247}
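// Standalone sketch of the basic decomposition above (hypothetical helper,
// plain int arrays): route each source element into its final slot with a
// per-input single-source shuffle, then blend the two shuffled results.
static void sketchDecomposeShuffleMerge(const int *Mask, int NumElts,
                                        int *V1Mask, int *V2Mask,
                                        int *FinalMask) {
  for (int i = 0; i != NumElts; ++i)
    V1Mask[i] = V2Mask[i] = FinalMask[i] = -1;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M; // element comes from V1
      FinalMask[i] = i;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts; // element comes from V2
      FinalMask[i] = i + NumElts;
    }
  }
}
// e.g. the v4i32 mask {5, 1, 7, 2} becomes
//   V1Mask    = {-1, 1, -1, 2},  V2Mask = {1, -1, 3, -1},
//   FinalMask = { 4, 1,  6, 3}   - a plain blend of the two shuffles.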
11248
11249static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11250 const X86Subtarget &Subtarget,
11251 ArrayRef<int> Mask) {
11252 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11253 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11254
11255 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11256 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11257 int MaxSubElts = 64 / EltSizeInBits;
11258 unsigned RotateAmt, NumSubElts;
11259 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11260 MaxSubElts, NumSubElts, RotateAmt))
11261 return -1;
11262 unsigned NumElts = Mask.size();
11263 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11264 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11265 return RotateAmt;
11266}
11267
11268/// Lower shuffle using X86ISD::VROTLI rotations.
11269 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11270                                        ArrayRef<int> Mask,
11271 const X86Subtarget &Subtarget,
11272 SelectionDAG &DAG) {
11273 // Only XOP + AVX512 targets have bit rotation instructions.
11274 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11275 bool IsLegal =
11276 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11277 if (!IsLegal && Subtarget.hasSSE3())
11278 return SDValue();
11279
11280 MVT RotateVT;
11281 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11282 Subtarget, Mask);
11283 if (RotateAmt < 0)
11284 return SDValue();
11285
11286 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11287 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11288     // widen to vXi16 or more then the existing lowering will be better.
11289 if (!IsLegal) {
11290 if ((RotateAmt % 16) == 0)
11291 return SDValue();
11292 // TODO: Use getTargetVShiftByConstNode.
11293 unsigned ShlAmt = RotateAmt;
11294 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11295 V1 = DAG.getBitcast(RotateVT, V1);
11296 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11297 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11298 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11299 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11300 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11301 return DAG.getBitcast(VT, Rot);
11302 }
11303
11304 SDValue Rot =
11305 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11306 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11307 return DAG.getBitcast(VT, Rot);
11308}
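// Scalar sketch of the pre-SSSE3 fallback above (illustration only): a
// rotate-left by RotateAmt within one RotateVT lane is expanded as
// OR(SHL, SRL). A plain 16-bit value stands in for a single vXi16 lane.
static unsigned short sketchRotl16(unsigned short V, unsigned RotateAmt) {
  unsigned ShlAmt = RotateAmt % 16;
  unsigned SrlAmt = (16 - ShlAmt) % 16;
  return (unsigned short)((unsigned)V << ShlAmt | (unsigned)V >> SrlAmt);
}
// e.g. rotating every i16 lane left by 8 swaps the two bytes of each lane,
// which is the vXi8 bit-rotate mask <1,0,3,2,...> this lowering matches.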
11309
11310/// Try to match a vector shuffle as an element rotation.
11311///
11312 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11313 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11314                                        ArrayRef<int> Mask) {
11315 int NumElts = Mask.size();
11316
11317 // We need to detect various ways of spelling a rotation:
11318 // [11, 12, 13, 14, 15, 0, 1, 2]
11319 // [-1, 12, 13, 14, -1, -1, 1, -1]
11320 // [-1, -1, -1, -1, -1, -1, 1, 2]
11321 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11322 // [-1, 4, 5, 6, -1, -1, 9, -1]
11323 // [-1, 4, 5, 6, -1, -1, -1, -1]
11324 int Rotation = 0;
11325 SDValue Lo, Hi;
11326 for (int i = 0; i < NumElts; ++i) {
11327 int M = Mask[i];
11328 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11329 "Unexpected mask index.");
11330 if (M < 0)
11331 continue;
11332
11333 // Determine where a rotated vector would have started.
11334 int StartIdx = i - (M % NumElts);
11335 if (StartIdx == 0)
11336 // The identity rotation isn't interesting, stop.
11337 return -1;
11338
11339 // If we found the tail of a vector the rotation must be the missing
11340 // front. If we found the head of a vector, it must be how much of the
11341 // head.
11342 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11343
11344 if (Rotation == 0)
11345 Rotation = CandidateRotation;
11346 else if (Rotation != CandidateRotation)
11347 // The rotations don't match, so we can't match this mask.
11348 return -1;
11349
11350 // Compute which value this mask is pointing at.
11351 SDValue MaskV = M < NumElts ? V1 : V2;
11352
11353 // Compute which of the two target values this index should be assigned
11354 // to. This reflects whether the high elements are remaining or the low
11355 // elements are remaining.
11356 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11357
11358 // Either set up this value if we've not encountered it before, or check
11359 // that it remains consistent.
11360 if (!TargetV)
11361 TargetV = MaskV;
11362 else if (TargetV != MaskV)
11363 // This may be a rotation, but it pulls from the inputs in some
11364 // unsupported interleaving.
11365 return -1;
11366 }
11367
11368 // Check that we successfully analyzed the mask, and normalize the results.
11369 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11370 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11371 if (!Lo)
11372 Lo = Hi;
11373 else if (!Hi)
11374 Hi = Lo;
11375
11376 V1 = Lo;
11377 V2 = Hi;
11378
11379 return Rotation;
11380}
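// Standalone sketch of the rotation detection above (hypothetical helper,
// plain int array, without the SDValue tracking of which input is Lo/Hi):
// every defined element must imply the same rotation via
// StartIdx = i - (M % NumElts).
static int sketchMatchElementRotate(const int *Mask, int NumElts) {
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      return -1; // the identity rotation is not interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1; // inconsistent rotation amounts
  }
  return Rotation;
}
// e.g. the v8i16 mask {11,12,13,14,15,0,1,2} from the comment above yields
// 3; the byte-rotate wrapper below scales that by 16/8 = 2 bytes per element
// to a PALIGNR immediate of 6.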
11381
11382/// Try to lower a vector shuffle as a byte rotation.
11383///
11384/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11385/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11386/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11387 /// try to generically lower a vector shuffle through such a pattern. It
11388/// does not check for the profitability of lowering either as PALIGNR or
11389/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11390/// This matches shuffle vectors that look like:
11391///
11392/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11393///
11394/// Essentially it concatenates V1 and V2, shifts right by some number of
11395/// elements, and takes the low elements as the result. Note that while this is
11396/// specified as a *right shift* because x86 is little-endian, it is a *left
11397/// rotate* of the vector lanes.
11398 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11399                                     ArrayRef<int> Mask) {
11400 // Don't accept any shuffles with zero elements.
11401 if (isAnyZero(Mask))
11402 return -1;
11403
11404 // PALIGNR works on 128-bit lanes.
11405 SmallVector<int, 16> RepeatedMask;
11406 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11407 return -1;
11408
11409 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11410 if (Rotation <= 0)
11411 return -1;
11412
11413 // PALIGNR rotates bytes, so we need to scale the
11414 // rotation based on how many bytes are in the vector lane.
11415 int NumElts = RepeatedMask.size();
11416 int Scale = 16 / NumElts;
11417 return Rotation * Scale;
11418}
11419
11420 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11421                                         SDValue V2, ArrayRef<int> Mask,
11422 const X86Subtarget &Subtarget,
11423 SelectionDAG &DAG) {
11424 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11425
11426 SDValue Lo = V1, Hi = V2;
11427 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11428 if (ByteRotation <= 0)
11429 return SDValue();
11430
11431 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11432 // PSLLDQ/PSRLDQ.
11433 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11434 Lo = DAG.getBitcast(ByteVT, Lo);
11435 Hi = DAG.getBitcast(ByteVT, Hi);
11436
11437 // SSSE3 targets can use the palignr instruction.
11438 if (Subtarget.hasSSSE3()) {
11439 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11440 "512-bit PALIGNR requires BWI instructions");
11441 return DAG.getBitcast(
11442 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11443 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11444 }
11445
11446 assert(VT.is128BitVector() &&
11447 "Rotate-based lowering only supports 128-bit lowering!");
11448 assert(Mask.size() <= 16 &&
11449 "Can shuffle at most 16 bytes in a 128-bit vector!");
11450 assert(ByteVT == MVT::v16i8 &&
11451 "SSE2 rotate lowering only needed for v16i8!");
11452
11453 // Default SSE2 implementation
11454 int LoByteShift = 16 - ByteRotation;
11455 int HiByteShift = ByteRotation;
11456
11457 SDValue LoShift =
11458 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11459 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11460 SDValue HiShift =
11461 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11462 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11463 return DAG.getBitcast(VT,
11464 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11465}
11466
11467/// Try to lower a vector shuffle as a dword/qword rotation.
11468///
11469 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11470 /// rotation of the concatenation of two vectors; this routine will
11471 /// try to generically lower a vector shuffle through such a pattern.
11472///
11473/// Essentially it concatenates V1 and V2, shifts right by some number of
11474/// elements, and takes the low elements as the result. Note that while this is
11475/// specified as a *right shift* because x86 is little-endian, it is a *left
11476/// rotate* of the vector lanes.
11477 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11478                                     SDValue V2, ArrayRef<int> Mask,
11479 const APInt &Zeroable,
11480 const X86Subtarget &Subtarget,
11481 SelectionDAG &DAG) {
11482 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11483 "Only 32-bit and 64-bit elements are supported!");
11484
11485 // 128/256-bit vectors are only supported with VLX.
11486 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11487 && "VLX required for 128/256-bit vectors");
11488
11489 SDValue Lo = V1, Hi = V2;
11490 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11491 if (0 < Rotation)
11492 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11493 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11494
11495 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11496 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11497 // TODO: We can probably make this more aggressive and use shift-pairs like
11498 // lowerShuffleAsByteShiftMask.
11499 unsigned NumElts = Mask.size();
11500 unsigned ZeroLo = Zeroable.countr_one();
11501 unsigned ZeroHi = Zeroable.countl_one();
11502 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11503 if (!ZeroLo && !ZeroHi)
11504 return SDValue();
11505
11506 if (ZeroLo) {
11507 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11508 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11509 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11510 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11511 getZeroVector(VT, Subtarget, DAG, DL),
11512 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11513 }
11514
11515 if (ZeroHi) {
11516 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11517 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11518 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11519 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11520 getZeroVector(VT, Subtarget, DAG, DL), Src,
11521 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11522 }
11523
11524 return SDValue();
11525}
11526
11527/// Try to lower a vector shuffle as a byte shift sequence.
11528 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11529                                            SDValue V2, ArrayRef<int> Mask,
11530 const APInt &Zeroable,
11531 const X86Subtarget &Subtarget,
11532 SelectionDAG &DAG) {
11533 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11534 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11535
11536 // We need a shuffle that has zeros at one/both ends and a sequential
11537 // shuffle from one source within.
11538 unsigned ZeroLo = Zeroable.countr_one();
11539 unsigned ZeroHi = Zeroable.countl_one();
11540 if (!ZeroLo && !ZeroHi)
11541 return SDValue();
11542
11543 unsigned NumElts = Mask.size();
11544 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11545 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11546 return SDValue();
11547
11548 unsigned Scale = VT.getScalarSizeInBits() / 8;
11549 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11550 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11551 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11552 return SDValue();
11553
11554 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11555 Res = DAG.getBitcast(MVT::v16i8, Res);
11556
11557 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11558 // inner sequential set of elements, possibly offset:
11559 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11560 // 01234567 --> 4567zzzz --> zzzzz456
11561 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11562 if (ZeroLo == 0) {
11563 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11564 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11565 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11566 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11567 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11568 } else if (ZeroHi == 0) {
11569 unsigned Shift = Mask[ZeroLo] % NumElts;
11570 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11571 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11572 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11573 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11574 } else if (!Subtarget.hasSSSE3()) {
11575     // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11576 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11577 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11578 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11579 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11580 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11581 Shift += Mask[ZeroLo] % NumElts;
11582 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11583 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11584 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11585 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11586 } else
11587 return SDValue();
11588
11589 return DAG.getBitcast(VT, Res);
11590}
11591
11592/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11593///
11594/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11595/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11596/// matches elements from one of the input vectors shuffled to the left or
11597/// right with zeroable elements 'shifted in'. It handles both the strictly
11598/// bit-wise element shifts and the byte shift across an entire 128-bit double
11599/// quad word lane.
11600///
11601/// PSHL : (little-endian) left bit shift.
11602/// [ zz, 0, zz, 2 ]
11603/// [ -1, 4, zz, -1 ]
11604/// PSRL : (little-endian) right bit shift.
11605/// [ 1, zz, 3, zz]
11606/// [ -1, -1, 7, zz]
11607/// PSLLDQ : (little-endian) left byte shift
11608/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11609/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11610/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11611/// PSRLDQ : (little-endian) right byte shift
11612/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11613/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11614/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11615static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11616 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11617 int MaskOffset, const APInt &Zeroable,
11618 const X86Subtarget &Subtarget) {
11619 int Size = Mask.size();
11620 unsigned SizeInBits = Size * ScalarSizeInBits;
11621
11622 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11623 for (int i = 0; i < Size; i += Scale)
11624 for (int j = 0; j < Shift; ++j)
11625 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11626 return false;
11627
11628 return true;
11629 };
11630
11631 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11632 for (int i = 0; i != Size; i += Scale) {
11633 unsigned Pos = Left ? i + Shift : i;
11634 unsigned Low = Left ? i : i + Shift;
11635 unsigned Len = Scale - Shift;
11636 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11637 return -1;
11638 }
11639
11640 int ShiftEltBits = ScalarSizeInBits * Scale;
11641 bool ByteShift = ShiftEltBits > 64;
11642 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11643 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11644 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11645
11646 // Normalize the scale for byte shifts to still produce an i64 element
11647 // type.
11648 Scale = ByteShift ? Scale / 2 : Scale;
11649
11650 // We need to round trip through the appropriate type for the shift.
11651 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11652 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11653 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11654 return (int)ShiftAmt;
11655 };
11656
11657 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11658 // keep doubling the size of the integer elements up to that. We can
11659 // then shift the elements of the integer vector by whole multiples of
11660 // their width within the elements of the larger integer vector. Test each
11661 // multiple to see if we can find a match with the moved element indices
11662 // and that the shifted in elements are all zeroable.
11663 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11664 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11665 for (int Shift = 1; Shift != Scale; ++Shift)
11666 for (bool Left : {true, false})
11667 if (CheckZeros(Shift, Scale, Left)) {
11668 int ShiftAmt = MatchShift(Shift, Scale, Left);
11669 if (0 < ShiftAmt)
11670 return ShiftAmt;
11671 }
11672
11673 // no match
11674 return -1;
11675}
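// As a worked example: a v4i32 mask <z,0,z,2> (MaskOffset 0) matches at
// Scale = 2, Shift = 1, Left = true, since elements 1 and 3 are sequential
// and the shifted-in elements 0 and 2 are zeroable. This returns a 32-bit
// shift amount with Opcode = VSHLI and ShiftVT = v2i64.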
11676
11677static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11678                                   SDValue V2, ArrayRef<int> Mask,
11679 const APInt &Zeroable,
11680 const X86Subtarget &Subtarget,
11681 SelectionDAG &DAG, bool BitwiseOnly) {
11682 int Size = Mask.size();
11683 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11684
11685 MVT ShiftVT;
11686 SDValue V = V1;
11687 unsigned Opcode;
11688
11689 // Try to match shuffle against V1 shift.
11690 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11691 Mask, 0, Zeroable, Subtarget);
11692
11693 // If V1 failed, try to match shuffle against V2 shift.
11694 if (ShiftAmt < 0) {
11695 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11696 Mask, Size, Zeroable, Subtarget);
11697 V = V2;
11698 }
11699
11700 if (ShiftAmt < 0)
11701 return SDValue();
11702
11703 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11704 return SDValue();
11705
11706 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11707 "Illegal integer vector type");
11708 V = DAG.getBitcast(ShiftVT, V);
11709 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11710 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11711 return DAG.getBitcast(VT, V);
11712}
11713
11714// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11715// Remainder of lower half result is zero and upper half is all undef.
11716static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11717 ArrayRef<int> Mask, uint64_t &BitLen,
11718 uint64_t &BitIdx, const APInt &Zeroable) {
11719 int Size = Mask.size();
11720 int HalfSize = Size / 2;
11721 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11722 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11723
11724 // Upper half must be undefined.
11725 if (!isUndefUpperHalf(Mask))
11726 return false;
11727
11728 // Determine the extraction length from the part of the
11729 // lower half that isn't zeroable.
11730 int Len = HalfSize;
11731 for (; Len > 0; --Len)
11732 if (!Zeroable[Len - 1])
11733 break;
11734 assert(Len > 0 && "Zeroable shuffle mask");
11735
11736 // Attempt to match first Len sequential elements from the lower half.
11737 SDValue Src;
11738 int Idx = -1;
11739 for (int i = 0; i != Len; ++i) {
11740 int M = Mask[i];
11741 if (M == SM_SentinelUndef)
11742 continue;
11743 SDValue &V = (M < Size ? V1 : V2);
11744 M = M % Size;
11745
11746 // The extracted elements must start at a valid index and all mask
11747 // elements must be in the lower half.
11748 if (i > M || M >= HalfSize)
11749 return false;
11750
11751 if (Idx < 0 || (Src == V && Idx == (M - i))) {
11752 Src = V;
11753 Idx = M - i;
11754 continue;
11755 }
11756 return false;
11757 }
11758
11759 if (!Src || Idx < 0)
11760 return false;
11761
11762 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
11763 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11764 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11765 V1 = Src;
11766 return true;
11767}
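// As a worked example: a v8i16 mask <1,2,3,z,u,u,u,u> gives Len = 3 and
// Idx = 1, so BitLen = 48 and BitIdx = 16, i.e. extract 48 bits of V1
// starting at bit 16 and zero the remainder of the lower half.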
11768
11769// INSERTQ: Extract lowest Len elements from lower half of second source and
11770// insert over first source, starting at Idx.
11771// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
11772static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
11773 ArrayRef<int> Mask, uint64_t &BitLen,
11774 uint64_t &BitIdx) {
11775 int Size = Mask.size();
11776 int HalfSize = Size / 2;
11777 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11778
11779 // Upper half must be undefined.
11780 if (!isUndefUpperHalf(Mask))
11781 return false;
11782
11783 for (int Idx = 0; Idx != HalfSize; ++Idx) {
11784 SDValue Base;
11785
11786 // Attempt to match first source from mask before insertion point.
11787 if (isUndefInRange(Mask, 0, Idx)) {
11788 /* EMPTY */
11789 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
11790 Base = V1;
11791 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
11792 Base = V2;
11793 } else {
11794 continue;
11795 }
11796
11797 // Extend the extraction length looking to match both the insertion of
11798 // the second source and the remaining elements of the first.
11799 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
11800 SDValue Insert;
11801 int Len = Hi - Idx;
11802
11803 // Match insertion.
11804 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
11805 Insert = V1;
11806 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
11807 Insert = V2;
11808 } else {
11809 continue;
11810 }
11811
11812 // Match the remaining elements of the lower half.
11813 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
11814 /* EMPTY */
11815 } else if ((!Base || (Base == V1)) &&
11816 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
11817 Base = V1;
11818 } else if ((!Base || (Base == V2)) &&
11819 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
11820 Size + Hi)) {
11821 Base = V2;
11822 } else {
11823 continue;
11824 }
11825
11826 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
11827 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
11828 V1 = Base;
11829 V2 = Insert;
11830 return true;
11831 }
11832 }
11833
11834 return false;
11835}
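// As a worked example: a v8i16 mask <0,8,9,3,u,u,u,u> matches with Base = V1,
// Insert = V2, Idx = 1 and Len = 2, so BitLen = 32 and BitIdx = 16, i.e.
// insert the low 32 bits of V2 over V1 starting at bit 16.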
11836
11837/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
11838static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
11839                                     SDValue V2, ArrayRef<int> Mask,
11840 const APInt &Zeroable, SelectionDAG &DAG) {
11841 uint64_t BitLen, BitIdx;
11842 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
11843 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
11844 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11845 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11846
11847 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
11848 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
11849 V2 ? V2 : DAG.getUNDEF(VT),
11850 DAG.getTargetConstant(BitLen, DL, MVT::i8),
11851 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
11852
11853 return SDValue();
11854}
11855
11856/// Lower a vector shuffle as a zero or any extension.
11857///
11858/// Given a specific number of elements, element bit width, and extension
11859/// stride, produce either a zero or any extension based on the available
11860/// features of the subtarget. The extended elements are consecutive and
11861/// begin at a (possibly offset) element index in the input; to
11862/// avoid excess shuffling the offset must either be in the bottom lane
11863/// or at the start of a higher lane. All extended elements must be from
11864/// the same lane.
11865static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
11866    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
11867 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11868 assert(Scale > 1 && "Need a scale to extend.");
11869 int EltBits = VT.getScalarSizeInBits();
11870 int NumElements = VT.getVectorNumElements();
11871 int NumEltsPerLane = 128 / EltBits;
11872 int OffsetLane = Offset / NumEltsPerLane;
11873 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
11874 "Only 8, 16, and 32 bit elements can be extended.");
11875 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
11876 assert(0 <= Offset && "Extension offset must be positive.");
11877 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
11878 "Extension offset must be in the first lane or start an upper lane.");
11879
11880 // Check that an index is in same lane as the base offset.
11881 auto SafeOffset = [&](int Idx) {
11882 return OffsetLane == (Idx / NumEltsPerLane);
11883 };
11884
11885 // Shift along an input so that the offset base moves to the first element.
11886 auto ShuffleOffset = [&](SDValue V) {
11887 if (!Offset)
11888 return V;
11889
11890 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11891 for (int i = 0; i * Scale < NumElements; ++i) {
11892 int SrcIdx = i + Offset;
11893 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
11894 }
11895 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
11896 };
11897
11898 // Found a valid a/zext mask! Try various lowering strategies based on the
11899 // input type and available ISA extensions.
11900 if (Subtarget.hasSSE41()) {
11901 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
11902 // PUNPCK will catch this in a later shuffle match.
11903 if (Offset && Scale == 2 && VT.is128BitVector())
11904 return SDValue();
11905 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
11906 NumElements / Scale);
11907 InputV = DAG.getBitcast(VT, InputV);
11908 InputV = ShuffleOffset(InputV);
11909    InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
11910                                    DL, ExtVT, InputV, DAG);
11911 return DAG.getBitcast(VT, InputV);
11912 }
11913
11914 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
11915 InputV = DAG.getBitcast(VT, InputV);
11916
11917 // For any extends we can cheat for larger element sizes and use shuffle
11918 // instructions that can fold with a load and/or copy.
11919 if (AnyExt && EltBits == 32) {
11920 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
11921 -1};
11922 return DAG.getBitcast(
11923 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11924 DAG.getBitcast(MVT::v4i32, InputV),
11925 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11926 }
11927 if (AnyExt && EltBits == 16 && Scale > 2) {
11928 int PSHUFDMask[4] = {Offset / 2, -1,
11929 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
11930 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
11931 DAG.getBitcast(MVT::v4i32, InputV),
11932 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11933 int PSHUFWMask[4] = {1, -1, -1, -1};
11934 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
11935 return DAG.getBitcast(
11936 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
11937 DAG.getBitcast(MVT::v8i16, InputV),
11938 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
11939 }
11940
11941 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
11942 // to 64-bits.
11943 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
11944 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
11945 assert(VT.is128BitVector() && "Unexpected vector width!");
11946
11947 int LoIdx = Offset * EltBits;
11948 SDValue Lo = DAG.getBitcast(
11949 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11950 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11951 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
11952
11953 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
11954 return DAG.getBitcast(VT, Lo);
11955
11956 int HiIdx = (Offset + 1) * EltBits;
11957 SDValue Hi = DAG.getBitcast(
11958 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
11959 DAG.getTargetConstant(EltBits, DL, MVT::i8),
11960 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
11961 return DAG.getBitcast(VT,
11962 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
11963 }
11964
11965 // If this would require more than 2 unpack instructions to expand, use
11966 // pshufb when available. We can only use more than 2 unpack instructions
11967 // when zero extending i8 elements which also makes it easier to use pshufb.
11968 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
11969 assert(NumElements == 16 && "Unexpected byte vector width!");
11970 SDValue PSHUFBMask[16];
11971 for (int i = 0; i < 16; ++i) {
11972 int Idx = Offset + (i / Scale);
11973 if ((i % Scale == 0 && SafeOffset(Idx))) {
11974 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
11975 continue;
11976 }
11977 PSHUFBMask[i] =
11978 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
11979 }
11980 InputV = DAG.getBitcast(MVT::v16i8, InputV);
11981 return DAG.getBitcast(
11982 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
11983 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
11984 }
11985
11986 // If we are extending from an offset, ensure we start on a boundary that
11987 // we can unpack from.
11988 int AlignToUnpack = Offset % (NumElements / Scale);
11989 if (AlignToUnpack) {
11990 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
11991 for (int i = AlignToUnpack; i < NumElements; ++i)
11992 ShMask[i - AlignToUnpack] = i;
11993 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
11994 Offset -= AlignToUnpack;
11995 }
11996
11997 // Otherwise emit a sequence of unpacks.
11998 do {
11999 unsigned UnpackLoHi = X86ISD::UNPCKL;
12000 if (Offset >= (NumElements / 2)) {
12001 UnpackLoHi = X86ISD::UNPCKH;
12002 Offset -= (NumElements / 2);
12003 }
12004
12005 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12006 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12007 : getZeroVector(InputVT, Subtarget, DAG, DL);
12008 InputV = DAG.getBitcast(InputVT, InputV);
12009 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12010 Scale /= 2;
12011 EltBits *= 2;
12012 NumElements /= 2;
12013 } while (Scale > 1);
12014 return DAG.getBitcast(VT, InputV);
12015}
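// As a concrete example of the unpack fallback above: zero-extending the low
// four i8 elements of a v16i8 to i32 (Scale = 4) without SSE4.1 takes two
// rounds, first unpacking with a zero vector to i16 elements and then again
// to i32 elements.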
12016
12017/// Try to lower a vector shuffle as a zero extension on any microarch.
12018///
12019/// This routine will try to do everything in its power to cleverly lower
12020/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12021/// check for the profitability of this lowering, it tries to aggressively
12022/// match this pattern. It will use all of the micro-architectural details it
12023/// can to emit an efficient lowering. It handles both blends with all-zero
12024/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12025/// masking out later).
12026///
12027/// The reason we have dedicated lowering for zext-style shuffles is that they
12028/// are both incredibly common and often quite performance sensitive.
12029static SDValue lowerShuffleAsZeroOrAnyExtend(
12030    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12031 const APInt &Zeroable, const X86Subtarget &Subtarget,
12032 SelectionDAG &DAG) {
12033 int Bits = VT.getSizeInBits();
12034 int NumLanes = Bits / 128;
12035 int NumElements = VT.getVectorNumElements();
12036 int NumEltsPerLane = NumElements / NumLanes;
12037 assert(VT.getScalarSizeInBits() <= 32 &&
12038 "Exceeds 32-bit integer zero extension limit");
12039 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12040
12041 // Define a helper function to check a particular ext-scale and lower to it if
12042 // valid.
12043 auto Lower = [&](int Scale) -> SDValue {
12044 SDValue InputV;
12045 bool AnyExt = true;
12046 int Offset = 0;
12047 int Matches = 0;
12048 for (int i = 0; i < NumElements; ++i) {
12049 int M = Mask[i];
12050 if (M < 0)
12051 continue; // Valid anywhere but doesn't tell us anything.
12052 if (i % Scale != 0) {
12053        // Each of the extended elements needs to be zeroable.
12054 if (!Zeroable[i])
12055 return SDValue();
12056
12057 // We no longer are in the anyext case.
12058 AnyExt = false;
12059 continue;
12060 }
12061
12062      // The base elements need to be consecutive indices into the
12063 // same input vector.
12064 SDValue V = M < NumElements ? V1 : V2;
12065 M = M % NumElements;
12066 if (!InputV) {
12067 InputV = V;
12068 Offset = M - (i / Scale);
12069 } else if (InputV != V)
12070 return SDValue(); // Flip-flopping inputs.
12071
12072 // Offset must start in the lowest 128-bit lane or at the start of an
12073 // upper lane.
12074 // FIXME: Is it ever worth allowing a negative base offset?
12075 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12076 (Offset % NumEltsPerLane) == 0))
12077 return SDValue();
12078
12079 // If we are offsetting, all referenced entries must come from the same
12080 // lane.
12081 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12082 return SDValue();
12083
12084 if ((M % NumElements) != (Offset + (i / Scale)))
12085 return SDValue(); // Non-consecutive strided elements.
12086 Matches++;
12087 }
12088
12089 // If we fail to find an input, we have a zero-shuffle which should always
12090 // have already been handled.
12091 // FIXME: Maybe handle this here in case during blending we end up with one?
12092 if (!InputV)
12093 return SDValue();
12094
12095 // If we are offsetting, don't extend if we only match a single input, we
12096 // can always do better by using a basic PSHUF or PUNPCK.
12097 if (Offset != 0 && Matches < 2)
12098 return SDValue();
12099
12100 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12101 InputV, Mask, Subtarget, DAG);
12102 };
12103
12104 // The widest scale possible for extending is to a 64-bit integer.
12105 assert(Bits % 64 == 0 &&
12106 "The number of bits in a vector must be divisible by 64 on x86!");
12107 int NumExtElements = Bits / 64;
12108
12109 // Each iteration, try extending the elements half as much, but into twice as
12110 // many elements.
12111 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12112 assert(NumElements % NumExtElements == 0 &&
12113 "The input vector size must be divisible by the extended size.");
12114 if (SDValue V = Lower(NumElements / NumExtElements))
12115 return V;
12116 }
12117
12118 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12119 if (Bits != 128)
12120 return SDValue();
12121
12122 // Returns one of the source operands if the shuffle can be reduced to a
12123 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12124 auto CanZExtLowHalf = [&]() {
12125 for (int i = NumElements / 2; i != NumElements; ++i)
12126 if (!Zeroable[i])
12127 return SDValue();
12128 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12129 return V1;
12130 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12131 return V2;
12132 return SDValue();
12133 };
12134
12135 if (SDValue V = CanZExtLowHalf()) {
12136 V = DAG.getBitcast(MVT::v2i64, V);
12137 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12138 return DAG.getBitcast(VT, V);
12139 }
12140
12141 // No viable ext lowering found.
12142 return SDValue();
12143}
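// As a worked example: a v4i32 shuffle with mask <0,z,1,z> is matched by
// Lower(2) with InputV = V1, Offset = 0 and AnyExt = false, and on SSE4.1 it
// becomes a zero extension of the low two i32 elements into i64 lanes
// (PMOVZXDQ).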
12144
12145/// Try to get a scalar value for a specific element of a vector.
12146///
12147/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12148static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12149                                              SelectionDAG &DAG) {
12150 MVT VT = V.getSimpleValueType();
12151 MVT EltVT = VT.getVectorElementType();
12152 V = peekThroughBitcasts(V);
12153
12154 // If the bitcasts shift the element size, we can't extract an equivalent
12155 // element from it.
12156 MVT NewVT = V.getSimpleValueType();
12157 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12158 return SDValue();
12159
12160 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12161 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12162 // Ensure the scalar operand is the same size as the destination.
12163 // FIXME: Add support for scalar truncation where possible.
12164 SDValue S = V.getOperand(Idx);
12165 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12166 return DAG.getBitcast(EltVT, S);
12167 }
12168
12169 return SDValue();
12170}
12171
12172/// Helper to test for a load that can be folded with x86 shuffles.
12173///
12174/// This is particularly important because the set of instructions varies
12175/// significantly based on whether the operand is a load or not.
12176static bool isShuffleFoldableLoad(SDValue V) {
12177  return V->hasOneUse() &&
12178         ISD::isNormalLoad(V.getNode());
12179}
12180
12181template<typename T>
12182static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12183 T EltVT = VT.getScalarType();
12184 return EltVT == MVT::bf16 || (EltVT == MVT::f16 && !Subtarget.hasFP16());
12185}
12186
12187/// Try to lower insertion of a single element into a zero vector.
12188///
12189/// This is a common pattern for which we have especially efficient lowerings
12190/// across all subtarget feature sets.
12191static SDValue lowerShuffleAsElementInsertion(
12192    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12193 const APInt &Zeroable, const X86Subtarget &Subtarget,
12194 SelectionDAG &DAG) {
12195 MVT ExtVT = VT;
12196 MVT EltVT = VT.getVectorElementType();
12197 unsigned NumElts = VT.getVectorNumElements();
12198 unsigned EltBits = VT.getScalarSizeInBits();
12199
12200 if (isSoftF16(EltVT, Subtarget))
12201 return SDValue();
12202
12203 int V2Index =
12204 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12205 Mask.begin();
12206 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12207 bool IsV1Zeroable = true;
12208 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12209 if (i != V2Index && !Zeroable[i]) {
12210 IsV1Zeroable = false;
12211 break;
12212 }
12213
12214 // Bail if a non-zero V1 isn't used in place.
12215 if (!IsV1Zeroable) {
12216 SmallVector<int, 8> V1Mask(Mask);
12217 V1Mask[V2Index] = -1;
12218 if (!isNoopShuffleMask(V1Mask))
12219 return SDValue();
12220 }
12221
12222 // Check for a single input from a SCALAR_TO_VECTOR node.
12223 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12224 // all the smarts here sunk into that routine. However, the current
12225 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12226 // vector shuffle lowering is dead.
12227 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12228 DAG);
12229 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12230 // We need to zext the scalar if it is smaller than an i32.
12231 V2S = DAG.getBitcast(EltVT, V2S);
12232 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12233 // Using zext to expand a narrow element won't work for non-zero
12234 // insertions. But we can use a masked constant vector if we're
12235 // inserting V2 into the bottom of V1.
12236 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12237 return SDValue();
12238
12239 // Zero-extend directly to i32.
12240 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12241 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12242
12243 // If we're inserting into a constant, mask off the inserted index
12244 // and OR with the zero-extended scalar.
12245 if (!IsV1Zeroable) {
12246 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12247 Bits[V2Index] = APInt::getZero(EltBits);
12248 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12249 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12250 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12251 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12252 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12253 }
12254 }
12255 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12256 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12257 EltVT == MVT::i16) {
12258 // Either not inserting from the low element of the input or the input
12259 // element size is too small to use VZEXT_MOVL to clear the high bits.
12260 return SDValue();
12261 }
12262
12263 if (!IsV1Zeroable) {
12264 // If V1 can't be treated as a zero vector we have fewer options to lower
12265 // this. We can't support integer vectors or non-zero targets cheaply.
12266 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12267 if (!VT.isFloatingPoint() || V2Index != 0)
12268 return SDValue();
12269 if (!VT.is128BitVector())
12270 return SDValue();
12271
12272 // Otherwise, use MOVSD, MOVSS or MOVSH.
12273 unsigned MovOpc = 0;
12274 if (EltVT == MVT::f16)
12275 MovOpc = X86ISD::MOVSH;
12276 else if (EltVT == MVT::f32)
12277 MovOpc = X86ISD::MOVSS;
12278 else if (EltVT == MVT::f64)
12279 MovOpc = X86ISD::MOVSD;
12280 else
12281 llvm_unreachable("Unsupported floating point element type to handle!");
12282 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12283 }
12284
12285 // This lowering only works for the low element with floating point vectors.
12286 if (VT.isFloatingPoint() && V2Index != 0)
12287 return SDValue();
12288
12289 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12290 if (ExtVT != VT)
12291 V2 = DAG.getBitcast(VT, V2);
12292
12293 if (V2Index != 0) {
12294 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12295 // the desired position. Otherwise it is more efficient to do a vector
12296 // shift left. We know that we can do a vector shift left because all
12297 // the inputs are zero.
12298 if (VT.isFloatingPoint() || NumElts <= 4) {
12299 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12300 V2Shuffle[V2Index] = 0;
12301 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12302 } else {
12303 V2 = DAG.getBitcast(MVT::v16i8, V2);
12304 V2 = DAG.getNode(
12305 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12306 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12307 V2 = DAG.getBitcast(VT, V2);
12308 }
12309 }
12310 return V2;
12311}
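// As a worked example: a v4f32 shuffle with mask <4,1,2,3>, where V2 is a
// SCALAR_TO_VECTOR of a loaded float and V1 is not zeroable, keeps V1 in
// place and inserts the scalar with MOVSS.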
12312
12313/// Try to lower broadcast of a single - truncated - integer element,
12314/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12315///
12316/// This assumes we have AVX2.
12317static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12318                                            int BroadcastIdx,
12319 const X86Subtarget &Subtarget,
12320 SelectionDAG &DAG) {
12321 assert(Subtarget.hasAVX2() &&
12322 "We can only lower integer broadcasts with AVX2!");
12323
12324 MVT EltVT = VT.getVectorElementType();
12325 MVT V0VT = V0.getSimpleValueType();
12326
12327 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12328 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12329
12330 MVT V0EltVT = V0VT.getVectorElementType();
12331 if (!V0EltVT.isInteger())
12332 return SDValue();
12333
12334 const unsigned EltSize = EltVT.getSizeInBits();
12335 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12336
12337 // This is only a truncation if the original element type is larger.
12338 if (V0EltSize <= EltSize)
12339 return SDValue();
12340
12341 assert(((V0EltSize % EltSize) == 0) &&
12342 "Scalar type sizes must all be powers of 2 on x86!");
12343
12344 const unsigned V0Opc = V0.getOpcode();
12345 const unsigned Scale = V0EltSize / EltSize;
12346 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12347
12348 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12349 V0Opc != ISD::BUILD_VECTOR)
12350 return SDValue();
12351
12352 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12353
12354 // If we're extracting non-least-significant bits, shift so we can truncate.
12355 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12356 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12357 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12358 if (const int OffsetIdx = BroadcastIdx % Scale)
12359 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12360 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12361
12362 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12363 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12364}
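// As a worked example: with VT = v8i32, V0 a v4i64 BUILD_VECTOR and
// BroadcastIdx = 1, Scale = 2 gives V0BroadcastIdx = 0 and OffsetIdx = 1, so
// the i64 scalar is shifted right by 32 bits, truncated to i32 and then
// broadcast.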
12365
12366/// Test whether this can be lowered with a single SHUFPS instruction.
12367///
12368/// This is used to disable more specialized lowerings when the shufps lowering
12369/// will happen to be efficient.
12370static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12371  // This routine only handles 128-bit shufps.
12372 assert(Mask.size() == 4 && "Unsupported mask size!");
12373 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12374 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12375 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12376 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12377
12378 // To lower with a single SHUFPS we need to have the low half and high half
12379 // each requiring a single input.
12380 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12381 return false;
12382 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12383 return false;
12384
12385 return true;
12386}
12387
12388/// Test whether the specified input (0 or 1) is in-place blended by the
12389/// given mask.
12390///
12391/// This returns true if the elements from a particular input are already in the
12392/// slot required by the given mask and require no permutation.
12393static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12394 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12395 int Size = Mask.size();
12396 for (int i = 0; i < Size; ++i)
12397 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12398 return false;
12399
12400 return true;
12401}
12402
12403/// If we are extracting two 128-bit halves of a vector and shuffling the
12404/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12405/// multi-shuffle lowering.
12406static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12407                                             SDValue N1, ArrayRef<int> Mask,
12408 SelectionDAG &DAG) {
12409 MVT VT = N0.getSimpleValueType();
12410 assert((VT.is128BitVector() &&
12411 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12412 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12413
12414 // Check that both sources are extracts of the same source vector.
12415 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12416      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12417      N0.getOperand(0) != N1.getOperand(0) ||
12418 !N0.hasOneUse() || !N1.hasOneUse())
12419 return SDValue();
12420
12421 SDValue WideVec = N0.getOperand(0);
12422 MVT WideVT = WideVec.getSimpleValueType();
12423 if (!WideVT.is256BitVector())
12424 return SDValue();
12425
12426 // Match extracts of each half of the wide source vector. Commute the shuffle
12427 // if the extract of the low half is N1.
12428 unsigned NumElts = VT.getVectorNumElements();
12429 SmallVector<int, 4> NewMask(Mask);
12430 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12431 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12432 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12433    ShuffleVectorSDNode::commuteMask(NewMask);
12434  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12435 return SDValue();
12436
12437 // Final bailout: if the mask is simple, we are better off using an extract
12438 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12439 // because that avoids a constant load from memory.
12440 if (NumElts == 4 &&
12441 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12442 return SDValue();
12443
12444 // Extend the shuffle mask with undef elements.
12445 NewMask.append(NumElts, -1);
12446
12447 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12448 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12449 NewMask);
12450 // This is free: ymm -> xmm.
12451 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12452 DAG.getIntPtrConstant(0, DL));
12453}
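// As a worked example: shuffling extract_subvector(X, 0) and
// extract_subvector(X, 4) of a v8f32 X with mask <0,7,2,5> becomes a single
// v8f32 shuffle of X with mask <0,7,2,5,u,u,u,u> followed by a free
// ymm-to-xmm extract.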
12454
12455/// Try to lower broadcast of a single element.
12456///
12457/// For convenience, this code also bundles all of the subtarget feature set
12458/// filtering. While a little annoying to re-dispatch on type here, there isn't
12459/// a convenient way to factor it out.
12460static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12461                                       SDValue V2, ArrayRef<int> Mask,
12462 const X86Subtarget &Subtarget,
12463 SelectionDAG &DAG) {
12464 MVT EltVT = VT.getVectorElementType();
12465 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12466 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12467 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12468 return SDValue();
12469
12470 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12471 // we can only broadcast from a register with AVX2.
12472 unsigned NumEltBits = VT.getScalarSizeInBits();
12473 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12474                        ? X86ISD::MOVDDUP
12475                        : X86ISD::VBROADCAST;
12476  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12477
12478 // Check that the mask is a broadcast.
12479 int BroadcastIdx = getSplatIndex(Mask);
12480 if (BroadcastIdx < 0)
12481 return SDValue();
12482 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12483 "a sorted mask where the broadcast "
12484 "comes from V1.");
12485
12486 // Go up the chain of (vector) values to find a scalar load that we can
12487 // combine with the broadcast.
12488 // TODO: Combine this logic with findEltLoadSrc() used by
12489 // EltsFromConsecutiveLoads().
12490 int BitOffset = BroadcastIdx * NumEltBits;
12491 SDValue V = V1;
12492 for (;;) {
12493 switch (V.getOpcode()) {
12494 case ISD::BITCAST: {
12495 V = V.getOperand(0);
12496 continue;
12497 }
12498 case ISD::CONCAT_VECTORS: {
12499 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12500 int OpIdx = BitOffset / OpBitWidth;
12501 V = V.getOperand(OpIdx);
12502 BitOffset %= OpBitWidth;
12503 continue;
12504 }
12505    case ISD::EXTRACT_SUBVECTOR: {
12506      // The extraction index adds to the existing offset.
12507 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12508 unsigned Idx = V.getConstantOperandVal(1);
12509 unsigned BeginOffset = Idx * EltBitWidth;
12510 BitOffset += BeginOffset;
12511 V = V.getOperand(0);
12512 continue;
12513 }
12514 case ISD::INSERT_SUBVECTOR: {
12515 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12516 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12517 int Idx = (int)V.getConstantOperandVal(2);
12518 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12519 int BeginOffset = Idx * EltBitWidth;
12520 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12521 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12522 BitOffset -= BeginOffset;
12523 V = VInner;
12524 } else {
12525 V = VOuter;
12526 }
12527 continue;
12528 }
12529 }
12530 break;
12531 }
12532 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12533 BroadcastIdx = BitOffset / NumEltBits;
12534
12535 // Do we need to bitcast the source to retrieve the original broadcast index?
12536 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12537
12538 // Check if this is a broadcast of a scalar. We special case lowering
12539 // for scalars so that we can more effectively fold with loads.
12540 // If the original value has a larger element type than the shuffle, the
12541 // broadcast element is in essence truncated. Make that explicit to ease
12542 // folding.
12543 if (BitCastSrc && VT.isInteger())
12544 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12545 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12546 return TruncBroadcast;
12547
12548 // Also check the simpler case, where we can directly reuse the scalar.
12549 if (!BitCastSrc &&
12550 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12551 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12552 V = V.getOperand(BroadcastIdx);
12553
12554 // If we can't broadcast from a register, check that the input is a load.
12555 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12556 return SDValue();
12557 } else if (ISD::isNormalLoad(V.getNode()) &&
12558 cast<LoadSDNode>(V)->isSimple()) {
12559 // We do not check for one-use of the vector load because a broadcast load
12560 // is expected to be a win for code size, register pressure, and possibly
12561 // uops even if the original vector load is not eliminated.
12562
12563 // Reduce the vector load and shuffle to a broadcasted scalar load.
12564 LoadSDNode *Ld = cast<LoadSDNode>(V);
12565 SDValue BaseAddr = Ld->getOperand(1);
12566 MVT SVT = VT.getScalarType();
12567 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12568 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12569 SDValue NewAddr =
12570        DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12571
12572 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12573 // than MOVDDUP.
12574 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12575 if (Opcode == X86ISD::VBROADCAST) {
12576 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12577 SDValue Ops[] = {Ld->getChain(), NewAddr};
12578 V = DAG.getMemIntrinsicNode(
12579 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12580          DAG.getMachineFunction().getMachineMemOperand(
12581              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12582      DAG.makeEquivalentMemoryOrdering(Ld, V);
12583      return DAG.getBitcast(VT, V);
12584 }
12585 assert(SVT == MVT::f64 && "Unexpected VT!");
12586 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12587                    DAG.getMachineFunction().getMachineMemOperand(
12588                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12589    DAG.makeEquivalentMemoryOrdering(Ld, V);
12590  } else if (!BroadcastFromReg) {
12591 // We can't broadcast from a vector register.
12592 return SDValue();
12593 } else if (BitOffset != 0) {
12594 // We can only broadcast from the zero-element of a vector register,
12595 // but it can be advantageous to broadcast from the zero-element of a
12596 // subvector.
12597 if (!VT.is256BitVector() && !VT.is512BitVector())
12598 return SDValue();
12599
12600 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12601 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12602 return SDValue();
12603
12604 // Only broadcast the zero-element of a 128-bit subvector.
12605 if ((BitOffset % 128) != 0)
12606 return SDValue();
12607
12608 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12609 "Unexpected bit-offset");
12610 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12611 "Unexpected vector size");
12612 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12613 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12614 }
12615
12616 // On AVX we can use VBROADCAST directly for scalar sources.
12617 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12618 V = DAG.getBitcast(MVT::f64, V);
12619 if (Subtarget.hasAVX()) {
12620 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12621 return DAG.getBitcast(VT, V);
12622 }
12623 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12624 }
12625
12626 // If this is a scalar, do the broadcast on this type and bitcast.
12627 if (!V.getValueType().isVector()) {
12628 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12629 "Unexpected scalar size");
12630 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12631                                       VT.getVectorNumElements());
12632    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12633 }
12634
12635 // We only support broadcasting from 128-bit vectors to minimize the
12636 // number of patterns we need to deal with in isel. So extract down to
12637 // 128-bits, removing as many bitcasts as possible.
12638 if (V.getValueSizeInBits() > 128)
12639    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12640
12641 // Otherwise cast V to a vector with the same element type as VT, but
12642 // possibly narrower than VT. Then perform the broadcast.
12643 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12644 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12645 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12646}
12647
12648// Check for whether we can use INSERTPS to perform the shuffle. We only use
12649// INSERTPS when the V1 elements are already in the correct locations
12650// because otherwise we can just always use two SHUFPS instructions which
12651// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12652// perform INSERTPS if a single V1 element is out of place and all V2
12653// elements are zeroable.
12654static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12655                                   unsigned &InsertPSMask,
12656 const APInt &Zeroable,
12657 ArrayRef<int> Mask, SelectionDAG &DAG) {
12658 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12659 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12660 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12661
12662 // Attempt to match INSERTPS with one element from VA or VB being
12663 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12664 // are updated.
12665 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12666 ArrayRef<int> CandidateMask) {
12667 unsigned ZMask = 0;
12668 int VADstIndex = -1;
12669 int VBDstIndex = -1;
12670 bool VAUsedInPlace = false;
12671
12672 for (int i = 0; i < 4; ++i) {
12673 // Synthesize a zero mask from the zeroable elements (includes undefs).
12674 if (Zeroable[i]) {
12675 ZMask |= 1 << i;
12676 continue;
12677 }
12678
12679 // Flag if we use any VA inputs in place.
12680 if (i == CandidateMask[i]) {
12681 VAUsedInPlace = true;
12682 continue;
12683 }
12684
12685 // We can only insert a single non-zeroable element.
12686 if (VADstIndex >= 0 || VBDstIndex >= 0)
12687 return false;
12688
12689 if (CandidateMask[i] < 4) {
12690 // VA input out of place for insertion.
12691 VADstIndex = i;
12692 } else {
12693 // VB input for insertion.
12694 VBDstIndex = i;
12695 }
12696 }
12697
12698 // Don't bother if we have no (non-zeroable) element for insertion.
12699 if (VADstIndex < 0 && VBDstIndex < 0)
12700 return false;
12701
12702 // Determine element insertion src/dst indices. The src index is from the
12703 // start of the inserted vector, not the start of the concatenated vector.
12704 unsigned VBSrcIndex = 0;
12705 if (VADstIndex >= 0) {
12706 // If we have a VA input out of place, we use VA as the V2 element
12707 // insertion and don't use the original V2 at all.
12708 VBSrcIndex = CandidateMask[VADstIndex];
12709 VBDstIndex = VADstIndex;
12710 VB = VA;
12711 } else {
12712 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
12713 }
12714
12715 // If no V1 inputs are used in place, then the result is created only from
12716 // the zero mask and the V2 insertion - so remove V1 dependency.
12717 if (!VAUsedInPlace)
12718 VA = DAG.getUNDEF(MVT::v4f32);
12719
12720 // Update V1, V2 and InsertPSMask accordingly.
12721 V1 = VA;
12722 V2 = VB;
12723
12724 // Insert the V2 element into the desired position.
12725 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
12726 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
12727 return true;
12728 };
12729
12730 if (matchAsInsertPS(V1, V2, Mask))
12731 return true;
12732
12733 // Commute and try again.
12734 SmallVector<int, 4> CommutedMask(Mask);
12735  ShuffleVectorSDNode::commuteMask(CommutedMask);
12736  if (matchAsInsertPS(V2, V1, CommutedMask))
12737 return true;
12738
12739 return false;
12740}
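// The INSERTPS immediate built above encodes the source element in bits
// [7:6], the destination slot in bits [5:4] and the zero mask in bits [3:0].
// For example, mask <0,5,2,z> gives VBSrcIndex = 1, VBDstIndex = 1 and
// ZMask = 0x8, i.e. an immediate of 0x58.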
12741
12742static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
12743                                      ArrayRef<int> Mask, const APInt &Zeroable,
12744 SelectionDAG &DAG) {
12745 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12746 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
12747
12748 // Attempt to match the insertps pattern.
12749 unsigned InsertPSMask = 0;
12750 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
12751 return SDValue();
12752
12753 // Insert the V2 element into the desired position.
12754 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
12755 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
12756}
12757
12758/// Handle lowering of 2-lane 64-bit floating point shuffles.
12759///
12760/// This is the basis function for the 2-lane 64-bit shuffles as we have full
12761/// support for floating point shuffles but not integer shuffles. These
12762/// instructions will incur a domain crossing penalty on some chips though so
12763/// it is better to avoid lowering through this for integer vectors where
12764/// possible.
12765static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12766                                 const APInt &Zeroable, SDValue V1, SDValue V2,
12767 const X86Subtarget &Subtarget,
12768 SelectionDAG &DAG) {
12769 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12770 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
12771 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12772
12773 if (V2.isUndef()) {
12774 // Check for being able to broadcast a single element.
12775 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
12776 Mask, Subtarget, DAG))
12777 return Broadcast;
12778
12779 // Straight shuffle of a single input vector. Simulate this by using the
12780    // single input as both of the "inputs" to this instruction.
12781 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
12782
12783 if (Subtarget.hasAVX()) {
12784 // If we have AVX, we can use VPERMILPS which will allow folding a load
12785 // into the shuffle.
12786 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
12787 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12788 }
12789
12790 return DAG.getNode(
12791 X86ISD::SHUFP, DL, MVT::v2f64,
12792 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12793 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
12794 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12795 }
12796 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12797 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
12798 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12799 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12800
12801 if (Subtarget.hasAVX2())
12802 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12803 return Extract;
12804
12805 // When loading a scalar and then shuffling it into a vector we can often do
12806 // the insertion cheaply.
12807  if (SDValue Insertion = lowerShuffleAsElementInsertion(
12808          DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12809 return Insertion;
12810 // Try inverting the insertion since for v2 masks it is easy to do and we
12811 // can't reliably sort the mask one way or the other.
12812 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
12813 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
12814  if (SDValue Insertion = lowerShuffleAsElementInsertion(
12815          DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12816 return Insertion;
12817
12818 // Try to use one of the special instruction patterns to handle two common
12819 // blend patterns if a zero-blend above didn't work.
12820 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
12821 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
12822 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
12823 // We can either use a special instruction to load over the low double or
12824 // to move just the low double.
12825 return DAG.getNode(
12826 X86ISD::MOVSD, DL, MVT::v2f64, V2,
12827 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
12828
12829 if (Subtarget.hasSSE41())
12830 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
12831 Zeroable, Subtarget, DAG))
12832 return Blend;
12833
12834 // Use dedicated unpack instructions for masks that match their pattern.
12835 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
12836 return V;
12837
12838 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
12839 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
12840 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
12841}
12842
12843/// Handle lowering of 2-lane 64-bit integer shuffles.
12844///
12845/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
12846/// the integer unit to minimize domain crossing penalties. However, for blends
12847/// it falls back to the floating point shuffle operation with appropriate bit
12848/// casting.
12849static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
12850                                 const APInt &Zeroable, SDValue V1, SDValue V2,
12851 const X86Subtarget &Subtarget,
12852 SelectionDAG &DAG) {
12853 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12854 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
12855 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
12856
12857 if (V2.isUndef()) {
12858 // Check for being able to broadcast a single element.
12859 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
12860 Mask, Subtarget, DAG))
12861 return Broadcast;
12862
12863 // Straight shuffle of a single input vector. For everything from SSE2
12864 // onward this has a single fast instruction with no scary immediates.
12865 // We have to map the mask as it is actually a v4i32 shuffle instruction.
12866 V1 = DAG.getBitcast(MVT::v4i32, V1);
12867 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
12868 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
12869 Mask[1] < 0 ? -1 : (Mask[1] * 2),
12870 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
12871 return DAG.getBitcast(
12872 MVT::v2i64,
12873 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
12874 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
12875 }
12876 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
12877 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
12878 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
12879 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
12880
12881 if (Subtarget.hasAVX2())
12882 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
12883 return Extract;
12884
12885 // Try to use shift instructions.
12886 if (SDValue Shift =
12887 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
12888 DAG, /*BitwiseOnly*/ false))
12889 return Shift;
12890
12891 // When loading a scalar and then shuffling it into a vector we can often do
12892 // the insertion cheaply.
12893  if (SDValue Insertion = lowerShuffleAsElementInsertion(
12894          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
12895 return Insertion;
12896 // Try inverting the insertion since for v2 masks it is easy to do and we
12897 // can't reliably sort the mask one way or the other.
12898 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
12899  if (SDValue Insertion = lowerShuffleAsElementInsertion(
12900          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
12901 return Insertion;
12902
12903 // We have different paths for blend lowering, but they all must use the
12904 // *exact* same predicate.
12905 bool IsBlendSupported = Subtarget.hasSSE41();
12906 if (IsBlendSupported)
12907 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
12908 Zeroable, Subtarget, DAG))
12909 return Blend;
12910
12911 // Use dedicated unpack instructions for masks that match their pattern.
12912 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
12913 return V;
12914
12915 // Try to use byte rotation instructions.
12916  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
12917 if (Subtarget.hasSSSE3()) {
12918 if (Subtarget.hasVLX())
12919 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
12920 Zeroable, Subtarget, DAG))
12921 return Rotate;
12922
12923 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
12924 Subtarget, DAG))
12925 return Rotate;
12926 }
12927
12928 // If we have direct support for blends, we should lower by decomposing into
12929 // a permute. That will be faster than the domain cross.
12930 if (IsBlendSupported)
12931 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
12932 Subtarget, DAG);
12933
12934 // We implement this with SHUFPD which is pretty lame because it will likely
12935 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
12936 // However, all the alternatives are still more cycles and newer chips don't
12937 // have this problem. It would be really nice if x86 had better shuffles here.
12938 V1 = DAG.getBitcast(MVT::v2f64, V1);
12939 V2 = DAG.getBitcast(MVT::v2f64, V2);
12940 return DAG.getBitcast(MVT::v2i64,
12941 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
12942}
12943
12944/// Lower a vector shuffle using the SHUFPS instruction.
12945///
12946/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
12947/// It makes no assumptions about whether this is the *best* lowering, it simply
12948/// uses it.
12949static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
12950                                      ArrayRef<int> Mask, SDValue V1,
12951 SDValue V2, SelectionDAG &DAG) {
12952 SDValue LowV = V1, HighV = V2;
12953 SmallVector<int, 4> NewMask(Mask);
12954 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
12955
12956 if (NumV2Elements == 1) {
12957 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
12958
12959 // Compute the index adjacent to V2Index and in the same half by toggling
12960 // the low bit.
12961 int V2AdjIndex = V2Index ^ 1;
12962
12963 if (Mask[V2AdjIndex] < 0) {
12964 // Handles all the cases where we have a single V2 element and an undef.
12965 // This will only ever happen in the high lanes because we commute the
12966 // vector otherwise.
12967 if (V2Index < 2)
12968 std::swap(LowV, HighV);
12969 NewMask[V2Index] -= 4;
12970 } else {
12971 // Handle the case where the V2 element ends up adjacent to a V1 element.
12972 // To make this work, blend them together as the first step.
12973 int V1Index = V2AdjIndex;
12974 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
12975 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
12976 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
12977
12978 // Now proceed to reconstruct the final blend as we have the necessary
12979 // high or low half formed.
12980 if (V2Index < 2) {
12981 LowV = V2;
12982 HighV = V1;
12983 } else {
12984 HighV = V2;
12985 }
12986 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
12987 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
12988 }
12989 } else if (NumV2Elements == 2) {
12990 if (Mask[0] < 4 && Mask[1] < 4) {
12991 // Handle the easy case where we have V1 in the low lanes and V2 in the
12992 // high lanes.
12993 NewMask[2] -= 4;
12994 NewMask[3] -= 4;
12995 } else if (Mask[2] < 4 && Mask[3] < 4) {
12996 // We also handle the reversed case because this utility may get called
12997 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
12998 // arrange things in the right direction.
12999 NewMask[0] -= 4;
13000 NewMask[1] -= 4;
13001 HighV = V1;
13002 LowV = V2;
13003 } else {
13004 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13005 // trying to place elements directly, just blend them and set up the final
13006 // shuffle to place them.
13007
13008 // The first two blend mask elements are for V1, the second two are for
13009 // V2.
13010 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13011 Mask[2] < 4 ? Mask[2] : Mask[3],
13012 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13013 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13014 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13015 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13016
13017 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13018 // a blend.
13019 LowV = HighV = V1;
13020 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13021 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13022 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13023 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13024 }
13025 } else if (NumV2Elements == 3) {
13026 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13027 // we can get here due to other paths (e.g. repeated mask matching) where we
13028 // don't want to do another round of lowerVECTOR_SHUFFLE.
13029 ShuffleVectorSDNode::commuteMask(NewMask);
13030 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13031 }
13032 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13033 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13034}
13035
13036/// Lower 4-lane 32-bit floating point shuffles.
13037///
13038/// Uses instructions exclusively from the floating point unit to minimize
13039/// domain crossing penalties, as these are sufficient to implement all v4f32
13040/// shuffles.
13041static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13042 const APInt &Zeroable, SDValue V1, SDValue V2,
13043 const X86Subtarget &Subtarget,
13044 SelectionDAG &DAG) {
13045 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13046 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13047 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13048
13049 if (Subtarget.hasSSE41())
13050 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13051 Zeroable, Subtarget, DAG))
13052 return Blend;
13053
13054 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13055
13056 if (NumV2Elements == 0) {
13057 // Check for being able to broadcast a single element.
13058 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13059 Mask, Subtarget, DAG))
13060 return Broadcast;
13061
13062 // Use even/odd duplicate instructions for masks that match their pattern.
13063 if (Subtarget.hasSSE3()) {
13064 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13065 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13066 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13067 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13068 }
13069
13070 if (Subtarget.hasAVX()) {
13071 // If we have AVX, we can use VPERMILPS which will allow folding a load
13072 // into the shuffle.
13073 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13074 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13075 }
13076
13077 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13078 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13079 if (!Subtarget.hasSSE2()) {
13080 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13081 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13082 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13083 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13084 }
13085
13086 // Otherwise, use a straight shuffle of a single input vector. We pass the
13087 // input vector to both operands to simulate this with a SHUFPS.
13088 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13089 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13090 }
13091
13092 if (Subtarget.hasSSE2())
13093 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13094 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13095 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13096 return ZExt;
13097 }
13098
13099 if (Subtarget.hasAVX2())
13100 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13101 return Extract;
13102
13103 // There are special ways we can lower some single-element blends. However, we
13104 // have custom ways to lower more complex single-element blends below, which we
13105 // defer to if both this and BLENDPS fail to match, so restrict this to the
13106 // case where the V2 input targets element 0 of the mask -- that is the fast
13107 // case here.
13108 if (NumV2Elements == 1 && Mask[0] >= 4)
13109 if (SDValue V = lowerShuffleAsElementInsertion(
13110 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13111 return V;
13112
13113 if (Subtarget.hasSSE41()) {
13114 // Use INSERTPS if we can complete the shuffle efficiently.
13115 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13116 return V;
13117
13118 if (!isSingleSHUFPSMask(Mask))
13119 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13120 V2, Mask, DAG))
13121 return BlendPerm;
13122 }
13123
13124 // Use low/high mov instructions. These are only valid in SSE1 because
13125 // otherwise they are widened to v2f64 and never get here.
13126 if (!Subtarget.hasSSE2()) {
13127 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13128 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13129 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13130 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13131 }
13132
13133 // Use dedicated unpack instructions for masks that match their pattern.
13134 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
13135 return V;
13136
13137 // Otherwise fall back to a SHUFPS lowering strategy.
13138 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13139}
13140
13141/// Lower 4-lane i32 vector shuffles.
13142///
13143/// We try to handle these with integer-domain shuffles where we can, but for
13144/// blends we use the floating point domain blend instructions.
13145static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13146 const APInt &Zeroable, SDValue V1, SDValue V2,
13147 const X86Subtarget &Subtarget,
13148 SelectionDAG &DAG) {
13149 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13150 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13151 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13152
13153 // Whenever we can lower this as a zext, that instruction is strictly faster
13154 // than any alternative. It also allows us to fold memory operands into the
13155 // shuffle in many cases.
13156 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13157 Zeroable, Subtarget, DAG))
13158 return ZExt;
13159
13160 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13161
13162 // Try to use shift instructions if fast.
13163 if (Subtarget.preferLowerShuffleAsShift()) {
13164 if (SDValue Shift =
13165 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13166 Subtarget, DAG, /*BitwiseOnly*/ true))
13167 return Shift;
13168 if (NumV2Elements == 0)
13169 if (SDValue Rotate =
13170 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13171 return Rotate;
13172 }
13173
13174 if (NumV2Elements == 0) {
13175 // Try to use broadcast unless the mask only has one non-undef element.
13176 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13177 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13178 Mask, Subtarget, DAG))
13179 return Broadcast;
13180 }
13181
13182 // Straight shuffle of a single input vector. For everything from SSE2
13183 // onward this has a single fast instruction with no scary immediates.
13184 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13185 // but we aren't actually going to use the UNPCK instruction because doing
13186 // so prevents folding a load into this instruction or making a copy.
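 // For example, <0,0,1,1> is emitted as a single PSHUFD with immediate 0x50
 // rather than as an UNPCKLDQ of the input with itself.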
13187 const int UnpackLoMask[] = {0, 0, 1, 1};
13188 const int UnpackHiMask[] = {2, 2, 3, 3};
13189 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13190 Mask = UnpackLoMask;
13191 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13192 Mask = UnpackHiMask;
13193
13194 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13195 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13196 }
13197
13198 if (Subtarget.hasAVX2())
13199 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13200 return Extract;
13201
13202 // Try to use shift instructions.
13203 if (SDValue Shift =
13204 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13205 DAG, /*BitwiseOnly*/ false))
13206 return Shift;
13207
13208 // There are special ways we can lower some single-element blends.
13209 if (NumV2Elements == 1)
13210 if (SDValue V = lowerShuffleAsElementInsertion(
13211 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13212 return V;
13213
13214 // We have different paths for blend lowering, but they all must use the
13215 // *exact* same predicate.
13216 bool IsBlendSupported = Subtarget.hasSSE41();
13217 if (IsBlendSupported)
13218 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13219 Zeroable, Subtarget, DAG))
13220 return Blend;
13221
13222 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13223 Zeroable, Subtarget, DAG))
13224 return Masked;
13225
13226 // Use dedicated unpack instructions for masks that match their pattern.
13227 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
13228 return V;
13229
13230 // Try to use byte rotation instructions.
13231 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13232 if (Subtarget.hasSSSE3()) {
13233 if (Subtarget.hasVLX())
13234 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13235 Zeroable, Subtarget, DAG))
13236 return Rotate;
13237
13238 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13239 Subtarget, DAG))
13240 return Rotate;
13241 }
13242
13243 // Assume that a single SHUFPS is faster than an alternative sequence of
13244 // multiple instructions (even if the CPU has a domain penalty).
13245 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13246 if (!isSingleSHUFPSMask(Mask)) {
13247 // If we have direct support for blends, we should lower by decomposing into
13248 // a permute. That will be faster than the domain cross.
13249 if (IsBlendSupported)
13250 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13251 Subtarget, DAG);
13252
13253 // Try to lower by permuting the inputs into an unpack instruction.
13254 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13255 Mask, Subtarget, DAG))
13256 return Unpack;
13257 }
13258
13259 // We implement this with SHUFPS because it can blend from two vectors.
13260 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13261 // up the inputs, bypassing domain shift penalties that we would incur if we
13262 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13263 // relevant.
13264 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13265 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13266 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13267 return DAG.getBitcast(MVT::v4i32, ShufPS);
13268}
13269
13270/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13271/// shuffle lowering, and the most complex part.
13272///
13273/// The lowering strategy is to try to form pairs of input lanes which are
13274/// targeted at the same half of the final vector, and then use a dword shuffle
13275/// to place them onto the right half, and finally unpack the paired lanes into
13276/// their final position.
13277///
13278/// The exact breakdown of how to form these dword pairs and align them on the
13279/// correct sides is really tricky. See the comments within the function for
13280/// more of the details.
13281///
13282/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13283/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13284/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13285/// vector, form the analogous 128-bit 8-element Mask.
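///
/// For example, the single-input reversal mask <7,6,5,4,3,2,1,0> becomes a
/// PSHUFD with mask <2,3,0,1> to swap the two dword halves, followed by a
/// PSHUFLW and a PSHUFHW, each with mask <3,2,1,0>, to reverse the words
/// within each half.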
13286static SDValue lowerV8I16GeneralSingleInputShuffle(
13287 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13288 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13289 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13290 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13291
13292 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13293 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13294 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13295
13296 // Attempt to directly match PSHUFLW or PSHUFHW.
13297 if (isUndefOrInRange(LoMask, 0, 4) &&
13298 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13299 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13300 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13301 }
13302 if (isUndefOrInRange(HiMask, 4, 8) &&
13303 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13304 for (int i = 0; i != 4; ++i)
13305 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13306 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13307 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13308 }
13309
13310 SmallVector<int, 4> LoInputs;
13311 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13312 array_pod_sort(LoInputs.begin(), LoInputs.end());
13313 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
13314 SmallVector<int, 4> HiInputs;
13315 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13316 array_pod_sort(HiInputs.begin(), HiInputs.end());
13317 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
13318 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13319 int NumHToL = LoInputs.size() - NumLToL;
13320 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13321 int NumHToH = HiInputs.size() - NumLToH;
13322 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13323 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13324 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13325 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13326
13327 // If we are shuffling values from one half - check how many different DWORD
13328 // pairs we need to create. If only 1 or 2 then we can perform this as a
13329 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
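 // For example, <2,3,0,1,2,3,0,1> only needs the dword pairs (2,3) and (0,1),
 // so it becomes PSHUFLW <2,3,0,1> followed by PSHUFD <0,1,0,1>.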
13330 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13331 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13332 V = DAG.getNode(ShufWOp, DL, VT, V,
13333 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13334 V = DAG.getBitcast(PSHUFDVT, V);
13335 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13336 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13337 return DAG.getBitcast(VT, V);
13338 };
13339
13340 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13341 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13342 SmallVector<std::pair<int, int>, 4> DWordPairs;
13343 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13344
13345 // Collect the different DWORD pairs.
13346 for (int DWord = 0; DWord != 4; ++DWord) {
13347 int M0 = Mask[2 * DWord + 0];
13348 int M1 = Mask[2 * DWord + 1];
13349 M0 = (M0 >= 0 ? M0 % 4 : M0);
13350 M1 = (M1 >= 0 ? M1 % 4 : M1);
13351 if (M0 < 0 && M1 < 0)
13352 continue;
13353
13354 bool Match = false;
13355 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13356 auto &DWordPair = DWordPairs[j];
13357 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13358 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13359 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13360 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13361 PSHUFDMask[DWord] = DOffset + j;
13362 Match = true;
13363 break;
13364 }
13365 }
13366 if (!Match) {
13367 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13368 DWordPairs.push_back(std::make_pair(M0, M1));
13369 }
13370 }
13371
13372 if (DWordPairs.size() <= 2) {
13373 DWordPairs.resize(2, std::make_pair(-1, -1));
13374 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13375 DWordPairs[1].first, DWordPairs[1].second};
13376 if ((NumHToL + NumHToH) == 0)
13377 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13378 if ((NumLToL + NumLToH) == 0)
13379 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13380 }
13381 }
13382
13383 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13384 // such inputs we can swap two of the dwords across the half mark and end up
13385 // with <=2 inputs to each half in each half. Once there, we can fall through
13386 // to the generic code below. For example:
13387 //
13388 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13389 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13390 //
13391 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13392 // and an existing 2-into-2 on the other half. In this case we may have to
13393 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13394 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13395 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13396 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13397 // half than the one we target for fixing) will be fixed when we re-enter this
13398 // path. We will also combine away any sequence of PSHUFD instructions that
13399 // result into a single instruction. Here is an example of the tricky case:
13400 //
13401 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13402 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13403 //
13404 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13405 //
13406 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13407 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13408 //
13409 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13410 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13411 //
13412 // The result is fine to be handled by the generic logic.
13413 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13414 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13415 int AOffset, int BOffset) {
13416 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13417 "Must call this with A having 3 or 1 inputs from the A half.");
13418 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13419 "Must call this with B having 1 or 3 inputs from the B half.");
13420 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13421 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13422
13423 bool ThreeAInputs = AToAInputs.size() == 3;
13424
13425 // Compute the index of dword with only one word among the three inputs in
13426 // a half by taking the sum of the half with three inputs and subtracting
13427 // the sum of the actual three inputs. The difference is the remaining
13428 // slot.
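 // For example, if the three inputs are words {0, 1, 3} (offset 0), the sum
 // over the whole half is 0+1+2+3 = 6, so the remaining slot is 6 - 4 = 2
 // and the dword holding only one of the three inputs is dword 1.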
13429 int ADWord = 0, BDWord = 0;
13430 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13431 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13432 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13433 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13434 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13435 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13436 int TripleNonInputIdx =
13437 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13438 TripleDWord = TripleNonInputIdx / 2;
13439
13440 // We use xor with one to compute the adjacent DWord to whichever one the
13441 // OneInput is in.
13442 OneInputDWord = (OneInput / 2) ^ 1;
13443
13444 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13445 // and BToA inputs. If there is also such a problem with the BToB and AToB
13446 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13447 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13448 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13449 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13450 // Compute how many inputs will be flipped by swapping these DWords. We
13451 // need to balance this to ensure we don't form a 3-1 shuffle in the
13452 // other half.
13454 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13455 llvm::count(AToBInputs, 2 * ADWord + 1);
13456 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13457 llvm::count(BToBInputs, 2 * BDWord + 1);
13458 if ((NumFlippedAToBInputs == 1 &&
13459 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13460 (NumFlippedBToBInputs == 1 &&
13461 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13462 // We choose whether to fix the A half or B half based on whether that
13463 // half has zero flipped inputs. At zero, we may not be able to fix it
13464 // with that half. We also bias towards fixing the B half because that
13465 // will more commonly be the high half, and we have to bias one way.
13466 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13467 ArrayRef<int> Inputs) {
13468 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13469 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13470 // Determine whether the free index is in the flipped dword or the
13471 // unflipped dword based on where the pinned index is. We use this bit
13472 // in an xor to conditionally select the adjacent dword.
13473 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13474 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13475 if (IsFixIdxInput == IsFixFreeIdxInput)
13476 FixFreeIdx += 1;
13477 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13478 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13479 "We need to be changing the number of flipped inputs!");
13480 int PSHUFHalfMask[] = {0, 1, 2, 3};
13481 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13482 V = DAG.getNode(
13483 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13484 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13485 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13486
13487 for (int &M : Mask)
13488 if (M >= 0 && M == FixIdx)
13489 M = FixFreeIdx;
13490 else if (M >= 0 && M == FixFreeIdx)
13491 M = FixIdx;
13492 };
13493 if (NumFlippedBToBInputs != 0) {
13494 int BPinnedIdx =
13495 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13496 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13497 } else {
13498 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13499 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13500 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13501 }
13502 }
13503 }
13504
13505 int PSHUFDMask[] = {0, 1, 2, 3};
13506 PSHUFDMask[ADWord] = BDWord;
13507 PSHUFDMask[BDWord] = ADWord;
13508 V = DAG.getBitcast(
13509 VT,
13510 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13511 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13512
13513 // Adjust the mask to match the new locations of A and B.
13514 for (int &M : Mask)
13515 if (M >= 0 && M/2 == ADWord)
13516 M = 2 * BDWord + M % 2;
13517 else if (M >= 0 && M/2 == BDWord)
13518 M = 2 * ADWord + M % 2;
13519
13520 // Recurse back into this routine to re-compute state now that this isn't
13521 // a 3 and 1 problem.
13522 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13523 };
13524 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13525 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13526 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13527 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13528
13529 // At this point there are at most two inputs to the low and high halves from
13530 // each half. That means the inputs can always be grouped into dwords and
13531 // those dwords can then be moved to the correct half with a dword shuffle.
13532 // We use at most one low and one high word shuffle to collect these paired
13533 // inputs into dwords, and finally a dword shuffle to place them.
13534 int PSHUFLMask[4] = {-1, -1, -1, -1};
13535 int PSHUFHMask[4] = {-1, -1, -1, -1};
13536 int PSHUFDMask[4] = {-1, -1, -1, -1};
13537
13538 // First fix the masks for all the inputs that are staying in their
13539 // original halves. This will then dictate the targets of the cross-half
13540 // shuffles.
13541 auto fixInPlaceInputs =
13542 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13543 MutableArrayRef<int> SourceHalfMask,
13544 MutableArrayRef<int> HalfMask, int HalfOffset) {
13545 if (InPlaceInputs.empty())
13546 return;
13547 if (InPlaceInputs.size() == 1) {
13548 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13549 InPlaceInputs[0] - HalfOffset;
13550 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13551 return;
13552 }
13553 if (IncomingInputs.empty()) {
13554 // Just fix all of the in place inputs.
13555 for (int Input : InPlaceInputs) {
13556 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13557 PSHUFDMask[Input / 2] = Input / 2;
13558 }
13559 return;
13560 }
13561
13562 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13563 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13564 InPlaceInputs[0] - HalfOffset;
13565 // Put the second input next to the first so that they are packed into
13566 // a dword. We find the adjacent index by toggling the low bit.
13567 int AdjIndex = InPlaceInputs[0] ^ 1;
13568 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13569 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13570 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13571 };
13572 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13573 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13574
13575 // Now gather the cross-half inputs and place them into a free dword of
13576 // their target half.
13577 // FIXME: This operation could almost certainly be simplified dramatically to
13578 // look more like the 3-1 fixing operation.
13579 auto moveInputsToRightHalf = [&PSHUFDMask](
13580 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13581 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13582 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13583 int DestOffset) {
13584 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13585 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13586 };
13587 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13588 int Word) {
13589 int LowWord = Word & ~1;
13590 int HighWord = Word | 1;
13591 return isWordClobbered(SourceHalfMask, LowWord) ||
13592 isWordClobbered(SourceHalfMask, HighWord);
13593 };
13594
13595 if (IncomingInputs.empty())
13596 return;
13597
13598 if (ExistingInputs.empty()) {
13599 // Map any dwords with inputs from them into the right half.
13600 for (int Input : IncomingInputs) {
13601 // If the source half mask maps over the inputs, turn those into
13602 // swaps and use the swapped lane.
13603 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13604 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13605 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13606 Input - SourceOffset;
13607 // We have to swap the uses in our half mask in one sweep.
13608 for (int &M : HalfMask)
13609 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13610 M = Input;
13611 else if (M == Input)
13612 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13613 } else {
13614 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13615 Input - SourceOffset &&
13616 "Previous placement doesn't match!");
13617 }
13618 // Note that this correctly re-maps both when we do a swap and when
13619 // we observe the other side of the swap above. We rely on that to
13620 // avoid swapping the members of the input list directly.
13621 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13622 }
13623
13624 // Map the input's dword into the correct half.
13625 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13626 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13627 else
13628 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13629 Input / 2 &&
13630 "Previous placement doesn't match!");
13631 }
13632
13633 // And just directly shift any other-half mask elements to be same-half
13634 // as we will have mirrored the dword containing the element into the
13635 // same position within that half.
13636 for (int &M : HalfMask)
13637 if (M >= SourceOffset && M < SourceOffset + 4) {
13638 M = M - SourceOffset + DestOffset;
13639 assert(M >= 0 && "This should never wrap below zero!");
13640 }
13641 return;
13642 }
13643
13644 // Ensure we have the input in a viable dword of its current half. This
13645 // is particularly tricky because the original position may be clobbered
13646 // by inputs being moved and *staying* in that half.
13647 if (IncomingInputs.size() == 1) {
13648 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13649 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13650 SourceOffset;
13651 SourceHalfMask[InputFixed - SourceOffset] =
13652 IncomingInputs[0] - SourceOffset;
13653 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13654 InputFixed);
13655 IncomingInputs[0] = InputFixed;
13656 }
13657 } else if (IncomingInputs.size() == 2) {
13658 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13659 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13660 // We have two non-adjacent or clobbered inputs we need to extract from
13661 // the source half. To do this, we need to map them into some adjacent
13662 // dword slot in the source mask.
13663 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13664 IncomingInputs[1] - SourceOffset};
13665
13666 // If there is a free slot in the source half mask adjacent to one of
13667 // the inputs, place the other input in it. We use (Index XOR 1) to
13668 // compute an adjacent index.
13669 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13670 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13671 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13672 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13673 InputsFixed[1] = InputsFixed[0] ^ 1;
13674 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13675 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13676 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13677 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13678 InputsFixed[0] = InputsFixed[1] ^ 1;
13679 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13680 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13681 // The two inputs are in the same DWord but it is clobbered and the
13682 // adjacent DWord isn't used at all. Move both inputs to the free
13683 // slot.
13684 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13685 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13686 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13687 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13688 } else {
13689 // The only way we hit this point is if there is no clobbering
13690 // (because there are no off-half inputs to this half) and there is no
13691 // free slot adjacent to one of the inputs. In this case, we have to
13692 // swap an input with a non-input.
13693 for (int i = 0; i < 4; ++i)
13694 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13695 "We can't handle any clobbers here!");
13696 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13697 "Cannot have adjacent inputs here!");
13698
13699 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13700 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13701
13702 // We also have to update the final source mask in this case because
13703 // it may need to undo the above swap.
13704 for (int &M : FinalSourceHalfMask)
13705 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13706 M = InputsFixed[1] + SourceOffset;
13707 else if (M == InputsFixed[1] + SourceOffset)
13708 M = (InputsFixed[0] ^ 1) + SourceOffset;
13709
13710 InputsFixed[1] = InputsFixed[0] ^ 1;
13711 }
13712
13713 // Point everything at the fixed inputs.
13714 for (int &M : HalfMask)
13715 if (M == IncomingInputs[0])
13716 M = InputsFixed[0] + SourceOffset;
13717 else if (M == IncomingInputs[1])
13718 M = InputsFixed[1] + SourceOffset;
13719
13720 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
13721 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
13722 }
13723 } else {
13724 llvm_unreachable("Unhandled input size!");
13725 }
13726
13727 // Now hoist the DWord down to the right half.
13728 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
13729 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
13730 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
13731 for (int &M : HalfMask)
13732 for (int Input : IncomingInputs)
13733 if (M == Input)
13734 M = FreeDWord * 2 + Input % 2;
13735 };
13736 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
13737 /*SourceOffset*/ 4, /*DestOffset*/ 0);
13738 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
13739 /*SourceOffset*/ 0, /*DestOffset*/ 4);
13740
13741 // Now enact all the shuffles we've computed to move the inputs into their
13742 // target half.
13743 if (!isNoopShuffleMask(PSHUFLMask))
13744 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13745 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
13746 if (!isNoopShuffleMask(PSHUFHMask))
13747 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13748 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
13749 if (!isNoopShuffleMask(PSHUFDMask))
13750 V = DAG.getBitcast(
13751 VT,
13752 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13753 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13754
13755 // At this point, each half should contain all its inputs, and we can then
13756 // just shuffle them into their final position.
13757 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
13758 "Failed to lift all the high half inputs to the low mask!");
13759 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
13760 "Failed to lift all the low half inputs to the high mask!");
13761
13762 // Do a half shuffle for the low mask.
13763 if (!isNoopShuffleMask(LoMask))
13764 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13765 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13766
13767 // Do a half shuffle with the high mask after shifting its values down.
13768 for (int &M : HiMask)
13769 if (M >= 0)
13770 M -= 4;
13771 if (!isNoopShuffleMask(HiMask))
13772 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13773 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13774
13775 return V;
13776}
13777
13778/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
13779/// blend if only one input is used.
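///
/// For example, for a v8i16 shuffle (two bytes per element) a mask element of
/// 9 at output position 1 produces byte selectors 0x80 (zero) for V1 and
/// {2, 3} for V2 in byte positions 2 and 3; when both inputs end up used, the
/// two PSHUFB results are then ORed together.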
13780static SDValue lowerShuffleAsBlendOfPSHUFBs(
13781 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13782 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
13783 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
13784 "Lane crossing shuffle masks not supported");
13785
13786 int NumBytes = VT.getSizeInBits() / 8;
13787 int Size = Mask.size();
13788 int Scale = NumBytes / Size;
13789
13790 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13791 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
13792 V1InUse = false;
13793 V2InUse = false;
13794
13795 for (int i = 0; i < NumBytes; ++i) {
13796 int M = Mask[i / Scale];
13797 if (M < 0)
13798 continue;
13799
13800 const int ZeroMask = 0x80;
13801 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
13802 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
13803 if (Zeroable[i / Scale])
13804 V1Idx = V2Idx = ZeroMask;
13805
13806 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
13807 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
13808 V1InUse |= (ZeroMask != V1Idx);
13809 V2InUse |= (ZeroMask != V2Idx);
13810 }
13811
13812 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
13813 if (V1InUse)
13814 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
13815 DAG.getBuildVector(ShufVT, DL, V1Mask));
13816 if (V2InUse)
13817 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
13818 DAG.getBuildVector(ShufVT, DL, V2Mask));
13819
13820 // If we need shuffled inputs from both, blend the two.
13821 SDValue V;
13822 if (V1InUse && V2InUse)
13823 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
13824 else
13825 V = V1InUse ? V1 : V2;
13826
13827 // Cast the result back to the correct type.
13828 return DAG.getBitcast(VT, V);
13829}
13830
13831/// Generic lowering of 8-lane i16 shuffles.
13832///
13833/// This handles both single-input shuffles and combined shuffle/blends with
13834/// two inputs. The single input shuffles are immediately delegated to
13835/// a dedicated lowering routine.
13836///
13837/// The blends are lowered in one of three fundamental ways. If there are few
13838/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
13839/// of the input is significantly cheaper when lowered as an interleaving of
13840/// the two inputs, try to interleave them. Otherwise, blend the low and high
13841/// halves of the inputs separately (making them have relatively few inputs)
13842/// and then concatenate them.
13843static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13844 const APInt &Zeroable, SDValue V1, SDValue V2,
13845 const X86Subtarget &Subtarget,
13846 SelectionDAG &DAG) {
13847 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13848 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
13849 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13850
13851 // Whenever we can lower this as a zext, that instruction is strictly faster
13852 // than any alternative.
13853 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
13854 Zeroable, Subtarget, DAG))
13855 return ZExt;
13856
13857 // Try to lower using a truncation.
13858 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13859 Subtarget, DAG))
13860 return V;
13861
13862 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
13863
13864 if (NumV2Inputs == 0) {
13865 // Try to use shift instructions.
13866 if (SDValue Shift =
13867 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
13868 Subtarget, DAG, /*BitwiseOnly*/ false))
13869 return Shift;
13870
13871 // Check for being able to broadcast a single element.
13872 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
13873 Mask, Subtarget, DAG))
13874 return Broadcast;
13875
13876 // Try to use bit rotation instructions.
13877 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
13878 Subtarget, DAG))
13879 return Rotate;
13880
13881 // Use dedicated unpack instructions for masks that match their pattern.
13882 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13883 return V;
13884
13885 // Use dedicated pack instructions for masks that match their pattern.
13886 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13887 Subtarget))
13888 return V;
13889
13890 // Try to use byte rotation instructions.
13891 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
13892 Subtarget, DAG))
13893 return Rotate;
13894
13895 // Make a copy of the mask so it can be modified.
13896 SmallVector<int, 8> MutableMask(Mask);
13897 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
13898 Subtarget, DAG);
13899 }
13900
13901 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
13902 "All single-input shuffles should be canonicalized to be V1-input "
13903 "shuffles.");
13904
13905 // Try to use shift instructions.
13906 if (SDValue Shift =
13907 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
13908 DAG, /*BitwiseOnly*/ false))
13909 return Shift;
13910
13911 // See if we can use SSE4A Extraction / Insertion.
13912 if (Subtarget.hasSSE4A())
13913 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
13914 Zeroable, DAG))
13915 return V;
13916
13917 // There are special ways we can lower some single-element blends.
13918 if (NumV2Inputs == 1)
13919 if (SDValue V = lowerShuffleAsElementInsertion(
13920 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13921 return V;
13922
13923 // We have different paths for blend lowering, but they all must use the
13924 // *exact* same predicate.
13925 bool IsBlendSupported = Subtarget.hasSSE41();
13926 if (IsBlendSupported)
13927 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
13928 Zeroable, Subtarget, DAG))
13929 return Blend;
13930
13931 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
13932 Zeroable, Subtarget, DAG))
13933 return Masked;
13934
13935 // Use dedicated unpack instructions for masks that match their pattern.
13936 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
13937 return V;
13938
13939 // Use dedicated pack instructions for masks that match their pattern.
13940 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
13941 Subtarget))
13942 return V;
13943
13944 // Try to lower using a truncation.
13945 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
13946 Subtarget, DAG))
13947 return V;
13948
13949 // Try to use byte rotation instructions.
13950 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
13951 Subtarget, DAG))
13952 return Rotate;
13953
13954 if (SDValue BitBlend =
13955 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
13956 return BitBlend;
13957
13958 // Try to use byte shift instructions to mask.
13959 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
13960 Zeroable, Subtarget, DAG))
13961 return V;
13962
13963 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
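 // For example, <0,2,4,6,8,10,12,14> (every other word) can be lowered as a
 // PACKUSDW of the two inputs once the high 16 bits of every dword have been
 // cleared.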
13964 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
13965 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
13966 !Subtarget.hasVLX()) {
13967 // Check if this is part of a 256-bit vector truncation.
13968 unsigned PackOpc = 0;
13969 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
13970 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
13971 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13972 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
13973 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
13974 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
13975 DAG.getTargetConstant(0xEE, DL, MVT::i8));
13976 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
13977 V1 = extract128BitVector(V1V2, 0, DAG, DL);
13978 V2 = extract128BitVector(V1V2, 4, DAG, DL);
13979 PackOpc = X86ISD::PACKUS;
13980 } else if (Subtarget.hasSSE41()) {
13981 SmallVector<SDValue, 4> DWordClearOps(4,
13982 DAG.getConstant(0, DL, MVT::i32));
13983 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
13984 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
13985 SDValue DWordClearMask =
13986 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
13987 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
13988 DWordClearMask);
13989 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
13990 DWordClearMask);
13991 PackOpc = X86ISD::PACKUS;
13992 } else if (!Subtarget.hasSSSE3()) {
13993 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
13994 V1 = DAG.getBitcast(MVT::v4i32, V1);
13995 V2 = DAG.getBitcast(MVT::v4i32, V2);
13996 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
13997 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
13998 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
13999 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14000 PackOpc = X86ISD::PACKSS;
14001 }
14002 if (PackOpc) {
14003 // Now pack things back together.
14004 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14005 if (NumEvenDrops == 2) {
14006 Result = DAG.getBitcast(MVT::v4i32, Result);
14007 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14008 }
14009 return Result;
14010 }
14011 }
14012
14013 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14014 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14015 if (NumOddDrops == 1) {
14016 bool HasSSE41 = Subtarget.hasSSE41();
14017 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14018 DAG.getBitcast(MVT::v4i32, V1),
14019 DAG.getTargetConstant(16, DL, MVT::i8));
14020 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14021 DAG.getBitcast(MVT::v4i32, V2),
14022 DAG.getTargetConstant(16, DL, MVT::i8));
14023 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14024 MVT::v8i16, V1, V2);
14025 }
14026
14027 // Try to lower by permuting the inputs into an unpack instruction.
14028 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14029 Mask, Subtarget, DAG))
14030 return Unpack;
14031
14032 // If we can't directly blend but can use PSHUFB, that will be better as it
14033 // can both shuffle and set up the inefficient blend.
14034 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14035 bool V1InUse, V2InUse;
14036 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14037 Zeroable, DAG, V1InUse, V2InUse);
14038 }
14039
14040 // We can always bit-blend if we have to, so the fallback strategy is to
14041 // decompose into single-input permutes and blends/unpacks.
14042 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
14043 Mask, Subtarget, DAG);
14044}
14045
14046/// Lower 8-lane 16-bit floating point shuffles.
14047static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14048 const APInt &Zeroable, SDValue V1, SDValue V2,
14049 const X86Subtarget &Subtarget,
14050 SelectionDAG &DAG) {
14051 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14052 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14053 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14054 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14055
14056 if (Subtarget.hasFP16()) {
14057 if (NumV2Elements == 0) {
14058 // Check for being able to broadcast a single element.
14059 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14060 Mask, Subtarget, DAG))
14061 return Broadcast;
14062 }
14063 if (NumV2Elements == 1 && Mask[0] >= 8)
14064 if (SDValue V = lowerShuffleAsElementInsertion(
14065 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14066 return V;
14067 }
14068
14069 V1 = DAG.getBitcast(MVT::v8i16, V1);
14070 V2 = DAG.getBitcast(MVT::v8i16, V2);
14071 return DAG.getBitcast(MVT::v8f16,
14072 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14073}
14074
14075// Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14076// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14077// the active subvector is extracted.
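// For example, a binary v16i8 shuffle on a non-VLX AVX512VBMI target is
// widened to v64i8, and mask entries that referred to V2 (indices 16..31)
// are rebased by 48 so that they select from the second v64i8 operand of the
// VPERMV3 node.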
14078static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14079 ArrayRef<int> Mask, SDValue V1, SDValue V2,
14080 const X86Subtarget &Subtarget,
14081 SelectionDAG &DAG) {
14082 MVT MaskVT = VT.changeTypeToInteger();
14083 SDValue MaskNode;
14084 MVT ShuffleVT = VT;
14085 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14086 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14087 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14088 ShuffleVT = V1.getSimpleValueType();
14089
14090 // Adjust mask to correct indices for the second input.
14091 int NumElts = VT.getVectorNumElements();
14092 unsigned Scale = 512 / VT.getSizeInBits();
14093 SmallVector<int, 32> AdjustedMask(Mask);
14094 for (int &M : AdjustedMask)
14095 if (NumElts <= M)
14096 M += (Scale - 1) * NumElts;
14097 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14098 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14099 } else {
14100 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14101 }
14102
14103 SDValue Result;
14104 if (V2.isUndef())
14105 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14106 else
14107 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14108
14109 if (VT != ShuffleVT)
14110 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14111
14112 return Result;
14113}
14114
14115/// Generic lowering of v16i8 shuffles.
14116///
14117/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14118/// detect any complexity reducing interleaving. If that doesn't help, it uses
14119/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14120/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14121/// back together.
14122static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14123 const APInt &Zeroable, SDValue V1, SDValue V2,
14124 const X86Subtarget &Subtarget,
14125 SelectionDAG &DAG) {
14126 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14127 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14128 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14129
14130 // Try to use shift instructions.
14131 if (SDValue Shift =
14132 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14133 DAG, /*BitwiseOnly*/ false))
14134 return Shift;
14135
14136 // Try to use byte rotation instructions.
14137 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14138 Subtarget, DAG))
14139 return Rotate;
14140
14141 // Use dedicated pack instructions for masks that match their pattern.
14142 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
14143 Subtarget))
14144 return V;
14145
14146 // Try to use a zext lowering.
14147 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14148 Zeroable, Subtarget, DAG))
14149 return ZExt;
14150
14151 // Try to lower using a truncation.
14152 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14153 Subtarget, DAG))
14154 return V;
14155
14156 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14157 Subtarget, DAG))
14158 return V;
14159
14160 // See if we can use SSE4A Extraction / Insertion.
14161 if (Subtarget.hasSSE4A())
14162 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14163 Zeroable, DAG))
14164 return V;
14165
14166 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14167
14168 // For single-input shuffles, there are some nicer lowering tricks we can use.
14169 if (NumV2Elements == 0) {
14170 // Check for being able to broadcast a single element.
14171 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14172 Mask, Subtarget, DAG))
14173 return Broadcast;
14174
14175 // Try to use bit rotation instructions.
14176 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14177 Subtarget, DAG))
14178 return Rotate;
14179
14180 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14181 return V;
14182
14183 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14184 // Notably, this handles splat and partial-splat shuffles more efficiently.
14185 // However, it only makes sense if the pre-duplication shuffle simplifies
14186 // things significantly. Currently, this means we need to be able to
14187 // express the pre-duplication shuffle as an i16 shuffle.
14188 //
14189 // FIXME: We should check for other patterns which can be widened into an
14190 // i16 shuffle as well.
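 // For example, a splat of byte 3 keeps word 1 in place, unpacks the low
 // bytes so that one word holds two copies of byte 3, and then splats that
 // word with an i16 shuffle.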
14191 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14192 for (int i = 0; i < 16; i += 2)
14193 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14194 return false;
14195
14196 return true;
14197 };
14198 auto tryToWidenViaDuplication = [&]() -> SDValue {
14199 if (!canWidenViaDuplication(Mask))
14200 return SDValue();
14201 SmallVector<int, 4> LoInputs;
14202 copy_if(Mask, std::back_inserter(LoInputs),
14203 [](int M) { return M >= 0 && M < 8; });
14204 array_pod_sort(LoInputs.begin(), LoInputs.end());
14205 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
14206 LoInputs.end());
14207 SmallVector<int, 4> HiInputs;
14208 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14209 array_pod_sort(HiInputs.begin(), HiInputs.end());
14210 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
14211 HiInputs.end());
14212
14213 bool TargetLo = LoInputs.size() >= HiInputs.size();
14214 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14215 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14216
14217 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14218 SmallDenseMap<int, int, 8> LaneMap;
14219 for (int I : InPlaceInputs) {
14220 PreDupI16Shuffle[I/2] = I/2;
14221 LaneMap[I] = I;
14222 }
14223 int j = TargetLo ? 0 : 4, je = j + 4;
14224 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14225 // Check if j is already a shuffle of this input. This happens when
14226 // there are two adjacent bytes after we move the low one.
14227 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14228 // If we haven't yet mapped the input, search for a slot into which
14229 // we can map it.
14230 while (j < je && PreDupI16Shuffle[j] >= 0)
14231 ++j;
14232
14233 if (j == je)
14234 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14235 return SDValue();
14236
14237 // Map this input with the i16 shuffle.
14238 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14239 }
14240
14241 // Update the lane map based on the mapping we ended up with.
14242 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14243 }
14244 V1 = DAG.getBitcast(
14245 MVT::v16i8,
14246 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14247 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14248
14249 // Unpack the bytes to form the i16s that will be shuffled into place.
14250 bool EvenInUse = false, OddInUse = false;
14251 for (int i = 0; i < 16; i += 2) {
14252 EvenInUse |= (Mask[i + 0] >= 0);
14253 OddInUse |= (Mask[i + 1] >= 0);
14254 if (EvenInUse && OddInUse)
14255 break;
14256 }
14257 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14258 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14259 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14260
14261 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14262 for (int i = 0; i < 16; ++i)
14263 if (Mask[i] >= 0) {
14264 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14265 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14266 if (PostDupI16Shuffle[i / 2] < 0)
14267 PostDupI16Shuffle[i / 2] = MappedMask;
14268 else
14269 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14270 "Conflicting entries in the original shuffle!");
14271 }
14272 return DAG.getBitcast(
14273 MVT::v16i8,
14274 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14275 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14276 };
14277 if (SDValue V = tryToWidenViaDuplication())
14278 return V;
14279 }
14280
14281 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14282 Zeroable, Subtarget, DAG))
14283 return Masked;
14284
14285 // Use dedicated unpack instructions for masks that match their pattern.
14286 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
14287 return V;
14288
14289 // Try to use byte shift instructions to mask.
14290 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14291 Zeroable, Subtarget, DAG))
14292 return V;
14293
14294 // Check for compaction patterns.
14295 bool IsSingleInput = V2.isUndef();
14296 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14297
14298 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14299 // with PSHUFB. It is important to do this before we attempt to generate any
14300 // blends but after all of the single-input lowerings. If the single input
14301 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14302 // want to preserve that and we can DAG combine any longer sequences into
14303 // a PSHUFB in the end. But once we start blending from multiple inputs,
14304 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14305 // and there are *very* few patterns that would actually be faster than the
14306 // PSHUFB approach because of its ability to zero lanes.
14307 //
14308 // If the mask is a binary compaction, we can more efficiently perform this
14309 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14310 //
14311 // FIXME: The only exceptions to the above are blends which are exact
14312 // interleavings with direct instructions supporting them. We currently don't
14313 // handle those well here.
14314 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14315 bool V1InUse = false;
14316 bool V2InUse = false;
14317
14318 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14319 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14320
14321 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14322 // do so. This avoids using them to handle blends-with-zero which is
14323 // important as a single pshufb is significantly faster for that.
14324 if (V1InUse && V2InUse) {
14325 if (Subtarget.hasSSE41())
14326 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14327 Zeroable, Subtarget, DAG))
14328 return Blend;
14329
14330 // We can use an unpack to do the blending rather than an or in some
14331 // cases. Even though the or may be (very minorly) more efficient, we
14332     // prefer this lowering because there are common cases where part of
14333 // the complexity of the shuffles goes away when we do the final blend as
14334 // an unpack.
14335 // FIXME: It might be worth trying to detect if the unpack-feeding
14336 // shuffles will both be pshufb, in which case we shouldn't bother with
14337 // this.
14338       if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14339               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14340 return Unpack;
14341
14342 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14343 if (Subtarget.hasVBMI())
14344 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14345 DAG);
14346
14347 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14348 if (Subtarget.hasXOP()) {
14349 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14350 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14351 }
14352
14353 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14354 // PALIGNR will be cheaper than the second PSHUFB+OR.
14355       if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14356               DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14357 return V;
14358 }
14359
14360 return PSHUFB;
14361 }
14362
14363 // There are special ways we can lower some single-element blends.
14364 if (NumV2Elements == 1)
14365     if (SDValue V = lowerShuffleAsElementInsertion(
14366             DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14367 return V;
14368
14369 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14370 return Blend;
14371
14372 // Check whether a compaction lowering can be done. This handles shuffles
14373 // which take every Nth element for some even N. See the helper function for
14374 // details.
14375 //
14376 // We special case these as they can be particularly efficiently handled with
14377   // the PACKUSWB instruction on x86 and they show up in common patterns of
14378 // rearranging bytes to truncate wide elements.
14379 if (NumEvenDrops) {
14380     // NumEvenDrops is the log2 of the stride between the elements we keep (the
14381     // stride is 1 << NumEvenDrops). Another way of thinking about it is that we
14382     // need to drop the even elements this many times to get the original input.
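    // For example, a mask that keeps every 2nd byte has NumEvenDrops == 1:
    // every i16 word is ANDed with 0x00FF and a single PACKUS finishes the
    // job. A mask keeping every 4th byte has NumEvenDrops == 2: only every
    // other word keeps its low byte and the loop below adds a second PACKUS.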
14383
14384 // First we need to zero all the dropped bytes.
14385 assert(NumEvenDrops <= 3 &&
14386 "No support for dropping even elements more than 3 times.");
14387 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14388 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14389 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14390 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14391 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14392 WordClearMask);
14393 if (!IsSingleInput)
14394 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14395 WordClearMask);
14396
14397 // Now pack things back together.
14398 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14399 IsSingleInput ? V1 : V2);
14400 for (int i = 1; i < NumEvenDrops; ++i) {
14401 Result = DAG.getBitcast(MVT::v8i16, Result);
14402 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14403 }
14404 return Result;
14405 }
14406
14407 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14408 if (NumOddDrops == 1) {
14409 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14410 DAG.getBitcast(MVT::v8i16, V1),
14411 DAG.getTargetConstant(8, DL, MVT::i8));
14412 if (!IsSingleInput)
14413 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14414 DAG.getBitcast(MVT::v8i16, V2),
14415 DAG.getTargetConstant(8, DL, MVT::i8));
14416 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14417 IsSingleInput ? V1 : V2);
14418 }
14419
14420 // Handle multi-input cases by blending/unpacking single-input shuffles.
14421 if (NumV2Elements > 0)
14422 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14423 Subtarget, DAG);
14424
14425 // The fallback path for single-input shuffles widens this into two v8i16
14426 // vectors with unpacks, shuffles those, and then pulls them back together
14427 // with a pack.
14428 SDValue V = V1;
14429
14430 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14431 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14432 for (int i = 0; i < 16; ++i)
14433 if (Mask[i] >= 0)
14434 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14435
14436 SDValue VLoHalf, VHiHalf;
14437 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14438 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14439 // i16s.
14440 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14441 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14442 // Use a mask to drop the high bytes.
14443 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14444 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14445 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14446
14447 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14448 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14449
14450 // Squash the masks to point directly into VLoHalf.
14451 for (int &M : LoBlendMask)
14452 if (M >= 0)
14453 M /= 2;
14454 for (int &M : HiBlendMask)
14455 if (M >= 0)
14456 M /= 2;
14457 } else {
14458 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14459 // VHiHalf so that we can blend them as i16s.
14460 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14461
14462 VLoHalf = DAG.getBitcast(
14463 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14464 VHiHalf = DAG.getBitcast(
14465 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14466 }
14467
14468 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14469 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14470
14471 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14472}
14473
14474/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14475///
14476/// This routine breaks down the specific type of 128-bit shuffle and
14477/// dispatches to the lowering routines accordingly.
14478 static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14479                                   MVT VT, SDValue V1, SDValue V2,
14480 const APInt &Zeroable,
14481 const X86Subtarget &Subtarget,
14482 SelectionDAG &DAG) {
14483 if (VT == MVT::v8bf16) {
14484 V1 = DAG.getBitcast(MVT::v8i16, V1);
14485 V2 = DAG.getBitcast(MVT::v8i16, V2);
14486 return DAG.getBitcast(VT,
14487 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14488 }
14489
14490 switch (VT.SimpleTy) {
14491 case MVT::v2i64:
14492 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14493 case MVT::v2f64:
14494 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14495 case MVT::v4i32:
14496 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14497 case MVT::v4f32:
14498 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14499 case MVT::v8i16:
14500 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14501 case MVT::v8f16:
14502 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14503 case MVT::v16i8:
14504 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14505
14506 default:
14507 llvm_unreachable("Unimplemented!");
14508 }
14509}
14510
14511/// Generic routine to split vector shuffle into half-sized shuffles.
14512///
14513/// This routine just extracts two subvectors, shuffles them independently, and
14514/// then concatenates them back together. This should work effectively with all
14515/// AVX vector shuffle types.
14516 static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14517                                     SDValue V2, ArrayRef<int> Mask,
14518 SelectionDAG &DAG, bool SimpleOnly) {
14519 assert(VT.getSizeInBits() >= 256 &&
14520 "Only for 256-bit or wider vector shuffles!");
14521 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14522 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14523
14524 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14525 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14526
14527 int NumElements = VT.getVectorNumElements();
14528 int SplitNumElements = NumElements / 2;
14529 MVT ScalarVT = VT.getVectorElementType();
14530 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14531
14532 // Use splitVector/extractSubVector so that split build-vectors just build two
14533 // narrower build vectors. This helps shuffling with splats and zeros.
14534 auto SplitVector = [&](SDValue V) {
14535 SDValue LoV, HiV;
14536 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14537 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14538 DAG.getBitcast(SplitVT, HiV));
14539 };
14540
14541 SDValue LoV1, HiV1, LoV2, HiV2;
14542 std::tie(LoV1, HiV1) = SplitVector(V1);
14543 std::tie(LoV2, HiV2) = SplitVector(V2);
14544
14545 // Now create two 4-way blends of these half-width vectors.
14546 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14547 bool &UseHiV1, bool &UseLoV2,
14548 bool &UseHiV2) {
14549 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14550 for (int i = 0; i < SplitNumElements; ++i) {
14551 int M = HalfMask[i];
14552 if (M >= NumElements) {
14553 if (M >= NumElements + SplitNumElements)
14554 UseHiV2 = true;
14555 else
14556 UseLoV2 = true;
14557 } else if (M >= 0) {
14558 if (M >= SplitNumElements)
14559 UseHiV1 = true;
14560 else
14561 UseLoV1 = true;
14562 }
14563 }
14564 };
14565
14566 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14567 if (!SimpleOnly)
14568 return true;
14569
14570 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14571 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14572
14573 return !(UseHiV1 || UseHiV2);
14574 };
14575
14576 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14577 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14578 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14579 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14580 for (int i = 0; i < SplitNumElements; ++i) {
14581 int M = HalfMask[i];
14582 if (M >= NumElements) {
14583 V2BlendMask[i] = M - NumElements;
14584 BlendMask[i] = SplitNumElements + i;
14585 } else if (M >= 0) {
14586 V1BlendMask[i] = M;
14587 BlendMask[i] = i;
14588 }
14589 }
14590
14591 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14592 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14593
14594 // Because the lowering happens after all combining takes place, we need to
14595 // manually combine these blend masks as much as possible so that we create
14596 // a minimal number of high-level vector shuffle nodes.
14597 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14598
14599 // First try just blending the halves of V1 or V2.
14600 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14601 return DAG.getUNDEF(SplitVT);
14602 if (!UseLoV2 && !UseHiV2)
14603 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14604 if (!UseLoV1 && !UseHiV1)
14605 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14606
14607 SDValue V1Blend, V2Blend;
14608 if (UseLoV1 && UseHiV1) {
14609 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14610 } else {
14611 // We only use half of V1 so map the usage down into the final blend mask.
14612 V1Blend = UseLoV1 ? LoV1 : HiV1;
14613 for (int i = 0; i < SplitNumElements; ++i)
14614 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14615 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14616 }
14617 if (UseLoV2 && UseHiV2) {
14618 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14619 } else {
14620 // We only use half of V2 so map the usage down into the final blend mask.
14621 V2Blend = UseLoV2 ? LoV2 : HiV2;
14622 for (int i = 0; i < SplitNumElements; ++i)
14623 if (BlendMask[i] >= SplitNumElements)
14624 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14625 }
14626 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14627 };
14628
14629 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14630 return SDValue();
14631
14632 SDValue Lo = HalfBlend(LoMask);
14633 SDValue Hi = HalfBlend(HiMask);
14634 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14635}
14636
14637/// Either split a vector in halves or decompose the shuffles and the
14638/// blend/unpack.
14639///
14640/// This is provided as a good fallback for many lowerings of non-single-input
14641/// shuffles with more than one 128-bit lane. In those cases, we want to select
14642/// between splitting the shuffle into 128-bit components and stitching those
14643/// back together vs. extracting the single-input shuffles and blending those
14644/// results.
14645 static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14646                                           SDValue V2, ArrayRef<int> Mask,
14647 const X86Subtarget &Subtarget,
14648 SelectionDAG &DAG) {
14649 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14650 "shuffles as it could then recurse on itself.");
14651 int Size = Mask.size();
14652
14653 // If this can be modeled as a broadcast of two elements followed by a blend,
14654 // prefer that lowering. This is especially important because broadcasts can
14655 // often fold with memory operands.
14656 auto DoBothBroadcast = [&] {
14657 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14658 for (int M : Mask)
14659 if (M >= Size) {
14660 if (V2BroadcastIdx < 0)
14661 V2BroadcastIdx = M - Size;
14662 else if (M - Size != V2BroadcastIdx)
14663 return false;
14664 } else if (M >= 0) {
14665 if (V1BroadcastIdx < 0)
14666 V1BroadcastIdx = M;
14667 else if (M != V1BroadcastIdx)
14668 return false;
14669 }
14670 return true;
14671 };
14672 if (DoBothBroadcast())
14673 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14674 DAG);
14675
14676 // If the inputs all stem from a single 128-bit lane of each input, then we
14677 // split them rather than blending because the split will decompose to
14678 // unusually few instructions.
14679 int LaneCount = VT.getSizeInBits() / 128;
14680 int LaneSize = Size / LaneCount;
14681 SmallBitVector LaneInputs[2];
14682 LaneInputs[0].resize(LaneCount, false);
14683 LaneInputs[1].resize(LaneCount, false);
14684 for (int i = 0; i < Size; ++i)
14685 if (Mask[i] >= 0)
14686 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14687 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14688 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14689 /*SimpleOnly*/ false);
14690
14691 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14692 // requires that the decomposed single-input shuffles don't end up here.
14693 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
14694 DAG);
14695}
14696
14697// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14698// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14699 static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14700                                                  SDValue V1, SDValue V2,
14701 ArrayRef<int> Mask,
14702 SelectionDAG &DAG) {
14703 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
14704
14705 int LHSMask[4] = {-1, -1, -1, -1};
14706 int RHSMask[4] = {-1, -1, -1, -1};
14707 unsigned SHUFPMask = 0;
14708
14709 // As SHUFPD uses a single LHS/RHS element per lane, we can always
14710 // perform the shuffle once the lanes have been shuffled in place.
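  // E.g. for Mask <1, 5, 2, 7> this produces LHSMask <u, 1, 2, u>,
  // RHSMask <u, 5, u, 7> and SHUFPMask = 0b1011 (take the odd element of the
  // source pair for result elements 0, 1 and 3).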
14711 for (int i = 0; i != 4; ++i) {
14712 int M = Mask[i];
14713 if (M < 0)
14714 continue;
14715 int LaneBase = i & ~1;
14716 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
14717 LaneMask[LaneBase + (M & 1)] = M;
14718 SHUFPMask |= (M & 1) << i;
14719 }
14720
14721 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
14722 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
14723 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
14724 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
14725}
14726
14727/// Lower a vector shuffle crossing multiple 128-bit lanes as
14728/// a lane permutation followed by a per-lane permutation.
14729///
14730/// This is mainly for cases where we can have non-repeating permutes
14731/// in each lane.
14732///
14733/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
14734/// we should investigate merging them.
14735 static SDValue lowerShuffleAsLanePermuteAndPermute(
14736     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14737 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14738 int NumElts = VT.getVectorNumElements();
14739 int NumLanes = VT.getSizeInBits() / 128;
14740 int NumEltsPerLane = NumElts / NumLanes;
14741 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
14742
14743 /// Attempts to find a sublane permute with the given size
14744 /// that gets all elements into their target lanes.
14745 ///
14746 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
14747 /// If unsuccessful, returns false and may overwrite InLaneMask.
14748 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
14749 int NumSublanesPerLane = NumSublanes / NumLanes;
14750 int NumEltsPerSublane = NumElts / NumSublanes;
14751
14752 SmallVector<int, 16> CrossLaneMask;
14753 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
14754 // CrossLaneMask but one entry == one sublane.
14755 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
14756
14757 for (int i = 0; i != NumElts; ++i) {
14758 int M = Mask[i];
14759 if (M < 0)
14760 continue;
14761
14762 int SrcSublane = M / NumEltsPerSublane;
14763 int DstLane = i / NumEltsPerLane;
14764
14765 // We only need to get the elements into the right lane, not sublane.
14766 // So search all sublanes that make up the destination lane.
14767 bool Found = false;
14768 int DstSubStart = DstLane * NumSublanesPerLane;
14769 int DstSubEnd = DstSubStart + NumSublanesPerLane;
14770 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
14771 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
14772 continue;
14773
14774 Found = true;
14775 CrossLaneMaskLarge[DstSublane] = SrcSublane;
14776 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
14777 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
14778 break;
14779 }
14780 if (!Found)
14781 return SDValue();
14782 }
14783
14784 // Fill CrossLaneMask using CrossLaneMaskLarge.
14785 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
14786
14787 if (!CanUseSublanes) {
14788 // If we're only shuffling a single lowest lane and the rest are identity
14789 // then don't bother.
14790 // TODO - isShuffleMaskInputInPlace could be extended to something like
14791 // this.
14792 int NumIdentityLanes = 0;
14793 bool OnlyShuffleLowestLane = true;
14794 for (int i = 0; i != NumLanes; ++i) {
14795 int LaneOffset = i * NumEltsPerLane;
14796 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
14797 i * NumEltsPerLane))
14798 NumIdentityLanes++;
14799 else if (CrossLaneMask[LaneOffset] != 0)
14800 OnlyShuffleLowestLane = false;
14801 }
14802 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
14803 return SDValue();
14804 }
14805
14806 // Avoid returning the same shuffle operation. For example,
14807 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
14808 // undef:v16i16
14809 if (CrossLaneMask == Mask || InLaneMask == Mask)
14810 return SDValue();
14811
14812 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
14813 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
14814 InLaneMask);
14815 };
14816
14817 // First attempt a solution with full lanes.
14818 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
14819 return V;
14820
14821 // The rest of the solutions use sublanes.
14822 if (!CanUseSublanes)
14823 return SDValue();
14824
14825 // Then attempt a solution with 64-bit sublanes (vpermq).
14826 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
14827 return V;
14828
14829 // If that doesn't work and we have fast variable cross-lane shuffle,
14830 // attempt 32-bit sublanes (vpermd).
14831 if (!Subtarget.hasFastVariableCrossLaneShuffle())
14832 return SDValue();
14833
14834 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
14835}
14836
14837 /// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
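/// E.g. with Size == 4 and LaneSize == 2, the lane-crossing mask <2, 3, 0, 1>
/// becomes <4, 5, 6, 7>: each cross-lane element keeps its offset within its
/// source lane but is taken from the second shuffle operand (offset by Size),
/// which the caller then provides as a lane-flipped copy of the input.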
14838static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
14839 SmallVector<int> &InLaneMask) {
14840 int Size = Mask.size();
14841 InLaneMask.assign(Mask.begin(), Mask.end());
14842 for (int i = 0; i < Size; ++i) {
14843 int &M = InLaneMask[i];
14844 if (M < 0)
14845 continue;
14846 if (((M % Size) / LaneSize) != (i / LaneSize))
14847 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
14848 }
14849}
14850
14851/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
14852/// source with a lane permutation.
14853///
14854/// This lowering strategy results in four instructions in the worst case for a
14855/// single-input cross lane shuffle which is lower than any other fully general
14856/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
14857/// shuffle pattern should be handled prior to trying this lowering.
14858 static SDValue lowerShuffleAsLanePermuteAndShuffle(
14859     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14860 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
14861 // FIXME: This should probably be generalized for 512-bit vectors as well.
14862 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
14863 int Size = Mask.size();
14864 int LaneSize = Size / 2;
14865
14866 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14867 // Only do this if the elements aren't all from the lower lane,
14868 // otherwise we're (probably) better off doing a split.
14869 if (VT == MVT::v4f64 &&
14870 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
14871 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
14872
14873 // If there are only inputs from one 128-bit lane, splitting will in fact be
14874 // less expensive. The flags track whether the given lane contains an element
14875 // that crosses to another lane.
14876 bool AllLanes;
14877 if (!Subtarget.hasAVX2()) {
14878 bool LaneCrossing[2] = {false, false};
14879 for (int i = 0; i < Size; ++i)
14880 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
14881 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
14882 AllLanes = LaneCrossing[0] && LaneCrossing[1];
14883 } else {
14884 bool LaneUsed[2] = {false, false};
14885 for (int i = 0; i < Size; ++i)
14886 if (Mask[i] >= 0)
14887 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
14888 AllLanes = LaneUsed[0] && LaneUsed[1];
14889 }
14890
14891 // TODO - we could support shuffling V2 in the Flipped input.
14892 assert(V2.isUndef() &&
14893 "This last part of this routine only works on single input shuffles");
14894
14895 SmallVector<int> InLaneMask;
14896 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
14897
14898 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
14899 "In-lane shuffle mask expected");
14900
14901   // If only one 128-bit lane is used (or, pre-AVX2, crossed out of) and the
14902   // in-lane mask is not repeating, then we're better off splitting.
14903 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
14904 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14905 /*SimpleOnly*/ false);
14906
14907 // Flip the lanes, and shuffle the results which should now be in-lane.
14908 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
14909 SDValue Flipped = DAG.getBitcast(PVT, V1);
14910 Flipped =
14911 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
14912 Flipped = DAG.getBitcast(VT, Flipped);
14913 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
14914}
14915
14916/// Handle lowering 2-lane 128-bit shuffles.
14917 static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
14918                                   SDValue V2, ArrayRef<int> Mask,
14919 const APInt &Zeroable,
14920 const X86Subtarget &Subtarget,
14921 SelectionDAG &DAG) {
14922 if (V2.isUndef()) {
14923 // Attempt to match VBROADCAST*128 subvector broadcast load.
14924 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
14925 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
14926     if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
14927         X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
14928 MVT MemVT = VT.getHalfNumVectorElementsVT();
14929 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
14930 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
14931       if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
14932                                              VT, MemVT, Ld, Ofs, DAG))
14933 return BcstLd;
14934 }
14935
14936 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
14937 if (Subtarget.hasAVX2())
14938 return SDValue();
14939 }
14940
14941 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
14942
14943 SmallVector<int, 4> WidenedMask;
14944 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
14945 return SDValue();
14946
14947 bool IsLowZero = (Zeroable & 0x3) == 0x3;
14948 bool IsHighZero = (Zeroable & 0xc) == 0xc;
14949
14950 // Try to use an insert into a zero vector.
14951 if (WidenedMask[0] == 0 && IsHighZero) {
14952 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14953 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14954 DAG.getIntPtrConstant(0, DL));
14955 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14956 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14957 DAG.getIntPtrConstant(0, DL));
14958 }
14959
14960 // TODO: If minimizing size and one of the inputs is a zero vector and the
14961   // zero vector has only one use, we could use a VPERM2X128 to save the
14962 // instruction bytes needed to explicitly generate the zero vector.
14963
14964 // Blends are faster and handle all the non-lane-crossing cases.
14965 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
14966 Subtarget, DAG))
14967 return Blend;
14968
14969 // If either input operand is a zero vector, use VPERM2X128 because its mask
14970 // allows us to replace the zero input with an implicit zero.
14971 if (!IsLowZero && !IsHighZero) {
14972 // Check for patterns which can be matched with a single insert of a 128-bit
14973 // subvector.
14974 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
14975 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
14976
14977 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
14978 // this will likely become vinsertf128 which can't fold a 256-bit memop.
14979 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
14980 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14981 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14982 OnlyUsesV1 ? V1 : V2,
14983 DAG.getIntPtrConstant(0, DL));
14984 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14985 DAG.getIntPtrConstant(2, DL));
14986 }
14987 }
14988
14989 // Try to use SHUF128 if possible.
14990 if (Subtarget.hasVLX()) {
14991 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
14992 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
14993 ((WidenedMask[1] % 2) << 1);
14994 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
14995 DAG.getTargetConstant(PermMask, DL, MVT::i8));
14996 }
14997 }
14998 }
14999
15000 // Otherwise form a 128-bit permutation. After accounting for undefs,
15001 // convert the 64-bit shuffle mask selection values into 128-bit
15002 // selection bits by dividing the indexes by 2 and shifting into positions
15003 // defined by a vperm2*128 instruction's immediate control byte.
15004
15005 // The immediate permute control byte looks like this:
15006 // [1:0] - select 128 bits from sources for low half of destination
15007 // [2] - ignore
15008 // [3] - zero low half of destination
15009 // [5:4] - select 128 bits from sources for high half of destination
15010 // [6] - ignore
15011 // [7] - zero high half of destination
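  // E.g. WidenedMask = {1, 2} (64-bit mask <2, 3, 4, 5>) encodes as
  // PermMask = (1 << 0) | (2 << 4) = 0x21: V1's high half feeds the low half
  // of the result and V2's low half feeds the high half. A zeroable half
  // sets bit 3 (0x08) or bit 7 (0x80) instead.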
15012
15013 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15014 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15015
15016 unsigned PermMask = 0;
15017 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15018 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15019
15020 // Check the immediate mask and replace unused sources with undef.
15021 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15022 V1 = DAG.getUNDEF(VT);
15023 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15024 V2 = DAG.getUNDEF(VT);
15025
15026 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15027 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15028}
15029
15030/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15031/// shuffling each lane.
15032///
15033/// This attempts to create a repeated lane shuffle where each lane uses one
15034/// or two of the lanes of the inputs. The lanes of the input vectors are
15035/// shuffled in one or two independent shuffles to get the lanes into the
15036/// position needed by the final shuffle.
15037 static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15038     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15039 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15040 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15041
15042 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15043 return SDValue();
15044
15045 int NumElts = Mask.size();
15046 int NumLanes = VT.getSizeInBits() / 128;
15047 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15048 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15049 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15050
15051 // First pass will try to fill in the RepeatMask from lanes that need two
15052 // sources.
15053 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15054 int Srcs[2] = {-1, -1};
15055 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15056 for (int i = 0; i != NumLaneElts; ++i) {
15057 int M = Mask[(Lane * NumLaneElts) + i];
15058 if (M < 0)
15059 continue;
15060 // Determine which of the possible input lanes (NumLanes from each source)
15061 // this element comes from. Assign that as one of the sources for this
15062       // lane. We can assign up to 2 sources for this lane. If we run out of
15063 // sources we can't do anything.
15064 int LaneSrc = M / NumLaneElts;
15065 int Src;
15066 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15067 Src = 0;
15068 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15069 Src = 1;
15070 else
15071 return SDValue();
15072
15073 Srcs[Src] = LaneSrc;
15074 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15075 }
15076
15077 // If this lane has two sources, see if it fits with the repeat mask so far.
15078 if (Srcs[1] < 0)
15079 continue;
15080
15081 LaneSrcs[Lane][0] = Srcs[0];
15082 LaneSrcs[Lane][1] = Srcs[1];
15083
15084 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15085 assert(M1.size() == M2.size() && "Unexpected mask size");
15086 for (int i = 0, e = M1.size(); i != e; ++i)
15087 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15088 return false;
15089 return true;
15090 };
15091
15092 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15093 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15094 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15095 int M = Mask[i];
15096 if (M < 0)
15097 continue;
15098 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15099 "Unexpected mask element");
15100 MergedMask[i] = M;
15101 }
15102 };
15103
15104 if (MatchMasks(InLaneMask, RepeatMask)) {
15105 // Merge this lane mask into the final repeat mask.
15106 MergeMasks(InLaneMask, RepeatMask);
15107 continue;
15108 }
15109
15110 // Didn't find a match. Swap the operands and try again.
15111 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15112     ShuffleVectorSDNode::commuteShuffleMask(InLaneMask, NumElts);
15113 
15114 if (MatchMasks(InLaneMask, RepeatMask)) {
15115 // Merge this lane mask into the final repeat mask.
15116 MergeMasks(InLaneMask, RepeatMask);
15117 continue;
15118 }
15119
15120 // Couldn't find a match with the operands in either order.
15121 return SDValue();
15122 }
15123
15124 // Now handle any lanes with only one source.
15125 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15126 // If this lane has already been processed, skip it.
15127 if (LaneSrcs[Lane][0] >= 0)
15128 continue;
15129
15130 for (int i = 0; i != NumLaneElts; ++i) {
15131 int M = Mask[(Lane * NumLaneElts) + i];
15132 if (M < 0)
15133 continue;
15134
15135 // If RepeatMask isn't defined yet we can define it ourself.
15136 if (RepeatMask[i] < 0)
15137 RepeatMask[i] = M % NumLaneElts;
15138
15139 if (RepeatMask[i] < NumElts) {
15140 if (RepeatMask[i] != M % NumLaneElts)
15141 return SDValue();
15142 LaneSrcs[Lane][0] = M / NumLaneElts;
15143 } else {
15144 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15145 return SDValue();
15146 LaneSrcs[Lane][1] = M / NumLaneElts;
15147 }
15148 }
15149
15150 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15151 return SDValue();
15152 }
15153
15154 SmallVector<int, 16> NewMask(NumElts, -1);
15155 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15156 int Src = LaneSrcs[Lane][0];
15157 for (int i = 0; i != NumLaneElts; ++i) {
15158 int M = -1;
15159 if (Src >= 0)
15160 M = Src * NumLaneElts + i;
15161 NewMask[Lane * NumLaneElts + i] = M;
15162 }
15163 }
15164 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15165 // Ensure we didn't get back the shuffle we started with.
15166 // FIXME: This is a hack to make up for some splat handling code in
15167 // getVectorShuffle.
15168 if (isa<ShuffleVectorSDNode>(NewV1) &&
15169 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15170 return SDValue();
15171
15172 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15173 int Src = LaneSrcs[Lane][1];
15174 for (int i = 0; i != NumLaneElts; ++i) {
15175 int M = -1;
15176 if (Src >= 0)
15177 M = Src * NumLaneElts + i;
15178 NewMask[Lane * NumLaneElts + i] = M;
15179 }
15180 }
15181 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15182 // Ensure we didn't get back the shuffle we started with.
15183 // FIXME: This is a hack to make up for some splat handling code in
15184 // getVectorShuffle.
15185 if (isa<ShuffleVectorSDNode>(NewV2) &&
15186 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15187 return SDValue();
15188
15189 for (int i = 0; i != NumElts; ++i) {
15190 if (Mask[i] < 0) {
15191 NewMask[i] = -1;
15192 continue;
15193 }
15194 NewMask[i] = RepeatMask[i % NumLaneElts];
15195 if (NewMask[i] < 0)
15196 continue;
15197
15198 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15199 }
15200 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15201}
15202
15203/// If the input shuffle mask results in a vector that is undefined in all upper
15204/// or lower half elements and that mask accesses only 2 halves of the
15205/// shuffle's operands, return true. A mask of half the width with mask indexes
15206/// adjusted to access the extracted halves of the original shuffle operands is
15207 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half vector is
15208 /// accessed (0 = lower V1, 1 = upper V1, 2 = lower V2, 3 = upper V2).
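/// E.g. for a v8 mask <u, u, u, u, 1, 9, 2, 10> the lower half of the result
/// is undef, HalfMask becomes <1, 5, 2, 6>, HalfIdx1 == 0 (lower half of V1)
/// and HalfIdx2 == 2 (lower half of V2).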
15209static bool
15210 getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15211                     int &HalfIdx1, int &HalfIdx2) {
15212 assert((Mask.size() == HalfMask.size() * 2) &&
15213 "Expected input mask to be twice as long as output");
15214
15215 // Exactly one half of the result must be undef to allow narrowing.
15216 bool UndefLower = isUndefLowerHalf(Mask);
15217 bool UndefUpper = isUndefUpperHalf(Mask);
15218 if (UndefLower == UndefUpper)
15219 return false;
15220
15221 unsigned HalfNumElts = HalfMask.size();
15222 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15223 HalfIdx1 = -1;
15224 HalfIdx2 = -1;
15225 for (unsigned i = 0; i != HalfNumElts; ++i) {
15226 int M = Mask[i + MaskIndexOffset];
15227 if (M < 0) {
15228 HalfMask[i] = M;
15229 continue;
15230 }
15231
15232 // Determine which of the 4 half vectors this element is from.
15233 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15234 int HalfIdx = M / HalfNumElts;
15235
15236 // Determine the element index into its half vector source.
15237 int HalfElt = M % HalfNumElts;
15238
15239 // We can shuffle with up to 2 half vectors, set the new 'half'
15240 // shuffle mask accordingly.
15241 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15242 HalfMask[i] = HalfElt;
15243 HalfIdx1 = HalfIdx;
15244 continue;
15245 }
15246 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15247 HalfMask[i] = HalfElt + HalfNumElts;
15248 HalfIdx2 = HalfIdx;
15249 continue;
15250 }
15251
15252 // Too many half vectors referenced.
15253 return false;
15254 }
15255
15256 return true;
15257}
15258
15259/// Given the output values from getHalfShuffleMask(), create a half width
15260/// shuffle of extracted vectors followed by an insert back to full width.
15261 static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15262                                      ArrayRef<int> HalfMask, int HalfIdx1,
15263 int HalfIdx2, bool UndefLower,
15264 SelectionDAG &DAG, bool UseConcat = false) {
15265 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15266 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15267
15268 MVT VT = V1.getSimpleValueType();
15269 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15270 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15271
15272 auto getHalfVector = [&](int HalfIdx) {
15273 if (HalfIdx < 0)
15274 return DAG.getUNDEF(HalfVT);
15275 SDValue V = (HalfIdx < 2 ? V1 : V2);
15276 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15277 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15278 DAG.getIntPtrConstant(HalfIdx, DL));
15279 };
15280
15281 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15282 SDValue Half1 = getHalfVector(HalfIdx1);
15283 SDValue Half2 = getHalfVector(HalfIdx2);
15284 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15285 if (UseConcat) {
15286 SDValue Op0 = V;
15287 SDValue Op1 = DAG.getUNDEF(HalfVT);
15288 if (UndefLower)
15289 std::swap(Op0, Op1);
15290 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15291 }
15292
15293 unsigned Offset = UndefLower ? HalfNumElts : 0;
15294 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15295                      DAG.getIntPtrConstant(Offset, DL));
15296 }
15297
15298/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15299/// This allows for fast cases such as subvector extraction/insertion
15300/// or shuffling smaller vector types which can lower more efficiently.
15301 static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15302                                          SDValue V2, ArrayRef<int> Mask,
15303 const X86Subtarget &Subtarget,
15304 SelectionDAG &DAG) {
15305 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15306 "Expected 256-bit or 512-bit vector");
15307
15308 bool UndefLower = isUndefLowerHalf(Mask);
15309 if (!UndefLower && !isUndefUpperHalf(Mask))
15310 return SDValue();
15311
15312 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15313 "Completely undef shuffle mask should have been simplified already");
15314
15315 // Upper half is undef and lower half is whole upper subvector.
15316 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15317 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15318 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15319 if (!UndefLower &&
15320 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15321 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15322 DAG.getIntPtrConstant(HalfNumElts, DL));
15323 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15324 DAG.getIntPtrConstant(0, DL));
15325 }
15326
15327 // Lower half is undef and upper half is whole lower subvector.
15328 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15329 if (UndefLower &&
15330 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15331 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15332 DAG.getIntPtrConstant(0, DL));
15333 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15334 DAG.getIntPtrConstant(HalfNumElts, DL));
15335 }
15336
15337 int HalfIdx1, HalfIdx2;
15338 SmallVector<int, 8> HalfMask(HalfNumElts);
15339 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15340 return SDValue();
15341
15342 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15343
15344 // Only shuffle the halves of the inputs when useful.
15345 unsigned NumLowerHalves =
15346 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15347 unsigned NumUpperHalves =
15348 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15349 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15350
15351 // Determine the larger pattern of undef/halves, then decide if it's worth
15352 // splitting the shuffle based on subtarget capabilities and types.
15353 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15354 if (!UndefLower) {
15355 // XXXXuuuu: no insert is needed.
15356 // Always extract lowers when setting lower - these are all free subreg ops.
15357 if (NumUpperHalves == 0)
15358 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15359 UndefLower, DAG);
15360
15361 if (NumUpperHalves == 1) {
15362 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15363 if (Subtarget.hasAVX2()) {
15364 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15365 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15366 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15367 (!isSingleSHUFPSMask(HalfMask) ||
15368 Subtarget.hasFastVariableCrossLaneShuffle()))
15369 return SDValue();
15370 // If this is a unary shuffle (assume that the 2nd operand is
15371 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15372 // are better off extracting the upper half of 1 operand and using a
15373 // narrow shuffle.
15374 if (EltWidth == 64 && V2.isUndef())
15375 return SDValue();
15376 }
15377 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15378 if (Subtarget.hasAVX512() && VT.is512BitVector())
15379 return SDValue();
15380 // Extract + narrow shuffle is better than the wide alternative.
15381 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15382 UndefLower, DAG);
15383 }
15384
15385 // Don't extract both uppers, instead shuffle and then extract.
15386 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15387 return SDValue();
15388 }
15389
15390 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15391 if (NumUpperHalves == 0) {
15392 // AVX2 has efficient 64-bit element cross-lane shuffles.
15393 // TODO: Refine to account for unary shuffle, splat, and other masks?
15394 if (Subtarget.hasAVX2() && EltWidth == 64)
15395 return SDValue();
15396 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15397 if (Subtarget.hasAVX512() && VT.is512BitVector())
15398 return SDValue();
15399 // Narrow shuffle + insert is better than the wide alternative.
15400 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15401 UndefLower, DAG);
15402 }
15403
15404 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15405 return SDValue();
15406}
15407
15408/// Handle case where shuffle sources are coming from the same 128-bit lane and
15409/// every lane can be represented as the same repeating mask - allowing us to
15410/// shuffle the sources with the repeating shuffle and then permute the result
15411/// to the destination lanes.
15412 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15413     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15414 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15415 int NumElts = VT.getVectorNumElements();
15416 int NumLanes = VT.getSizeInBits() / 128;
15417 int NumLaneElts = NumElts / NumLanes;
15418
15419 // On AVX2 we may be able to just shuffle the lowest elements and then
15420 // broadcast the result.
15421 if (Subtarget.hasAVX2()) {
15422 for (unsigned BroadcastSize : {16, 32, 64}) {
15423 if (BroadcastSize <= VT.getScalarSizeInBits())
15424 continue;
15425 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15426
15427 // Attempt to match a repeating pattern every NumBroadcastElts,
15428       // accounting for UNDEFs, but only referencing the lowest 128-bit
15429 // lane of the inputs.
15430 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15431 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15432 for (int j = 0; j != NumBroadcastElts; ++j) {
15433 int M = Mask[i + j];
15434 if (M < 0)
15435 continue;
15436 int &R = RepeatMask[j];
15437 if (0 != ((M % NumElts) / NumLaneElts))
15438 return false;
15439 if (0 <= R && R != M)
15440 return false;
15441 R = M;
15442 }
15443 return true;
15444 };
15445
15446 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15447 if (!FindRepeatingBroadcastMask(RepeatMask))
15448 continue;
15449
15450 // Shuffle the (lowest) repeated elements in place for broadcast.
15451 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15452
15453 // Shuffle the actual broadcast.
15454 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15455 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15456 for (int j = 0; j != NumBroadcastElts; ++j)
15457 BroadcastMask[i + j] = j;
15458
15459 // Avoid returning the same shuffle operation. For example,
15460 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15461 if (BroadcastMask == Mask)
15462 return SDValue();
15463
15464 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15465 BroadcastMask);
15466 }
15467 }
15468
15469 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15470 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15471 return SDValue();
15472
15473 // Bail if we already have a repeated lane shuffle mask.
15474 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15475 return SDValue();
15476
15477   // Helper to look for a repeated mask in each split sub-lane, checking that
15478   // those sub-lanes can then be permuted into place.
15479 auto ShuffleSubLanes = [&](int SubLaneScale) {
15480 int NumSubLanes = NumLanes * SubLaneScale;
15481 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15482
15483 // Check that all the sources are coming from the same lane and see if we
15484 // can form a repeating shuffle mask (local to each sub-lane). At the same
15485 // time, determine the source sub-lane for each destination sub-lane.
15486 int TopSrcSubLane = -1;
15487 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15488 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15489 SubLaneScale,
15490 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15491
15492 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15493 // Extract the sub-lane mask, check that it all comes from the same lane
15494 // and normalize the mask entries to come from the first lane.
15495 int SrcLane = -1;
15496 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15497 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15498 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15499 if (M < 0)
15500 continue;
15501 int Lane = (M % NumElts) / NumLaneElts;
15502 if ((0 <= SrcLane) && (SrcLane != Lane))
15503 return SDValue();
15504 SrcLane = Lane;
15505 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15506 SubLaneMask[Elt] = LocalM;
15507 }
15508
15509 // Whole sub-lane is UNDEF.
15510 if (SrcLane < 0)
15511 continue;
15512
15513 // Attempt to match against the candidate repeated sub-lane masks.
15514 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15515 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15516 for (int i = 0; i != NumSubLaneElts; ++i) {
15517 if (M1[i] < 0 || M2[i] < 0)
15518 continue;
15519 if (M1[i] != M2[i])
15520 return false;
15521 }
15522 return true;
15523 };
15524
15525 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15526 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15527 continue;
15528
15529 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15530 for (int i = 0; i != NumSubLaneElts; ++i) {
15531 int M = SubLaneMask[i];
15532 if (M < 0)
15533 continue;
15534 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15535 "Unexpected mask element");
15536 RepeatedSubLaneMask[i] = M;
15537 }
15538
15539 // Track the top most source sub-lane - by setting the remaining to
15540 // UNDEF we can greatly simplify shuffle matching.
15541 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15542 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15543 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15544 break;
15545 }
15546
15547 // Bail if we failed to find a matching repeated sub-lane mask.
15548 if (Dst2SrcSubLanes[DstSubLane] < 0)
15549 return SDValue();
15550 }
15551 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15552 "Unexpected source lane");
15553
15554 // Create a repeating shuffle mask for the entire vector.
15555 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15556 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15557 int Lane = SubLane / SubLaneScale;
15558 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15559 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15560 int M = RepeatedSubLaneMask[Elt];
15561 if (M < 0)
15562 continue;
15563 int Idx = (SubLane * NumSubLaneElts) + Elt;
15564 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15565 }
15566 }
15567
15568 // Shuffle each source sub-lane to its destination.
15569 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15570 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15571 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15572 if (SrcSubLane < 0)
15573 continue;
15574 for (int j = 0; j != NumSubLaneElts; ++j)
15575 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15576 }
15577
15578 // Avoid returning the same shuffle operation.
15579 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15580 if (RepeatedMask == Mask || SubLaneMask == Mask)
15581 return SDValue();
15582
15583 SDValue RepeatedShuffle =
15584 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15585
15586 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15587 SubLaneMask);
15588 };
15589
15590 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15591 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15592 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15593 // Otherwise we can only permute whole 128-bit lanes.
15594 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15595 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15596 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15597 MinSubLaneScale = 2;
15598 MaxSubLaneScale =
15599 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15600 }
15601 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15602 MinSubLaneScale = MaxSubLaneScale = 4;
15603
15604 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15605 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15606 return Shuffle;
15607
15608 return SDValue();
15609}
15610
15611 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15612                                    bool &ForceV1Zero, bool &ForceV2Zero,
15613 unsigned &ShuffleImm, ArrayRef<int> Mask,
15614 const APInt &Zeroable) {
15615 int NumElts = VT.getVectorNumElements();
15616 assert(VT.getScalarSizeInBits() == 64 &&
15617 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15618 "Unexpected data type for VSHUFPD");
15619 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15620 "Illegal shuffle mask");
15621
15622 bool ZeroLane[2] = { true, true };
15623 for (int i = 0; i < NumElts; ++i)
15624 ZeroLane[i & 1] &= Zeroable[i];
15625
15626 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15627   // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
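  // E.g. for v4f64 the mask <0, 5, 2, 7> matches the V4F64 pattern above, so
  // ShufpdMask stays true and ShuffleImm becomes 0b1010: bit i is set when
  // result element i takes the odd element of its source pair.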
15628 ShuffleImm = 0;
15629 bool ShufpdMask = true;
15630 bool CommutableMask = true;
15631 for (int i = 0; i < NumElts; ++i) {
15632 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15633 continue;
15634 if (Mask[i] < 0)
15635 return false;
15636 int Val = (i & 6) + NumElts * (i & 1);
15637 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15638 if (Mask[i] < Val || Mask[i] > Val + 1)
15639 ShufpdMask = false;
15640 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15641 CommutableMask = false;
15642 ShuffleImm |= (Mask[i] % 2) << i;
15643 }
15644
15645 if (!ShufpdMask && !CommutableMask)
15646 return false;
15647
15648 if (!ShufpdMask && CommutableMask)
15649 std::swap(V1, V2);
15650
15651 ForceV1Zero = ZeroLane[0];
15652 ForceV2Zero = ZeroLane[1];
15653 return true;
15654}
15655
15656 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15657                                       SDValue V2, ArrayRef<int> Mask,
15658 const APInt &Zeroable,
15659 const X86Subtarget &Subtarget,
15660 SelectionDAG &DAG) {
15661 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15662 "Unexpected data type for VSHUFPD");
15663
15664 unsigned Immediate = 0;
15665 bool ForceV1Zero = false, ForceV2Zero = false;
15666 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15667 Mask, Zeroable))
15668 return SDValue();
15669
15670 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15671 if (ForceV1Zero)
15672 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15673 if (ForceV2Zero)
15674 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15675
15676 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15677 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15678}
15679
15680 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
15681 // by zeroable elements in the remaining 24 elements. Turn this into two
15682// vmovqb instructions shuffled together.
15683 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15684                                              SDValue V1, SDValue V2,
15685 ArrayRef<int> Mask,
15686 const APInt &Zeroable,
15687 SelectionDAG &DAG) {
15688 assert(VT == MVT::v32i8 && "Unexpected type!");
15689
15690 // The first 8 indices should be every 8th element.
15691 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15692 return SDValue();
15693
15694 // Remaining elements need to be zeroable.
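  // (Zeroable has one bit per mask element, so countl_one() counts how many of
  // the highest-numbered elements are zeroable; we need at least the last 24.)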
15695 if (Zeroable.countl_one() < (Mask.size() - 8))
15696 return SDValue();
15697
15698 V1 = DAG.getBitcast(MVT::v4i64, V1);
15699 V2 = DAG.getBitcast(MVT::v4i64, V2);
15700
15701 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
15702 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
15703
15704 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
15705 // the upper bits of the result using an unpckldq.
15706 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
15707 { 0, 1, 2, 3, 16, 17, 18, 19,
15708 4, 5, 6, 7, 20, 21, 22, 23 });
15709 // Insert the unpckldq into a zero vector to widen to v32i8.
15710 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
15711 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
15712 DAG.getIntPtrConstant(0, DL));
15713}
15714
15715// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
15716// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
15717// =>
15718// ul = unpckl v1, v2
15719// uh = unpckh v1, v2
15720// a = vperm ul, uh
15721// b = vperm ul, uh
15722//
15723// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
15724// and permute. We cannot directly match v3 because it is split into two
15725// 256-bit vectors in earlier isel stages. Therefore, this function matches a
15726// pair of 256-bit shuffles and makes sure the masks are consecutive.
15727//
15728// Once unpck and permute nodes are created, the permute corresponding to this
15729// shuffle is returned, while the other permute replaces the other half of the
15730// shuffle in the selection dag.
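// E.g. for v8i32 the two half shuffles being matched are
// <0, 8, 1, 9, 2, 10, 3, 11> and <4, 12, 5, 13, 6, 14, 7, 15>, which together
// form a full interleave of V1 and V2.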
15731 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
15732                                              SDValue V1, SDValue V2,
15733 ArrayRef<int> Mask,
15734 SelectionDAG &DAG) {
15735 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
15736 VT != MVT::v32i8)
15737 return SDValue();
15738 // <B0, B1, B0+1, B1+1, ..., >
15739 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
15740 unsigned Begin1) {
15741 size_t Size = Mask.size();
15742 assert(Size % 2 == 0 && "Expected even mask size");
15743 for (unsigned I = 0; I < Size; I += 2) {
15744 if (Mask[I] != (int)(Begin0 + I / 2) ||
15745 Mask[I + 1] != (int)(Begin1 + I / 2))
15746 return false;
15747 }
15748 return true;
15749 };
15750 // Check which half is this shuffle node
15751 int NumElts = VT.getVectorNumElements();
15752 size_t FirstQtr = NumElts / 2;
15753 size_t ThirdQtr = NumElts + NumElts / 2;
15754 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
15755 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
15756 if (!IsFirstHalf && !IsSecondHalf)
15757 return SDValue();
15758
15759 // Find the intersection between shuffle users of V1 and V2.
15760 SmallVector<SDNode *, 2> Shuffles;
15761 for (SDNode *User : V1->uses())
15762 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
15763 User->getOperand(1) == V2)
15764 Shuffles.push_back(User);
15765 // Limit user size to two for now.
15766 if (Shuffles.size() != 2)
15767 return SDValue();
15768 // Find out which half of the 512-bit shuffles is each smaller shuffle
15769 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
15770 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
15771 SDNode *FirstHalf;
15772 SDNode *SecondHalf;
15773 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
15774 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
15775 FirstHalf = Shuffles[0];
15776 SecondHalf = Shuffles[1];
15777 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
15778 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
15779 FirstHalf = Shuffles[1];
15780 SecondHalf = Shuffles[0];
15781 } else {
15782 return SDValue();
15783 }
15784 // Lower into unpck and perm. Return the perm of this shuffle and replace
15785 // the other.
15786 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
15787 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
15788 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15789 DAG.getTargetConstant(0x20, DL, MVT::i8));
15790 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
15791 DAG.getTargetConstant(0x31, DL, MVT::i8));
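 // Immediate 0x20 concatenates the low 128-bit halves of Unpckl and Unpckh,
 // while 0x31 concatenates their high halves.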
15792 if (IsFirstHalf) {
15793 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
15794 return Perm1;
15795 }
15796 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
15797 return Perm2;
15798}
15799
15800/// Handle lowering of 4-lane 64-bit floating point shuffles.
15801///
15802/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
15803/// isn't available.
15804static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15805 const APInt &Zeroable, SDValue V1, SDValue V2,
15806 const X86Subtarget &Subtarget,
15807 SelectionDAG &DAG) {
15808 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15809 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
15810 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15811
15812 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
15813 Subtarget, DAG))
15814 return V;
15815
15816 if (V2.isUndef()) {
15817 // Check for being able to broadcast a single element.
15818 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
15819 Mask, Subtarget, DAG))
15820 return Broadcast;
15821
15822 // Use low duplicate instructions for masks that match their pattern.
15823 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15824 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
15825
15826 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
15827 // Non-half-crossing single input shuffles can be lowered with an
15828 // interleaved permutation.
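 // (Each immediate bit selects the high element within its 128-bit lane, so
 // e.g. Mask = <1, 0, 3, 2> produces VPERMILPMask = 0b0101.)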
15829 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
15830 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
15831 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
15832 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
15833 }
15834
15835 // With AVX2 we have direct support for this permutation.
15836 if (Subtarget.hasAVX2())
15837 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
15838 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15839
15840 // Try to create an in-lane repeating shuffle mask and then shuffle the
15841 // results into the target lanes.
15842 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15843 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15844 return V;
15845
15846 // Try to permute the lanes and then use a per-lane permute.
15847 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
15848 Mask, DAG, Subtarget))
15849 return V;
15850
15851 // Otherwise, fall back.
15852 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
15853 DAG, Subtarget);
15854 }
15855
15856 // Use dedicated unpack instructions for masks that match their pattern.
15857 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
15858 return V;
15859
15860 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
15861 Zeroable, Subtarget, DAG))
15862 return Blend;
15863
15864 // Check if the blend happens to exactly fit that of SHUFPD.
15865 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
15866 Zeroable, Subtarget, DAG))
15867 return Op;
15868
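 // ("In place" here means every mask element drawn from that input already
 // sits at its identity position.)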
15869 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
15870 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
15871
15872 // If we have lane crossing shuffles AND they don't all come from the lower
15873 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15874 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
15875 // canonicalizes to a blend of splats, which isn't necessary for this combine.
15876 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
15877 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
15878 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
15879 (V2.getOpcode() != ISD::BUILD_VECTOR))
15880 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
15881
15882 // If we have one input in place, then we can permute the other input and
15883 // blend the result.
15884 if (V1IsInPlace || V2IsInPlace)
15885 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15886 Subtarget, DAG);
15887
15888 // Try to create an in-lane repeating shuffle mask and then shuffle the
15889 // results into the target lanes.
15890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
15891 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15892 return V;
15893
15894 // Try to simplify this by merging 128-bit lanes to enable a lane-based
15895 // shuffle. However, if we have AVX2 and either input is already in place,
15896 // we will be able to shuffle the other input even across lanes in a single
15897 // instruction, so skip this pattern.
15898 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
15899 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
15900 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
15901 return V;
15902
15903 // If we have VLX support, we can use VEXPAND.
15904 if (Subtarget.hasVLX())
15905 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
15906 DAG, Subtarget))
15907 return V;
15908
15909 // If we have AVX2 then we always want to lower with a blend because at v4 we
15910 // can fully permute the elements.
15911 if (Subtarget.hasAVX2())
15912 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
15913 Subtarget, DAG);
15914
15915 // Otherwise fall back on generic lowering.
15916 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
15917 Subtarget, DAG);
15918}
15919
15920/// Handle lowering of 4-lane 64-bit integer shuffles.
15921///
15922/// This routine is only called when we have AVX2 and thus a reasonable
15923/// instruction set for v4i64 shuffling.
15924static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15925 const APInt &Zeroable, SDValue V1, SDValue V2,
15926 const X86Subtarget &Subtarget,
15927 SelectionDAG &DAG) {
15928 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15929 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
15930 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15931 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
15932
15933 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15934 Subtarget, DAG))
15935 return V;
15936
15937 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
15938 Zeroable, Subtarget, DAG))
15939 return Blend;
15940
15941 // Check for being able to broadcast a single element.
15942 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
15943 Subtarget, DAG))
15944 return Broadcast;
15945
15946 // Try to use shift instructions if fast.
15947 if (Subtarget.preferLowerShuffleAsShift())
15948 if (SDValue Shift =
15949 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
15950 Subtarget, DAG, /*BitwiseOnly*/ true))
15951 return Shift;
15952
15953 if (V2.isUndef()) {
15954 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
15955 // can use lower latency instructions that will operate on both lanes.
15956 SmallVector<int, 2> RepeatedMask;
15957 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
15958 SmallVector<int, 4> PSHUFDMask;
15959 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
15960 return DAG.getBitcast(
15961 MVT::v4i64,
15962 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
15963 DAG.getBitcast(MVT::v8i32, V1),
15964 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15965 }
15966
15967 // AVX2 provides a direct instruction for permuting a single input across
15968 // lanes.
15969 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
15970 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15971 }
15972
15973 // Try to use shift instructions.
15974 if (SDValue Shift =
15975 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
15976 DAG, /*BitwiseOnly*/ false))
15977 return Shift;
15978
15979 // If we have VLX support, we can use VALIGN or VEXPAND.
15980 if (Subtarget.hasVLX()) {
15981 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
15982 Zeroable, Subtarget, DAG))
15983 return Rotate;
15984
15985 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
15986 DAG, Subtarget))
15987 return V;
15988 }
15989
15990 // Try to use PALIGNR.
15991 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
15992 Subtarget, DAG))
15993 return Rotate;
15994
15995 // Use dedicated unpack instructions for masks that match their pattern.
15996 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
15997 return V;
15998
15999 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16000 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16001
16002 // If we have one input in place, then we can permute the other input and
16003 // blend the result.
16004 if (V1IsInPlace || V2IsInPlace)
16005 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16006 Subtarget, DAG);
16007
16008 // Try to create an in-lane repeating shuffle mask and then shuffle the
16009 // results into the target lanes.
16010 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16011 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16012 return V;
16013
16014 // Try to lower to PERMQ(BLENDD(V1,V2)).
16015 if (SDValue V =
16016 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16017 return V;
16018
16019 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16020 // shuffle. However, if we have AVX2 and either input is already in place,
16021 // we will be able to shuffle the other input even across lanes in a single
16022 // instruction, so skip this pattern.
16023 if (!V1IsInPlace && !V2IsInPlace)
16024 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16025 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16026 return Result;
16027
16028 // Otherwise fall back on generic blend lowering.
16029 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16030 Subtarget, DAG);
16031}
16032
16033/// Handle lowering of 8-lane 32-bit floating point shuffles.
16034///
16035/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16036/// isn't available.
16037static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16038 const APInt &Zeroable, SDValue V1, SDValue V2,
16039 const X86Subtarget &Subtarget,
16040 SelectionDAG &DAG) {
16041 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16042 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16043 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16044
16045 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16046 Zeroable, Subtarget, DAG))
16047 return Blend;
16048
16049 // Check for being able to broadcast a single element.
16050 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16051 Subtarget, DAG))
16052 return Broadcast;
16053
16054 if (!Subtarget.hasAVX2()) {
16055 SmallVector<int> InLaneMask;
16056 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16057
16058 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16059 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16060 /*SimpleOnly*/ true))
16061 return R;
16062 }
16063 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16064 Zeroable, Subtarget, DAG))
16065 return DAG.getBitcast(MVT::v8f32, ZExt);
16066
16067 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16068 // options to efficiently lower the shuffle.
16069 SmallVector<int, 4> RepeatedMask;
16070 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16071 assert(RepeatedMask.size() == 4 &&
16072 "Repeated masks must be half the mask width!");
16073
16074 // Use even/odd duplicate instructions for masks that match their pattern.
16075 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16076 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16077 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16078 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16079
16080 if (V2.isUndef())
16081 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16082 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16083
16084 // Use dedicated unpack instructions for masks that match their pattern.
16085 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
16086 return V;
16087
16088 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16089 // have already handled any direct blends.
16090 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16091 }
16092
16093 // Try to create an in-lane repeating shuffle mask and then shuffle the
16094 // results into the target lanes.
16095 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16096 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16097 return V;
16098
16099 // If we have a single input shuffle with different shuffle patterns in the
16100 // two 128-bit lanes use the variable mask to VPERMILPS.
16101 if (V2.isUndef()) {
16102 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16103 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16104 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16105 }
16106 if (Subtarget.hasAVX2()) {
16107 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16108 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16109 }
16110 // Otherwise, fall back.
16111 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16112 DAG, Subtarget);
16113 }
16114
16115 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16116 // shuffle.
16117 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16118 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16119 return Result;
16120
16121 // If we have VLX support, we can use VEXPAND.
16122 if (Subtarget.hasVLX())
16123 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
16124 DAG, Subtarget))
16125 return V;
16126
16127 // Try to match an interleave of two v8f32s and lower them as unpck and
16128 // permutes using ymms. This needs to go before we try to split the vectors.
16129 //
16130 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
16131 // this path inadvertently.
16132 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16133 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16134 Mask, DAG))
16135 return V;
16136
16137 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
16138 // to split, since after the split we get more efficient code using the
16139 // vpunpcklwd and vpunpckhwd instructions instead of vblend.
16140 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16141 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
16142 DAG);
16143
16144 // If we have AVX2 then we always want to lower with a blend because at v8 we
16145 // can fully permute the elements.
16146 if (Subtarget.hasAVX2())
16147 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16148 Subtarget, DAG);
16149
16150 // Otherwise fall back on generic lowering.
16151 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
16152 Subtarget, DAG);
16153}
16154
16155/// Handle lowering of 8-lane 32-bit integer shuffles.
16156///
16157/// This routine is only called when we have AVX2 and thus a reasonable
16158/// instruction set for v8i32 shuffling.
16159static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16160 const APInt &Zeroable, SDValue V1, SDValue V2,
16161 const X86Subtarget &Subtarget,
16162 SelectionDAG &DAG) {
16163 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16164 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16165 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16166 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16167
16168 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16169
16170 // Whenever we can lower this as a zext, that instruction is strictly faster
16171 // than any alternative. It also allows us to fold memory operands into the
16172 // shuffle in many cases.
16173 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16174 Zeroable, Subtarget, DAG))
16175 return ZExt;
16176
16177 // Try to match an interleave of two v8i32s and lower them as unpck and
16178 // permutes using ymms. This needs to go before we try to split the vectors.
16179 if (!Subtarget.hasAVX512())
16180 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16181 Mask, DAG))
16182 return V;
16183
16184 // For non-AVX512, if the mask matches an in-lane 16-bit unpack pattern, try
16185 // to split, since after the split we get more efficient code than vblend by
16186 // using the vpunpcklwd and vpunpckhwd instructions.
16187 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16188 !Subtarget.hasAVX512())
16189 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
16190 DAG);
16191
16192 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16193 Zeroable, Subtarget, DAG))
16194 return Blend;
16195
16196 // Check for being able to broadcast a single element.
16197 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16198 Subtarget, DAG))
16199 return Broadcast;
16200
16201 // Try to use shift instructions if fast.
16202 if (Subtarget.preferLowerShuffleAsShift()) {
16203 if (SDValue Shift =
16204 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16205 Subtarget, DAG, /*BitwiseOnly*/ true))
16206 return Shift;
16207 if (NumV2Elements == 0)
16208 if (SDValue Rotate =
16209 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16210 return Rotate;
16211 }
16212
16213 // If the shuffle mask is repeated in each 128-bit lane we can use more
16214 // efficient instructions that mirror the shuffles across the two 128-bit
16215 // lanes.
16216 SmallVector<int, 4> RepeatedMask;
16217 bool Is128BitLaneRepeatedShuffle =
16218 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16219 if (Is128BitLaneRepeatedShuffle) {
16220 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16221 if (V2.isUndef())
16222 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16223 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16224
16225 // Use dedicated unpack instructions for masks that match their pattern.
16226 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
16227 return V;
16228 }
16229
16230 // Try to use shift instructions.
16231 if (SDValue Shift =
16232 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16233 DAG, /*BitwiseOnly*/ false))
16234 return Shift;
16235
16236 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16237 if (SDValue Rotate =
16238 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16239 return Rotate;
16240
16241 // If we have VLX support, we can use VALIGN or EXPAND.
16242 if (Subtarget.hasVLX()) {
16243 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16244 Zeroable, Subtarget, DAG))
16245 return Rotate;
16246
16247 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
16248 DAG, Subtarget))
16249 return V;
16250 }
16251
16252 // Try to use byte rotation instructions.
16253 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16254 Subtarget, DAG))
16255 return Rotate;
16256
16257 // Try to create an in-lane repeating shuffle mask and then shuffle the
16258 // results into the target lanes.
16259 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16260 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16261 return V;
16262
16263 if (V2.isUndef()) {
16264 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16265 // because that should be faster than the variable permute alternatives.
16266 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
16267 return V;
16268
16269 // If the shuffle patterns aren't repeated but it's a single input, directly
16270 // generate a cross-lane VPERMD instruction.
16271 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16272 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16273 }
16274
16275 // Assume that a single SHUFPS is faster than an alternative sequence of
16276 // multiple instructions (even if the CPU has a domain penalty).
16277 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16278 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16279 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16280 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16281 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16282 CastV1, CastV2, DAG);
16283 return DAG.getBitcast(MVT::v8i32, ShufPS);
16284 }
16285
16286 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16287 // shuffle.
16288 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16289 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16290 return Result;
16291
16292 // Otherwise fall back on generic blend lowering.
16293 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16294 Subtarget, DAG);
16295}
16296
16297/// Handle lowering of 16-lane 16-bit integer shuffles.
16298///
16299/// This routine is only called when we have AVX2 and thus a reasonable
16300/// instruction set for v16i16 shuffling.
16301static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16302 const APInt &Zeroable, SDValue V1, SDValue V2,
16303 const X86Subtarget &Subtarget,
16304 SelectionDAG &DAG) {
16305 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16306 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16307 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16308 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16309
16310 // Whenever we can lower this as a zext, that instruction is strictly faster
16311 // than any alternative. It also allows us to fold memory operands into the
16312 // shuffle in many cases.
16313 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16314 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16315 return ZExt;
16316
16317 // Check for being able to broadcast a single element.
16318 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16319 Subtarget, DAG))
16320 return Broadcast;
16321
16322 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16323 Zeroable, Subtarget, DAG))
16324 return Blend;
16325
16326 // Use dedicated unpack instructions for masks that match their pattern.
16327 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
16328 return V;
16329
16330 // Use dedicated pack instructions for masks that match their pattern.
16331 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
16332 Subtarget))
16333 return V;
16334
16335 // Try to lower using a truncation.
16336 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16337 Subtarget, DAG))
16338 return V;
16339
16340 // Try to use shift instructions.
16341 if (SDValue Shift =
16342 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16343 Subtarget, DAG, /*BitwiseOnly*/ false))
16344 return Shift;
16345
16346 // Try to use byte rotation instructions.
16347 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16348 Subtarget, DAG))
16349 return Rotate;
16350
16351 // Try to create an in-lane repeating shuffle mask and then shuffle the
16352 // results into the target lanes.
16353 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16354 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16355 return V;
16356
16357 if (V2.isUndef()) {
16358 // Try to use bit rotation instructions.
16359 if (SDValue Rotate =
16360 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16361 return Rotate;
16362
16363 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16364 // because that should be faster than the variable permute alternatives.
16365 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
16366 return V;
16367
16368 // There are no generalized cross-lane shuffle operations available on i16
16369 // element types.
16370 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16371 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16372 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16373 return V;
16374
16375 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16376 DAG, Subtarget);
16377 }
16378
16379 SmallVector<int, 8> RepeatedMask;
16380 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16381 // As this is a single-input shuffle, the repeated mask should be
16382 // a strictly valid v8i16 mask that we can pass through to the v8i16
16383 // lowering to handle even the v16 case.
16384 return lowerV8I16GeneralSingleInputShuffle(
16385 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16386 }
16387 }
16388
16389 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16390 Zeroable, Subtarget, DAG))
16391 return PSHUFB;
16392
16393 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16394 if (Subtarget.hasBWI())
16395 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16396
16397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16398 // shuffle.
16399 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16400 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16401 return Result;
16402
16403 // Try to permute the lanes and then use a per-lane permute.
16404 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16405 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16406 return V;
16407
16408 // Try to match an interleave of two v16i16s and lower them as unpck and
16409 // permutes using ymms.
16410 if (!Subtarget.hasAVX512())
16411 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16412 Mask, DAG))
16413 return V;
16414
16415 // Otherwise fall back on generic lowering.
16416 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
16417 Subtarget, DAG);
16418}
16419
16420/// Handle lowering of 32-lane 8-bit integer shuffles.
16421///
16422/// This routine is only called when we have AVX2 and thus a reasonable
16423/// instruction set for v32i8 shuffling.
16424static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16425 const APInt &Zeroable, SDValue V1, SDValue V2,
16426 const X86Subtarget &Subtarget,
16427 SelectionDAG &DAG) {
16428 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16429 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16430 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16431 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16432
16433 // Whenever we can lower this as a zext, that instruction is strictly faster
16434 // than any alternative. It also allows us to fold memory operands into the
16435 // shuffle in many cases.
16436 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16437 Zeroable, Subtarget, DAG))
16438 return ZExt;
16439
16440 // Check for being able to broadcast a single element.
16441 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16442 Subtarget, DAG))
16443 return Broadcast;
16444
16445 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16446 Zeroable, Subtarget, DAG))
16447 return Blend;
16448
16449 // Use dedicated unpack instructions for masks that match their pattern.
16450 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
16451 return V;
16452
16453 // Use dedicated pack instructions for masks that match their pattern.
16454 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
16455 Subtarget))
16456 return V;
16457
16458 // Try to lower using a truncation.
16459 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16460 Subtarget, DAG))
16461 return V;
16462
16463 // Try to use shift instructions.
16464 if (SDValue Shift =
16465 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16466 DAG, /*BitwiseOnly*/ false))
16467 return Shift;
16468
16469 // Try to use byte rotation instructions.
16470 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16471 Subtarget, DAG))
16472 return Rotate;
16473
16474 // Try to use bit rotation instructions.
16475 if (V2.isUndef())
16476 if (SDValue Rotate =
16477 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16478 return Rotate;
16479
16480 // Try to create an in-lane repeating shuffle mask and then shuffle the
16481 // results into the target lanes.
16482 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16483 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16484 return V;
16485
16486 // There are no generalized cross-lane shuffle operations available on i8
16487 // element types.
16488 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16489 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16490 // because that should be faster than the variable permute alternatives.
16491 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
16492 return V;
16493
16494 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16495 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16496 return V;
16497
16498 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16499 DAG, Subtarget);
16500 }
16501
16502 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16503 Zeroable, Subtarget, DAG))
16504 return PSHUFB;
16505
16506 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16507 if (Subtarget.hasVBMI())
16508 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16509
16510 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16511 // shuffle.
16512 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16513 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16514 return Result;
16515
16516 // Try to permute the lanes and then use a per-lane permute.
16517 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16518 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16519 return V;
16520
16521 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16522 // by zeroable elements in the remaining 24 elements. Turn this into two
16523 // vmovqb instructions shuffled together.
16524 if (Subtarget.hasVLX())
16525 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16526 Mask, Zeroable, DAG))
16527 return V;
16528
16529 // Try to match an interleave of two v32i8s and lower them as unpck and
16530 // permutes using ymms.
16531 if (!Subtarget.hasAVX512())
16532 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16533 Mask, DAG))
16534 return V;
16535
16536 // Otherwise fall back on generic lowering.
16537 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
16538 Subtarget, DAG);
16539}
16540
16541/// High-level routine to lower various 256-bit x86 vector shuffles.
16542///
16543/// This routine either breaks down the specific type of a 256-bit x86 vector
16544/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16545/// together based on the available instructions.
16546static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16547 SDValue V1, SDValue V2, const APInt &Zeroable,
16548 const X86Subtarget &Subtarget,
16549 SelectionDAG &DAG) {
16550 // If we have a single input to the zero element, insert that into V1 if we
16551 // can do so cheaply.
16552 int NumElts = VT.getVectorNumElements();
16553 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16554
16555 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16556 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16557 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16558 return Insertion;
16559
16560 // Handle special cases where the lower or upper half is UNDEF.
16561 if (SDValue V =
16562 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16563 return V;
16564
16565 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16566 // can check for those subtargets here and avoid much of the subtarget
16567 // querying in the per-vector-type lowering routines. With AVX1 we have
16568 // essentially *zero* ability to manipulate a 256-bit vector with integer
16569 // types. Since we'll use floating point types there eventually, just
16570 // immediately cast everything to a float and operate entirely in that domain.
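 // (For example, a v8i32 shuffle is reissued as a v8f32 shuffle and a v4i64
 // shuffle as v4f64; sub-32-bit element types fall back to bit ops or
 // splitting below.)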
16571 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16572 int ElementBits = VT.getScalarSizeInBits();
16573 if (ElementBits < 32) {
16574 // No floating point type available, if we can't use the bit operations
16575 // for masking/blending then decompose into 128-bit vectors.
16576 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16577 Subtarget, DAG))
16578 return V;
16579 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16580 return V;
16581 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16582 }
16583
16584 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16585 VT.getVectorNumElements());
16586 V1 = DAG.getBitcast(FpVT, V1);
16587 V2 = DAG.getBitcast(FpVT, V2);
16588 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16589 }
16590
16591 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16592 V1 = DAG.getBitcast(MVT::v16i16, V1);
16593 V2 = DAG.getBitcast(MVT::v16i16, V2);
16594 return DAG.getBitcast(VT,
16595 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16596 }
16597
16598 switch (VT.SimpleTy) {
16599 case MVT::v4f64:
16600 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16601 case MVT::v4i64:
16602 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16603 case MVT::v8f32:
16604 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16605 case MVT::v8i32:
16606 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16607 case MVT::v16i16:
16608 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16609 case MVT::v32i8:
16610 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16611
16612 default:
16613 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16614 }
16615}
16616
16617/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
16618static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16619 const APInt &Zeroable, SDValue V1, SDValue V2,
16620 const X86Subtarget &Subtarget,
16621 SelectionDAG &DAG) {
16622 assert(VT.getScalarSizeInBits() == 64 &&
16623 "Unexpected element type size for 128bit shuffle.");
16624
16625 // Handling a 256-bit vector requires VLX, and most probably the
16626 // function lowerV2X128VectorShuffle() is the better solution.
16627 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16628
16629 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16630 SmallVector<int, 4> Widened128Mask;
16631 if (!canWidenShuffleElements(Mask, Widened128Mask))
16632 return SDValue();
16633 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16634
16635 // Try to use an insert into a zero vector.
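 // (Zeroable bit i corresponds to 64-bit element i: 0xf0 means elements 4-7
 // are zeroable and 0x0c means elements 2-3 are, so we can keep just the low
 // 256 or 128 bits of V1 and insert them into a zero vector.)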
16636 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16637 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16638 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16639 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16640 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16641 DAG.getIntPtrConstant(0, DL));
16642 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16643 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16644 DAG.getIntPtrConstant(0, DL));
16645 }
16646
16647 // Check for patterns which can be matched with a single insert of a 256-bit
16648 // subvector.
16649 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16650 if (OnlyUsesV1 ||
16651 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16652 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16653 SDValue SubVec =
16654 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16655 DAG.getIntPtrConstant(0, DL));
16656 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16657 DAG.getIntPtrConstant(4, DL));
16658 }
16659
16660 // See if this is an insertion of the lower 128-bits of V2 into V1.
16661 bool IsInsert = true;
16662 int V2Index = -1;
16663 for (int i = 0; i < 4; ++i) {
16664 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16665 if (Widened128Mask[i] < 0)
16666 continue;
16667
16668 // Make sure all V1 subvectors are in place.
16669 if (Widened128Mask[i] < 4) {
16670 if (Widened128Mask[i] != i) {
16671 IsInsert = false;
16672 break;
16673 }
16674 } else {
16675 // Make sure we only have a single V2 index and it's the lowest 128-bits.
16676 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16677 IsInsert = false;
16678 break;
16679 }
16680 V2Index = i;
16681 }
16682 }
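 // (For example, Widened128Mask = <0, 4, 2, 3> inserts V2's low 128 bits into
 // V1 at 128-bit lane 1.)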
16683 if (IsInsert && V2Index >= 0) {
16684 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16685 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16686 DAG.getIntPtrConstant(0, DL));
16687 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16688 }
16689
16690 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
16691 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16692 // possible we at least ensure the lanes stay sequential to help later
16693 // combines.
16694 SmallVector<int, 2> Widened256Mask;
16695 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
16696 Widened128Mask.clear();
16697 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
16698 }
16699
16700 // Try to lower to vshuf64x2/vshuf32x4.
16701 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
16702 int PermMask[4] = {-1, -1, -1, -1};
16703 // Ensure elements came from the same Op.
16704 for (int i = 0; i < 4; ++i) {
16705 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16706 if (Widened128Mask[i] < 0)
16707 continue;
16708
16709 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
16710 unsigned OpIndex = i / 2;
16711 if (Ops[OpIndex].isUndef())
16712 Ops[OpIndex] = Op;
16713 else if (Ops[OpIndex] != Op)
16714 return SDValue();
16715
16716 PermMask[i] = Widened128Mask[i] % 4;
16717 }
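 // (For example, Widened128Mask = <0, 1, 4, 5> gives Ops = {V1, V2} and
 // PermMask = <0, 1, 0, 1>.)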
16718
16719 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
16720 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
16721}
16722
16723/// Handle lowering of 8-lane 64-bit floating point shuffles.
16724static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16725 const APInt &Zeroable, SDValue V1, SDValue V2,
16726 const X86Subtarget &Subtarget,
16727 SelectionDAG &DAG) {
16728 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16729 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
16730 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16731
16732 if (V2.isUndef()) {
16733 // Use low duplicate instructions for masks that match their pattern.
16734 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
16735 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
16736
16737 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
16738 // Non-half-crossing single input shuffles can be lowered with an
16739 // interleaved permutation.
16740 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16741 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
16742 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
16743 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
16744 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
16745 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16746 }
16747
16748 SmallVector<int, 4> RepeatedMask;
16749 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
16750 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
16751 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16752 }
16753
16754 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
16755 V2, Subtarget, DAG))
16756 return Shuf128;
16757
16758 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
16759 return Unpck;
16760
16761 // Check if the blend happens to exactly fit that of SHUFPD.
16762 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
16763 Zeroable, Subtarget, DAG))
16764 return Op;
16765
16766 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
16767 DAG, Subtarget))
16768 return V;
16769
16770 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
16771 Zeroable, Subtarget, DAG))
16772 return Blend;
16773
16774 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
16775}
16776
16777/// Handle lowering of 16-lane 32-bit floating point shuffles.
16778static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16779 const APInt &Zeroable, SDValue V1, SDValue V2,
16780 const X86Subtarget &Subtarget,
16781 SelectionDAG &DAG) {
16782 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16783 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
16784 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16785
16786 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16787 // options to efficiently lower the shuffle.
16788 SmallVector<int, 4> RepeatedMask;
16789 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
16790 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16791
16792 // Use even/odd duplicate instructions for masks that match their pattern.
16793 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16794 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
16795 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16796 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
16797
16798 if (V2.isUndef())
16799 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
16800 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16801
16802 // Use dedicated unpack instructions for masks that match their pattern.
16803 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
16804 return V;
16805
16806 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16807 Zeroable, Subtarget, DAG))
16808 return Blend;
16809
16810 // Otherwise, fall back to a SHUFPS sequence.
16811 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
16812 }
16813
16814 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
16815 Zeroable, Subtarget, DAG))
16816 return Blend;
16817
16818 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16819 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16820 return DAG.getBitcast(MVT::v16f32, ZExt);
16821
16822 // Try to create an in-lane repeating shuffle mask and then shuffle the
16823 // results into the target lanes.
16824 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16825 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
16826 return V;
16827
16828 // If we have a single input shuffle with different shuffle patterns in the
16829 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
16830 if (V2.isUndef() &&
16831 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
16832 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
16833 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
16834 }
16835
16836 // If we have AVX512F support, we can use VEXPAND.
16837 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
16838 V1, V2, DAG, Subtarget))
16839 return V;
16840
16841 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
16842}
16843
16844/// Handle lowering of 8-lane 64-bit integer shuffles.
16845static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16846 const APInt &Zeroable, SDValue V1, SDValue V2,
16847 const X86Subtarget &Subtarget,
16848 SelectionDAG &DAG) {
16849 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16850 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
16851 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16852
16853 // Try to use shift instructions if fast.
16854 if (Subtarget.preferLowerShuffleAsShift())
16855 if (SDValue Shift =
16856 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
16857 Subtarget, DAG, /*BitwiseOnly*/ true))
16858 return Shift;
16859
16860 if (V2.isUndef()) {
16861 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16862 // can use lower latency instructions that will operate on all four
16863 // 128-bit lanes.
16864 SmallVector<int, 2> Repeated128Mask;
16865 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
16866 SmallVector<int, 4> PSHUFDMask;
16867 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
16868 return DAG.getBitcast(
16869 MVT::v8i64,
16870 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
16871 DAG.getBitcast(MVT::v16i32, V1),
16872 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16873 }
16874
16875 SmallVector<int, 4> Repeated256Mask;
16876 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
16877 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
16878 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
16879 }
16880
16881 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
16882 V2, Subtarget, DAG))
16883 return Shuf128;
16884
16885 // Try to use shift instructions.
16886 if (SDValue Shift =
16887 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
16888 DAG, /*BitwiseOnly*/ false))
16889 return Shift;
16890
16891 // Try to use VALIGN.
16892 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
16893 Zeroable, Subtarget, DAG))
16894 return Rotate;
16895
16896 // Try to use PALIGNR.
16897 if (Subtarget.hasBWI())
16898 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
16899 Subtarget, DAG))
16900 return Rotate;
16901
16902 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
16903 return Unpck;
16904
16905 // If we have AVX512F support, we can use VEXPAND.
16906 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
16907 DAG, Subtarget))
16908 return V;
16909
16910 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
16911 Zeroable, Subtarget, DAG))
16912 return Blend;
16913
16914 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
16915}
16916
16917/// Handle lowering of 16-lane 32-bit integer shuffles.
16918static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16919 const APInt &Zeroable, SDValue V1, SDValue V2,
16920 const X86Subtarget &Subtarget,
16921 SelectionDAG &DAG) {
16922 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16923 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
16924 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16925
16926 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16927
16928 // Whenever we can lower this as a zext, that instruction is strictly faster
16929 // than any alternative. It also allows us to fold memory operands into the
16930 // shuffle in many cases.
16931 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16932 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
16933 return ZExt;
16934
16935 // Try to use shift instructions if fast.
16936 if (Subtarget.preferLowerShuffleAsShift()) {
16937 if (SDValue Shift =
16938 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16939 Subtarget, DAG, /*BitwiseOnly*/ true))
16940 return Shift;
16941 if (NumV2Elements == 0)
16942 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
16943 Subtarget, DAG))
16944 return Rotate;
16945 }
16946
16947 // If the shuffle mask is repeated in each 128-bit lane we can use more
16948 // efficient instructions that mirror the shuffles across the four 128-bit
16949 // lanes.
16950 SmallVector<int, 4> RepeatedMask;
16951 bool Is128BitLaneRepeatedShuffle =
16952 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
16953 if (Is128BitLaneRepeatedShuffle) {
16954 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16955 if (V2.isUndef())
16956 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
16957 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16958
16959 // Use dedicated unpack instructions for masks that match their pattern.
16960 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
16961 return V;
16962 }
16963
16964 // Try to use shift instructions.
16965 if (SDValue Shift =
16966 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
16967 Subtarget, DAG, /*BitwiseOnly*/ false))
16968 return Shift;
16969
16970 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16971 if (SDValue Rotate =
16972 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
16973 return Rotate;
16974
16975 // Try to use VALIGN.
16976 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
16977 Zeroable, Subtarget, DAG))
16978 return Rotate;
16979
16980 // Try to use byte rotation instructions.
16981 if (Subtarget.hasBWI())
16982 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
16983 Subtarget, DAG))
16984 return Rotate;
16985
16986 // Assume that a single SHUFPS is faster than using a permv shuffle.
16987 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16988 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16989 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
16990 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
16991 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
16992 CastV1, CastV2, DAG);
16993 return DAG.getBitcast(MVT::v16i32, ShufPS);
16994 }
16995
16996 // Try to create an in-lane repeating shuffle mask and then shuffle the
16997 // results into the target lanes.
16998 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16999 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17000 return V;
17001
17002 // If we have AVX512F support, we can use VEXPAND.
17003 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
17004 DAG, Subtarget))
17005 return V;
17006
17007 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17008 Zeroable, Subtarget, DAG))
17009 return Blend;
17010
17011 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17012}
17013
17014/// Handle lowering of 32-lane 16-bit integer shuffles.
17015static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17016 const APInt &Zeroable, SDValue V1, SDValue V2,
17017 const X86Subtarget &Subtarget,
17018 SelectionDAG &DAG) {
17019 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17020 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17021 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17022 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17023
17024 // Whenever we can lower this as a zext, that instruction is strictly faster
17025 // than any alternative. It also allows us to fold memory operands into the
17026 // shuffle in many cases.
17027 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17028 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17029 return ZExt;
17030
17031 // Use dedicated unpack instructions for masks that match their pattern.
17032 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
17033 return V;
17034
17035 // Use dedicated pack instructions for masks that match their pattern.
17036 if (SDValue V =
17037 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
17038 return V;
17039
17040 // Try to use shift instructions.
17041 if (SDValue Shift =
17042 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17043 Subtarget, DAG, /*BitwiseOnly*/ false))
17044 return Shift;
17045
17046 // Try to use byte rotation instructions.
17047 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17048 Subtarget, DAG))
17049 return Rotate;
17050
17051 if (V2.isUndef()) {
17052 // Try to use bit rotation instructions.
17053 if (SDValue Rotate =
17054 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17055 return Rotate;
17056
17057 SmallVector<int, 8> RepeatedMask;
17058 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17059 // As this is a single-input shuffle, the repeated mask should be
17060 // a strictly valid v8i16 mask that we can pass through to the v8i16
17061 // lowering to handle even the v32 case.
17062 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17063 RepeatedMask, Subtarget, DAG);
17064 }
17065 }
17066
17067 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17068 Zeroable, Subtarget, DAG))
17069 return Blend;
17070
17071 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17072 Zeroable, Subtarget, DAG))
17073 return PSHUFB;
17074
17075 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17076}
17077
17078/// Handle lowering of 64-lane 8-bit integer shuffles.
17079static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17080 const APInt &Zeroable, SDValue V1, SDValue V2,
17081 const X86Subtarget &Subtarget,
17082 SelectionDAG &DAG) {
17083 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17084 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17085 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17086 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17087
17088 // Whenever we can lower this as a zext, that instruction is strictly faster
17089 // than any alternative. It also allows us to fold memory operands into the
17090 // shuffle in many cases.
17091 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17092 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17093 return ZExt;
17094
17095 // Use dedicated unpack instructions for masks that match their pattern.
17096 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
17097 return V;
17098
17099 // Use dedicated pack instructions for masks that match their pattern.
17100 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
17101 Subtarget))
17102 return V;
17103
17104 // Try to use shift instructions.
17105 if (SDValue Shift =
17106 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17107 DAG, /*BitwiseOnly*/ false))
17108 return Shift;
17109
17110 // Try to use byte rotation instructions.
17111 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17112 Subtarget, DAG))
17113 return Rotate;
17114
17115 // Try to use bit rotation instructions.
17116 if (V2.isUndef())
17117 if (SDValue Rotate =
17118 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17119 return Rotate;
17120
17121 // Lower as AND if possible.
17122 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17123 Zeroable, Subtarget, DAG))
17124 return Masked;
17125
17126 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17127 Zeroable, Subtarget, DAG))
17128 return PSHUFB;
17129
17130 // Try to create an in-lane repeating shuffle mask and then shuffle the
17131 // results into the target lanes.
17132 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17133 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17134 return V;
17135
17136 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17137 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17138 return Result;
17139
17140 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17141 Zeroable, Subtarget, DAG))
17142 return Blend;
17143
17144 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17145 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17146 // PALIGNR will be cheaper than the second PSHUFB+OR.
17147 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17148 Mask, Subtarget, DAG))
17149 return V;
17150
17151 // If we can't directly blend but can use PSHUFB, that will be better as it
17152 // can both shuffle and set up the inefficient blend.
17153 bool V1InUse, V2InUse;
17154 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17155 DAG, V1InUse, V2InUse);
17156 }
17157
17158 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17159 // shuffle.
17160 if (!V2.isUndef())
17161 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17162 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17163 return Result;
17164
17165 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17166 if (Subtarget.hasVBMI())
17167 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17168
17169 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17170}
17171
17172/// High-level routine to lower various 512-bit x86 vector shuffles.
17173///
17174/// This routine either breaks down the specific type of a 512-bit x86 vector
17175/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17176/// together based on the available instructions.
17177static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17178 MVT VT, SDValue V1, SDValue V2,
17179 const APInt &Zeroable,
17180 const X86Subtarget &Subtarget,
17181 SelectionDAG &DAG) {
17182 assert(Subtarget.hasAVX512() &&
17183 "Cannot lower 512-bit vectors w/ basic ISA!");
17184
17185 // If we have a single input to the zero element, insert that into V1 if we
17186 // can do so cheaply.
17187 int NumElts = Mask.size();
17188 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17189
17190 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17191 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17192 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17193 return Insertion;
17194
17195 // Handle special cases where the lower or upper half is UNDEF.
17196 if (SDValue V =
17197 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17198 return V;
17199
17200 // Check for being able to broadcast a single element.
17201 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17202 Subtarget, DAG))
17203 return Broadcast;
17204
17205 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17206 // Try using bit ops for masking and blending before falling back to
17207 // splitting.
17208 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17209 Subtarget, DAG))
17210 return V;
17211 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17212 return V;
17213
17214 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17215 }
17216
17217 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17218 if (!Subtarget.hasBWI())
17219 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17220 /*SimpleOnly*/ false);
17221
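// With BWI we can reuse the v32i16 shuffle lowering via bitcasts, since there
// are no dedicated fp16/bf16 shuffle instructions at this width.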
17222 V1 = DAG.getBitcast(MVT::v32i16, V1);
17223 V2 = DAG.getBitcast(MVT::v32i16, V2);
17224 return DAG.getBitcast(VT,
17225 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17226 }
17227
17228 // Dispatch to each element type for lowering. If we don't have support for
17229 // specific element type shuffles at 512 bits, immediately split them and
17230 // lower them. Each lowering routine of a given type is allowed to assume that
17231 // the requisite ISA extensions for that element type are available.
17232 switch (VT.SimpleTy) {
17233 case MVT::v8f64:
17234 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17235 case MVT::v16f32:
17236 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17237 case MVT::v8i64:
17238 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17239 case MVT::v16i32:
17240 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17241 case MVT::v32i16:
17242 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17243 case MVT::v64i8:
17244 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17245
17246 default:
17247 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17248 }
17249}
17250
17251static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17252 MVT VT, SDValue V1, SDValue V2,
17253 const X86Subtarget &Subtarget,
17254 SelectionDAG &DAG) {
17255 // Shuffle should be unary.
17256 if (!V2.isUndef())
17257 return SDValue();
17258
17259 int ShiftAmt = -1;
17260 int NumElts = Mask.size();
17261 for (int i = 0; i != NumElts; ++i) {
17262 int M = Mask[i];
17263 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17264 "Unexpected mask index.");
17265 if (M < 0)
17266 continue;
17267
17268 // The first non-undef element determines our shift amount.
17269 if (ShiftAmt < 0) {
17270 ShiftAmt = M - i;
17271 // Need to be shifting right.
17272 if (ShiftAmt <= 0)
17273 return SDValue();
17274 }
17275 // All non-undef elements must shift by the same amount.
17276 if (ShiftAmt != M - i)
17277 return SDValue();
17278 }
17279 assert(ShiftAmt >= 0 && "All undef?");
17280
17281 // Great, we found a shift right.
17282 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17283 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17284 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17285 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17286 DAG.getIntPtrConstant(0, DL));
17287}
17288
17289// Determine if this shuffle can be implemented with a KSHIFT instruction.
17290// Returns the shift amount if possible or -1 if not. This is a simplified
17291// version of matchShuffleAsShift.
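// For example, with MaskOffset 0 and Size 8, the mask <2,3,4,5,6,7,Z,Z>
// (Z = zeroable) matches a KSHIFTR by 2, while <Z,Z,0,1,2,3,4,5> matches a
// KSHIFTL by 2.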
17292static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17293 int MaskOffset, const APInt &Zeroable) {
17294 int Size = Mask.size();
17295
17296 auto CheckZeros = [&](int Shift, bool Left) {
17297 for (int j = 0; j < Shift; ++j)
17298 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17299 return false;
17300
17301 return true;
17302 };
17303
17304 auto MatchShift = [&](int Shift, bool Left) {
17305 unsigned Pos = Left ? Shift : 0;
17306 unsigned Low = Left ? 0 : Shift;
17307 unsigned Len = Size - Shift;
17308 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17309 };
17310
17311 for (int Shift = 1; Shift != Size; ++Shift)
17312 for (bool Left : {true, false})
17313 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17314 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17315 return Shift;
17316 }
17317
17318 return -1;
17319}
17320
17321
17322// Lower vXi1 vector shuffles.
17323 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17324 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17325 // vector, shuffle, and then truncate it back.
17326static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17327 MVT VT, SDValue V1, SDValue V2,
17328 const APInt &Zeroable,
17329 const X86Subtarget &Subtarget,
17330 SelectionDAG &DAG) {
17331 assert(Subtarget.hasAVX512() &&
17332 "Cannot lower 512-bit vectors w/o basic ISA!");
17333
17334 int NumElts = Mask.size();
17335 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17336
17337 // Try to recognize shuffles that are just padding a subvector with zeros.
17338 int SubvecElts = 0;
17339 int Src = -1;
17340 for (int i = 0; i != NumElts; ++i) {
17341 if (Mask[i] >= 0) {
17342 // Grab the source from the first valid mask. All subsequent elements need
17343 // to use this same source.
17344 if (Src < 0)
17345 Src = Mask[i] / NumElts;
17346 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17347 break;
17348 }
17349
17350 ++SubvecElts;
17351 }
17352 assert(SubvecElts != NumElts && "Identity shuffle?");
17353
17354 // Clip to a power of 2.
17355 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17356
17357 // Make sure the number of zeroable bits in the top at least covers the bits
17358 // not covered by the subvector.
17359 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17360 assert(Src >= 0 && "Expected a source!");
17361 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17362 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17363 Src == 0 ? V1 : V2,
17364 DAG.getIntPtrConstant(0, DL));
17365 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17366 DAG.getConstant(0, DL, VT),
17367 Extract, DAG.getIntPtrConstant(0, DL));
17368 }
17369
17370 // Try a simple shift right with undef elements. Later we'll try with zeros.
17371 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17372 DAG))
17373 return Shift;
17374
17375 // Try to match KSHIFTs.
17376 unsigned Offset = 0;
17377 for (SDValue V : { V1, V2 }) {
17378 unsigned Opcode;
17379 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17380 if (ShiftAmt >= 0) {
17381 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17382 MVT WideVT = Res.getSimpleValueType();
17383 // Widened right shifts need two shifts to ensure we shift in zeroes.
17384 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17385 int WideElts = WideVT.getVectorNumElements();
17386 // Shift left to put the original vector in the MSBs of the new size.
17387 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17388 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17389 // Increase the shift amount to account for the left shift.
17390 ShiftAmt += WideElts - NumElts;
17391 }
17392
17393 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17394 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17395 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17396 DAG.getIntPtrConstant(0, DL));
17397 }
17398 Offset += NumElts; // Increment for next iteration.
17399 }
17400
17401 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17402 // ops instead.
17403 // TODO: What other unary shuffles would benefit from this?
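// i.e. shuffle (setcc X, Y, CC), undef, M --> setcc (shuffle X, M), (shuffle Y, M), CC,
// keeping the shuffle on the wider setcc operand type where it is cheap to lower.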
17404 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17405 SDValue Op0 = V1.getOperand(0);
17406 SDValue Op1 = V1.getOperand(1);
17407 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17408 EVT OpVT = Op0.getValueType();
17409 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17410 return DAG.getSetCC(
17411 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17412 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17413 }
17414
17415 MVT ExtVT;
17416 switch (VT.SimpleTy) {
17417 default:
17418 llvm_unreachable("Expected a vector of i1 elements");
17419 case MVT::v2i1:
17420 ExtVT = MVT::v2i64;
17421 break;
17422 case MVT::v4i1:
17423 ExtVT = MVT::v4i32;
17424 break;
17425 case MVT::v8i1:
17426 // Take a 512-bit type so that more shuffles are available on KNL. If we
17427 // have VLX, use a 256-bit shuffle.
17428 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17429 break;
17430 case MVT::v16i1:
17431 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17432 // 256-bit operation available.
17433 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17434 break;
17435 case MVT::v32i1:
17436 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17437 // 256-bit operation available.
17438 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17439 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17440 break;
17441 case MVT::v64i1:
17442 // Fall back to scalarization. FIXME: We can do better if the shuffle
17443 // can be partitioned cleanly.
17444 if (!Subtarget.useBWIRegs())
17445 return SDValue();
17446 ExtVT = MVT::v64i8;
17447 break;
17448 }
17449
17450 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17451 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17452
17453 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17454 // The i1 elements were sign-extended, so rebuild the mask by testing for negative values.
17455 int NumElems = VT.getVectorNumElements();
17456 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17457 (Subtarget.hasDQI() && (NumElems < 32)))
17458 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17459 Shuffle, ISD::SETGT);
17460
17461 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17462}
17463
17464/// Helper function that returns true if the shuffle mask should be
17465/// commuted to improve canonicalization.
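/// For example, the v4i32 mask <4,5,0,6> takes three elements from V2 and only
/// one from V1, so commuting the operands (giving mask <0,1,4,2>) lets the
/// lowering code assume that most elements come from V1.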
17466static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17467 int NumElements = Mask.size();
17468
17469 int NumV1Elements = 0, NumV2Elements = 0;
17470 for (int M : Mask)
17471 if (M < 0)
17472 continue;
17473 else if (M < NumElements)
17474 ++NumV1Elements;
17475 else
17476 ++NumV2Elements;
17477
17478 // Commute the shuffle as needed such that more elements come from V1 than
17479 // V2. This allows us to match the shuffle pattern strictly on how many
17480 // elements come from V1 without handling the symmetric cases.
17481 if (NumV2Elements > NumV1Elements)
17482 return true;
17483
17484 assert(NumV1Elements > 0 && "No V1 indices");
17485
17486 if (NumV2Elements == 0)
17487 return false;
17488
17489 // When the number of V1 and V2 elements are the same, try to minimize the
17490 // number of uses of V2 in the low half of the vector. When that is tied,
17491 // ensure that the sum of indices for V1 is equal to or lower than the sum of
17492 // indices for V2. When those are equal, try to ensure that the number of odd
17493 // indices for V1 is lower than the number of odd indices for V2.
17494 if (NumV1Elements == NumV2Elements) {
17495 int LowV1Elements = 0, LowV2Elements = 0;
17496 for (int M : Mask.slice(0, NumElements / 2))
17497 if (M >= NumElements)
17498 ++LowV2Elements;
17499 else if (M >= 0)
17500 ++LowV1Elements;
17501 if (LowV2Elements > LowV1Elements)
17502 return true;
17503 if (LowV2Elements == LowV1Elements) {
17504 int SumV1Indices = 0, SumV2Indices = 0;
17505 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17506 if (Mask[i] >= NumElements)
17507 SumV2Indices += i;
17508 else if (Mask[i] >= 0)
17509 SumV1Indices += i;
17510 if (SumV2Indices < SumV1Indices)
17511 return true;
17512 if (SumV2Indices == SumV1Indices) {
17513 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17514 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17515 if (Mask[i] >= NumElements)
17516 NumV2OddIndices += i % 2;
17517 else if (Mask[i] >= 0)
17518 NumV1OddIndices += i % 2;
17519 if (NumV2OddIndices < NumV1OddIndices)
17520 return true;
17521 }
17522 }
17523 }
17524
17525 return false;
17526}
17527
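// Returns true if V is a single-use operation that AVX-512 could execute with
// an embedded mask; in that case the shuffle lowering keeps the original
// element width so the shuffle can later fold into that operation's mask
// operand instead of being widened.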
17528static bool canCombineAsMaskOperation(SDValue V,
17529 const X86Subtarget &Subtarget) {
17530 if (!Subtarget.hasAVX512())
17531 return false;
17532
17533 if (!V.getValueType().isSimple())
17534 return false;
17535
17536 MVT VT = V.getSimpleValueType().getScalarType();
17537 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17538 return false;
17539
17540 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17541 // are preferable to blendw/blendvb/masked-mov.
17542 if ((VT == MVT::i16 || VT == MVT::i8) &&
17543 V.getSimpleValueType().getSizeInBits() < 512)
17544 return false;
17545
17546 auto HasMaskOperation = [&](SDValue V) {
17547 // TODO: Currently we only check a limited set of opcodes. We could probably
17548 // extend this to all binary operations by checking TLI.isBinOp().
17549 switch (V->getOpcode()) {
17550 default:
17551 return false;
17552 case ISD::ADD:
17553 case ISD::SUB:
17554 case ISD::AND:
17555 case ISD::XOR:
17556 case ISD::OR:
17557 case ISD::SMAX:
17558 case ISD::SMIN:
17559 case ISD::UMAX:
17560 case ISD::UMIN:
17561 case ISD::ABS:
17562 case ISD::SHL:
17563 case ISD::SRL:
17564 case ISD::SRA:
17565 case ISD::MUL:
17566 break;
17567 }
17568 if (!V->hasOneUse())
17569 return false;
17570
17571 return true;
17572 };
17573
17574 if (HasMaskOperation(V))
17575 return true;
17576
17577 return false;
17578}
17579
17580// Forward declaration.
17581static SDValue canonicalizeShuffleMaskWithHorizOp(
17582 MutableArrayRef<SDValue> Inputs, MutableArrayRef<int> Mask,
17583 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17584 const X86Subtarget &Subtarget);
17585
17586 /// Top-level lowering for x86 vector shuffles.
17587///
17588/// This handles decomposition, canonicalization, and lowering of all x86
17589/// vector shuffles. Most of the specific lowering strategies are encapsulated
17590/// above in helper routines. The canonicalization attempts to widen shuffles
17591/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17592/// s.t. only one of the two inputs needs to be tested, etc.
17593static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17594 SelectionDAG &DAG) {
17595 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17596 ArrayRef<int> OrigMask = SVOp->getMask();
17597 SDValue V1 = Op.getOperand(0);
17598 SDValue V2 = Op.getOperand(1);
17599 MVT VT = Op.getSimpleValueType();
17600 int NumElements = VT.getVectorNumElements();
17601 SDLoc DL(Op);
17602 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17603
17604 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17605 "Can't lower MMX shuffles");
17606
17607 bool V1IsUndef = V1.isUndef();
17608 bool V2IsUndef = V2.isUndef();
17609 if (V1IsUndef && V2IsUndef)
17610 return DAG.getUNDEF(VT);
17611
17612 // When we create a shuffle node, we put the UNDEF node as the second operand,
17613 // but in some cases the first operand may be transformed to UNDEF.
17614 // In this case we should just commute the node.
17615 if (V1IsUndef)
17616 return DAG.getCommutedVectorShuffle(*SVOp);
17617
17618 // Check for non-undef masks pointing at an undef vector and make the masks
17619 // undef as well. This makes it easier to match the shuffle based solely on
17620 // the mask.
17621 if (V2IsUndef &&
17622 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17623 SmallVector<int, 8> NewMask(OrigMask);
17624 for (int &M : NewMask)
17625 if (M >= NumElements)
17626 M = -1;
17627 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17628 }
17629
17630 // Check for illegal shuffle mask element index values.
17631 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17632 (void)MaskUpperLimit;
17633 assert(llvm::all_of(OrigMask,
17634 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17635 "Out of bounds shuffle index");
17636
17637 // We actually see shuffles that are entirely re-arrangements of a set of
17638 // zero inputs. This mostly happens while decomposing complex shuffles into
17639 // simple ones. Directly lower these as a buildvector of zeros.
17640 APInt KnownUndef, KnownZero;
17641 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17642
17643 APInt Zeroable = KnownUndef | KnownZero;
17644 if (Zeroable.isAllOnes())
17645 return getZeroVector(VT, Subtarget, DAG, DL);
17646
17647 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17648
17649 // Try to collapse shuffles into using a vector type with fewer elements but
17650 // wider element types. We cap this to not form integers or floating point
17651 // elements wider than 64 bits. It does not seem beneficial to form i128
17652 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17653 SmallVector<int, 16> WidenedMask;
17654 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17655 !canCombineAsMaskOperation(V1, Subtarget) &&
17656 !canCombineAsMaskOperation(V2, Subtarget) &&
17657 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17658 // Shuffle mask widening should not interfere with a broadcast opportunity
17659 // by obfuscating the operands with bitcasts.
17660 // TODO: Avoid lowering directly from this top-level function: make this
17661 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17662 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17663 Subtarget, DAG))
17664 return Broadcast;
17665
17666 MVT NewEltVT = VT.isFloatingPoint()
17667 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17668 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17669 int NewNumElts = NumElements / 2;
17670 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17671 // Make sure that the new vector type is legal. For example, v2f64 isn't
17672 // legal on SSE1.
17673 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17674 if (V2IsZero) {
17675 // Modify the new Mask to take all zeros from the all-zero vector.
17676 // Choose indices that are blend-friendly.
17677 bool UsedZeroVector = false;
17678 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17679 "V2's non-undef elements are used?!");
17680 for (int i = 0; i != NewNumElts; ++i)
17681 if (WidenedMask[i] == SM_SentinelZero) {
17682 WidenedMask[i] = i + NewNumElts;
17683 UsedZeroVector = true;
17684 }
17685 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17686 // some elements to be undef.
17687 if (UsedZeroVector)
17688 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
17689 }
17690 V1 = DAG.getBitcast(NewVT, V1);
17691 V2 = DAG.getBitcast(NewVT, V2);
17692 return DAG.getBitcast(
17693 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
17694 }
17695 }
17696
17697 SmallVector<SDValue> Ops = {V1, V2};
17698 SmallVector<int> Mask(OrigMask);
17699
17700 // Canonicalize the shuffle with any horizontal ops inputs.
17701 // NOTE: This may update Ops and Mask.
17702 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
17703 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
17704 return DAG.getBitcast(VT, HOp);
17705
17706 V1 = DAG.getBitcast(VT, Ops[0]);
17707 V2 = DAG.getBitcast(VT, Ops[1]);
17708 assert(NumElements == (int)Mask.size() &&
17709 "canonicalizeShuffleMaskWithHorizOp "
17710 "shouldn't alter the shuffle mask size");
17711
17712 // Commute the shuffle if it will improve canonicalization.
17713 if (canonicalizeShuffleMaskWithCommute(Mask)) {
17714 ShuffleVectorSDNode::commuteMask(Mask);
17715 std::swap(V1, V2);
17716 }
17717
17718 // For each vector width, delegate to a specialized lowering routine.
17719 if (VT.is128BitVector())
17720 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17721
17722 if (VT.is256BitVector())
17723 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17724
17725 if (VT.is512BitVector())
17726 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17727
17728 if (Is1BitVector)
17729 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
17730
17731 llvm_unreachable("Unimplemented!");
17732}
17733
17734/// Try to lower a VSELECT instruction to a vector shuffle.
17735static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
17736 const X86Subtarget &Subtarget,
17737 SelectionDAG &DAG) {
17738 SDValue Cond = Op.getOperand(0);
17739 SDValue LHS = Op.getOperand(1);
17740 SDValue RHS = Op.getOperand(2);
17741 MVT VT = Op.getSimpleValueType();
17742
17743 // Only non-legal VSELECTs reach this lowering; convert those into generic
17744 // shuffles and re-use the shuffle lowering path for blends.
17745 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
17746 SmallVector<int, 32> Mask;
17747 if (createShuffleMaskFromVSELECT(Mask, Cond))
17748 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
17749 }
17750
17751 return SDValue();
17752}
17753
17754SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
17755 SDValue Cond = Op.getOperand(0);
17756 SDValue LHS = Op.getOperand(1);
17757 SDValue RHS = Op.getOperand(2);
17758
17759 SDLoc dl(Op);
17760 MVT VT = Op.getSimpleValueType();
17761 if (isSoftF16(VT, Subtarget)) {
17762 MVT NVT = VT.changeVectorElementTypeToInteger();
17763 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
17764 DAG.getBitcast(NVT, LHS),
17765 DAG.getBitcast(NVT, RHS)));
17766 }
17767
17768 // A vselect where all conditions and data are constants can be optimized into
17769 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
17770 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
17771 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
17772 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
17773 return SDValue();
17774
17775 // Try to lower this to a blend-style vector shuffle. This can handle all
17776 // constant condition cases.
17777 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
17778 return BlendOp;
17779
17780 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
17781 // with patterns on the mask registers on AVX-512.
17782 MVT CondVT = Cond.getSimpleValueType();
17783 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
17784 if (CondEltSize == 1)
17785 return Op;
17786
17787 // Variable blends are only legal from SSE4.1 onward.
17788 if (!Subtarget.hasSSE41())
17789 return SDValue();
17790
17791 unsigned EltSize = VT.getScalarSizeInBits();
17792 unsigned NumElts = VT.getVectorNumElements();
17793
17794 // Expand v32i16/v64i8 without BWI.
17795 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
17796 return SDValue();
17797
17798 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
17799 // into an i1 condition so that we can use the mask-based 512-bit blend
17800 // instructions.
17801 if (VT.getSizeInBits() == 512) {
17802 // Build a mask by testing the condition against zero.
17803 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
17804 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
17805 DAG.getConstant(0, dl, CondVT),
17806 ISD::SETNE);
17807 // Now return a new VSELECT using the mask.
17808 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
17809 }
17810
17811 // SEXT/TRUNC cases where the mask doesn't match the destination size.
17812 if (CondEltSize != EltSize) {
17813 // If we don't have a sign splat, rely on the expansion.
17814 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
17815 return SDValue();
17816
17817 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
17818 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
17819 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
17820 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
17821 }
17822
17823 // Only some types will be legal on some subtargets. If we can emit a legal
17824 // VSELECT-matching blend, return Op, but if we need to expand, return
17825 // a null value.
17826 switch (VT.SimpleTy) {
17827 default:
17828 // Most of the vector types have blends past SSE4.1.
17829 return Op;
17830
17831 case MVT::v32i8:
17832 // The byte blends for AVX vectors were introduced only in AVX2.
17833 if (Subtarget.hasAVX2())
17834 return Op;
17835
17836 return SDValue();
17837
17838 case MVT::v8i16:
17839 case MVT::v16i16: {
17840 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
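// Each i16 condition element is a vector boolean (all-ones or all-zero on
// x86), so both of its bytes carry the same value and a byte-wise blend
// produces the same result.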
17841 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
17842 Cond = DAG.getBitcast(CastVT, Cond);
17843 LHS = DAG.getBitcast(CastVT, LHS);
17844 RHS = DAG.getBitcast(CastVT, RHS);
17845 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
17846 return DAG.getBitcast(VT, Select);
17847 }
17848 }
17849}
17850
17851static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
17852 MVT VT = Op.getSimpleValueType();
17853 SDValue Vec = Op.getOperand(0);
17854 SDValue Idx = Op.getOperand(1);
17855 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
17856 SDLoc dl(Op);
17857
17858 if (!Vec.getSimpleValueType().is128BitVector())
17859 return SDValue();
17860
17861 if (VT.getSizeInBits() == 8) {
17862 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
17863 // we're going to zero extend the register or fold the store.
17864 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
17865 !X86::mayFoldIntoStore(Op))
17866 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
17867 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17868 DAG.getBitcast(MVT::v4i32, Vec), Idx));
17869
17870 unsigned IdxVal = Idx->getAsZExtVal();
17871 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
17872 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17873 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
17874 }
17875
17876 if (VT == MVT::f32) {
17877 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
17878 // the result back to FR32 register. It's only worth matching if the
17879 // result has a single use which is a store or a bitcast to i32. And in
17880 // the case of a store, it's not worth it if the index is a constant 0,
17881 // because a MOVSSmr can be used instead, which is smaller and faster.
17882 if (!Op.hasOneUse())
17883 return SDValue();
17884 SDNode *User = *Op.getNode()->use_begin();
17885 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
17886 (User->getOpcode() != ISD::BITCAST ||
17887 User->getValueType(0) != MVT::i32))
17888 return SDValue();
17889 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
17890 DAG.getBitcast(MVT::v4i32, Vec), Idx);
17891 return DAG.getBitcast(MVT::f32, Extract);
17892 }
17893
17894 if (VT == MVT::i32 || VT == MVT::i64)
17895 return Op;
17896
17897 return SDValue();
17898}
17899
17900/// Extract one bit from mask vector, like v16i1 or v8i1.
17901/// AVX-512 feature.
17902static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
17903 const X86Subtarget &Subtarget) {
17904 SDValue Vec = Op.getOperand(0);
17905 SDLoc dl(Vec);
17906 MVT VecVT = Vec.getSimpleValueType();
17907 SDValue Idx = Op.getOperand(1);
17908 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17909 MVT EltVT = Op.getSimpleValueType();
17910
17911 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
17912 "Unexpected vector type in ExtractBitFromMaskVector");
17913
17914 // A variable index can't be handled in mask registers;
17915 // extend the vector to VR512/VR128.
17916 if (!IdxC) {
17917 unsigned NumElts = VecVT.getVectorNumElements();
17918 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
17919 // than extending to 128/256 bits.
17920 if (NumElts == 1) {
17921 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17922 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
17923 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
17924 }
17925 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
17926 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
17927 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
17928 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
17929 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
17930 }
17931
17932 unsigned IdxVal = IdxC->getZExtValue();
17933 if (IdxVal == 0) // the operation is legal
17934 return Op;
17935
17936 // Extend to natively supported kshift.
17937 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
17938
17939 // Use kshiftr instruction to move to the lower element.
17940 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
17941 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
17942
17943 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
17944 DAG.getIntPtrConstant(0, dl));
17945}
17946
17947// Helper to find all the extracted elements from a vector.
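// For example, if the only users of a v16i8 node are PEXTRB extractions of
// elements 0 and 3, the returned mask has just bits 0 and 3 set. Any user we
// cannot analyze makes the result conservatively all-ones.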
17948static APInt getExtractedDemandedElts(SDNode *N) {
17949 MVT VT = N->getSimpleValueType(0);
17950 unsigned NumElts = VT.getVectorNumElements();
17951 APInt DemandedElts = APInt::getZero(NumElts);
17952 for (SDNode *User : N->uses()) {
17953 switch (User->getOpcode()) {
17954 case X86ISD::PEXTRB:
17955 case X86ISD::PEXTRW:
17956 case ISD::EXTRACT_VECTOR_ELT:
17957 if (!isa<ConstantSDNode>(User->getOperand(1))) {
17958 DemandedElts.setAllBits();
17959 return DemandedElts;
17960 }
17961 DemandedElts.setBit(User->getConstantOperandVal(1));
17962 break;
17963 case ISD::BITCAST: {
17964 if (!User->getValueType(0).isSimple() ||
17965 !User->getValueType(0).isVector()) {
17966 DemandedElts.setAllBits();
17967 return DemandedElts;
17968 }
17969 APInt DemandedSrcElts = getExtractedDemandedElts(User);
17970 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
17971 break;
17972 }
17973 default:
17974 DemandedElts.setAllBits();
17975 return DemandedElts;
17976 }
17977 }
17978 return DemandedElts;
17979}
17980
17981SDValue
17982X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
17983 SelectionDAG &DAG) const {
17984 SDLoc dl(Op);
17985 SDValue Vec = Op.getOperand(0);
17986 MVT VecVT = Vec.getSimpleValueType();
17987 SDValue Idx = Op.getOperand(1);
17988 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
17989
17990 if (VecVT.getVectorElementType() == MVT::i1)
17991 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
17992
17993 if (!IdxC) {
17994 // It's more profitable to go through memory (1 cycle throughput)
17995 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
17996 // IACA tool was used to get performance estimation
17997 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
17998 //
17999 // example : extractelement <16 x i8> %a, i32 %i
18000 //
18001 // Block Throughput: 3.00 Cycles
18002 // Throughput Bottleneck: Port5
18003 //
18004 // | Num Of | Ports pressure in cycles | |
18005 // | Uops | 0 - DV | 5 | 6 | 7 | |
18006 // ---------------------------------------------
18007 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18008 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18009 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18010 // Total Num Of Uops: 4
18011 //
18012 //
18013 // Block Throughput: 1.00 Cycles
18014 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18015 //
18016 // | | Ports pressure in cycles | |
18017 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18018 // ---------------------------------------------------------
18019 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18020 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18021 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18022 // Total Num Of Uops: 4
18023
18024 return SDValue();
18025 }
18026
18027 unsigned IdxVal = IdxC->getZExtValue();
18028
18029 // If this is a 256-bit vector result, first extract the 128-bit vector and
18030 // then extract the element from the 128-bit vector.
18031 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18032 // Get the 128-bit vector.
18033 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18034 MVT EltVT = VecVT.getVectorElementType();
18035
18036 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18037 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18038
18039 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18040 // this can be done with a mask.
18041 IdxVal &= ElemsPerChunk - 1;
18042 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18043 DAG.getIntPtrConstant(IdxVal, dl));
18044 }
18045
18046 assert(VecVT.is128BitVector() && "Unexpected vector length");
18047
18048 MVT VT = Op.getSimpleValueType();
18049
18050 if (VT == MVT::i16) {
18051 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18052 // we're going to zero extend the register or fold the store (SSE41 only).
18053 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18054 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18055 if (Subtarget.hasFP16())
18056 return Op;
18057
18058 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18060 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18061 }
18062
18063 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18064 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18065 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18066 }
18067
18068 if (Subtarget.hasSSE41())
18069 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18070 return Res;
18071
18072 // Only extract a single element from a v16i8 source - determine the common
18073 // DWORD/WORD that all extractions share, and extract the sub-byte.
18074 // TODO: Add QWORD MOVQ extraction?
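// For example, extracting byte 5 of a v16i8 when bytes 4 and 5 are the only
// demanded elements becomes an i16 extract of word 2, a right shift by 8 and
// a truncate.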
18075 if (VT == MVT::i8) {
18076 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18077 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18078
18079 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18080 int DWordIdx = IdxVal / 4;
18081 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18082 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18083 DAG.getBitcast(MVT::v4i32, Vec),
18084 DAG.getIntPtrConstant(DWordIdx, dl));
18085 int ShiftVal = (IdxVal % 4) * 8;
18086 if (ShiftVal != 0)
18087 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18088 DAG.getConstant(ShiftVal, dl, MVT::i8));
18089 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18090 }
18091
18092 int WordIdx = IdxVal / 2;
18093 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18094 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18095 DAG.getBitcast(MVT::v8i16, Vec),
18096 DAG.getIntPtrConstant(WordIdx, dl));
18097 int ShiftVal = (IdxVal % 2) * 8;
18098 if (ShiftVal != 0)
18099 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18100 DAG.getConstant(ShiftVal, dl, MVT::i8));
18101 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18102 }
18103 }
18104
18105 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18106 if (IdxVal == 0)
18107 return Op;
18108
18109 // Shuffle the element to the lowest element, then movss or movsh.
18110 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18111 Mask[0] = static_cast<int>(IdxVal);
18112 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18113 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18114 DAG.getIntPtrConstant(0, dl));
18115 }
18116
18117 if (VT.getSizeInBits() == 64) {
18118 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18119 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18120 // to match extract_elt for f64.
18121 if (IdxVal == 0)
18122 return Op;
18123
18124 // UNPCKHPD the element to the lowest double word, then movsd.
18125 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18126 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18127 int Mask[2] = { 1, -1 };
18128 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18129 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18130 DAG.getIntPtrConstant(0, dl));
18131 }
18132
18133 return SDValue();
18134}
18135
18136/// Insert one bit to mask vector, like v16i1 or v8i1.
18137/// AVX-512 feature.
18138static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18139 const X86Subtarget &Subtarget) {
18140 SDLoc dl(Op);
18141 SDValue Vec = Op.getOperand(0);
18142 SDValue Elt = Op.getOperand(1);
18143 SDValue Idx = Op.getOperand(2);
18144 MVT VecVT = Vec.getSimpleValueType();
18145
18146 if (!isa<ConstantSDNode>(Idx)) {
18147 // Non-constant index: extend source and destination,
18148 // insert the element, and then truncate the result.
18149 unsigned NumElts = VecVT.getVectorNumElements();
18150 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18151 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18152 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18153 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18154 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18155 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18156 }
18157
18158 // Copy into a k-register, extract to v1i1 and insert_subvector.
18159 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18160 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18161}
18162
18163SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18164 SelectionDAG &DAG) const {
18165 MVT VT = Op.getSimpleValueType();
18166 MVT EltVT = VT.getVectorElementType();
18167 unsigned NumElts = VT.getVectorNumElements();
18168 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18169
18170 if (EltVT == MVT::i1)
18171 return InsertBitToMaskVector(Op, DAG, Subtarget);
18172
18173 SDLoc dl(Op);
18174 SDValue N0 = Op.getOperand(0);
18175 SDValue N1 = Op.getOperand(1);
18176 SDValue N2 = Op.getOperand(2);
18177 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18178
18179 if (EltVT == MVT::bf16) {
18180 MVT IVT = VT.changeVectorElementTypeToInteger();
18181 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18182 DAG.getBitcast(IVT, N0),
18183 DAG.getBitcast(MVT::i16, N1), N2);
18184 return DAG.getBitcast(VT, Res);
18185 }
18186
18187 if (!N2C) {
18188 // Variable insertion indices: usually we're better off spilling to the stack,
18189 // but AVX512 can use a variable compare+select by comparing against all
18190 // possible vector indices, and FP insertion has less gpr->simd traffic.
18191 if (!(Subtarget.hasBWI() ||
18192 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18193 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18194 return SDValue();
18195
18196 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18197 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18198 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18199 return SDValue();
18200
18201 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18202 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18203 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18204
18205 SmallVector<SDValue, 16> RawIndices;
18206 for (unsigned I = 0; I != NumElts; ++I)
18207 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18208 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18209
18210 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18211 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18212 ISD::CondCode::SETEQ);
18213 }
18214
18215 if (N2C->getAPIntValue().uge(NumElts))
18216 return SDValue();
18217 uint64_t IdxVal = N2C->getZExtValue();
18218
18219 bool IsZeroElt = X86::isZeroNode(N1);
18220 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18221
18222 if (IsZeroElt || IsAllOnesElt) {
18223 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18224 // We don't deal with i8 0 since it appears to be handled elsewhere.
18225 if (IsAllOnesElt &&
18226 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18227 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18228 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18229 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18230 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18231 CstVectorElts[IdxVal] = OnesCst;
18232 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18233 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18234 }
18235 // See if we can do this more efficiently with a blend shuffle with a
18236 // rematerializable vector.
18237 if (Subtarget.hasSSE41() &&
18238 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18239 SmallVector<int, 8> BlendMask;
18240 for (unsigned i = 0; i != NumElts; ++i)
18241 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18242 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18243 : getOnesVector(VT, DAG, dl);
18244 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18245 }
18246 }
18247
18248 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18249 // into that, and then insert the subvector back into the result.
18250 if (VT.is256BitVector() || VT.is512BitVector()) {
18251 // With a 256-bit vector, we can insert into the zero element efficiently
18252 // using a blend if we have AVX or AVX2 and the right data type.
18253 if (VT.is256BitVector() && IdxVal == 0) {
18254 // TODO: It is worthwhile to cast integer to floating point and back
18255 // and incur a domain crossing penalty if that's what we'll end up
18256 // doing anyway after extracting to a 128-bit vector.
18257 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18258 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18259 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18260 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18261 DAG.getTargetConstant(1, dl, MVT::i8));
18262 }
18263 }
18264
18265 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18266 assert(isPowerOf2_32(NumEltsIn128) &&
18267 "Vectors will always have power-of-two number of elements.");
18268
18269 // If we are not inserting into the low 128-bit vector chunk,
18270 // then prefer the broadcast+blend sequence.
18271 // FIXME: relax the profitability check iff all N1 uses are insertions.
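// e.g. on AVX2, inserting into element 9 of a v16i16 becomes a broadcast of
// the scalar plus a single blend, avoiding an extract/insert of the upper
// 128-bit chunk.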
18272 if (IdxVal >= NumEltsIn128 &&
18273 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18274 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18275 X86::mayFoldLoad(N1, Subtarget)))) {
18276 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18277 SmallVector<int, 8> BlendMask;
18278 for (unsigned i = 0; i != NumElts; ++i)
18279 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18280 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18281 }
18282
18283 // Get the desired 128-bit vector chunk.
18284 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18285
18286 // Insert the element into the desired chunk.
18287 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18288 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18289
18290 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18291 DAG.getIntPtrConstant(IdxIn128, dl));
18292
18293 // Insert the changed part back into the bigger vector
18294 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18295 }
18296 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18297
18298 // This will be just movw/movd/movq/movsh/movss/movsd.
18299 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18300 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18301 EltVT == MVT::f16 || EltVT == MVT::i64) {
18302 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18303 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18304 }
18305
18306 // We can't directly insert an i8 or i16 into a vector, so zero extend
18307 // it to i32 first.
18308 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18309 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18310 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18311 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18312 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18313 return DAG.getBitcast(VT, N1);
18314 }
18315 }
18316
18317 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18318 // argument. SSE41 required for pinsrb.
18319 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18320 unsigned Opc;
18321 if (VT == MVT::v8i16) {
18322 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18323 Opc = X86ISD::PINSRW;
18324 } else {
18325 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18326 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18327 Opc = X86ISD::PINSRB;
18328 }
18329
18330 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18331 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18332 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18333 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18334 }
18335
18336 if (Subtarget.hasSSE41()) {
18337 if (EltVT == MVT::f32) {
18338 // Bits [7:6] of the constant are the source select. This will always be
18339 // zero here. The DAG Combiner may combine an extract_elt index into
18340 // these bits. For example (insert (extract, 3), 2) could be matched by
18341 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18342 // Bits [5:4] of the constant are the destination select. This is the
18343 // value of the incoming immediate.
18344 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18345 // combine either bitwise AND or insert of float 0.0 to set these bits.
18346
18347 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18348 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18349 // If this is an insertion of 32-bits into the low 32-bits of
18350 // a vector, we prefer to generate a blend with immediate rather
18351 // than an insertps. Blends are simpler operations in hardware and so
18352 // will always have equal or better performance than insertps.
18353 // But if optimizing for size and there's a load folding opportunity,
18354 // generate insertps because blendps does not have a 32-bit memory
18355 // operand form.
18356 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18357 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18358 DAG.getTargetConstant(1, dl, MVT::i8));
18359 }
18360 // Create this as a scalar to vector.
18361 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18362 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18363 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18364 }
18365
18366 // PINSR* works with constant index.
18367 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18368 return Op;
18369 }
18370
18371 return SDValue();
18372}
18373
18374static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18375 SelectionDAG &DAG) {
18376 SDLoc dl(Op);
18377 MVT OpVT = Op.getSimpleValueType();
18378
18379 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
18380 // further combines.
18381 if (X86::isZeroNode(Op.getOperand(0)))
18382 return getZeroVector(OpVT, Subtarget, DAG, dl);
18383
18384 // If this is a 256-bit vector result, first insert into a 128-bit
18385 // vector and then insert into the 256-bit vector.
18386 if (!OpVT.is128BitVector()) {
18387 // Insert into a 128-bit vector.
18388 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18389 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18390 OpVT.getVectorNumElements() / SizeFactor);
18391
18392 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18393
18394 // Insert the 128-bit vector.
18395 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18396 }
18397 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18398 "Expected an SSE type!");
18399
18400 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18401 // tblgen.
18402 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18403 return Op;
18404
18405 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18406 return DAG.getBitcast(
18407 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18408}
18409
18410// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18411// simple superregister reference or explicit instructions to insert
18412// the upper bits of a vector.
18413static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18414 SelectionDAG &DAG) {
18415 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18416
18417 return insert1BitVector(Op, DAG, Subtarget);
18418}
18419
18420static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18421 SelectionDAG &DAG) {
18422 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18423 "Only vXi1 extract_subvectors need custom lowering");
18424
18425 SDLoc dl(Op);
18426 SDValue Vec = Op.getOperand(0);
18427 uint64_t IdxVal = Op.getConstantOperandVal(1);
18428
18429 if (IdxVal == 0) // the operation is legal
18430 return Op;
18431
18432 // Extend to natively supported kshift.
18433 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18434
18435 // Shift to the LSB.
18436 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18437 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18438
18439 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18440 DAG.getIntPtrConstant(0, dl));
18441}
18442
18443// Returns the appropriate wrapper opcode for a global reference.
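// X86ISD::WrapperRIP marks addresses that must be formed RIP-relatively
// (e.g. GOTPCREL references); plain X86ISD::Wrapper is used everywhere else,
// including for absolute symbols, which are never PC-relative.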
18444unsigned X86TargetLowering::getGlobalWrapperKind(
18445 const GlobalValue *GV, const unsigned char OpFlags) const {
18446 // References to absolute symbols are never PC-relative.
18447 if (GV && GV->isAbsoluteSymbolRef())
18448 return X86ISD::Wrapper;
18449
18450 // The following OpFlags under RIP-rel PIC use RIP.
18451 if (Subtarget.isPICStyleRIPRel() &&
18452 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18453 OpFlags == X86II::MO_DLLIMPORT))
18454 return X86ISD::WrapperRIP;
18455
18456 // GOTPCREL references must always use RIP.
18457 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18458 return X86ISD::WrapperRIP;
18459
18460 return X86ISD::Wrapper;
18461}
18462
18463 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18464 // their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18465 // one of the above-mentioned nodes. It has to be wrapped because otherwise
18466 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18467 // be used to form an addressing mode. These wrapped nodes will be selected
18468 // into MOV32ri.
18469SDValue
18470X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18471 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18472
18473 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18474 // global base reg.
18475 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18476
18477 auto PtrVT = getPointerTy(DAG.getDataLayout());
18478 SDValue Result = DAG.getTargetConstantPool(
18479 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18480 SDLoc DL(CP);
18481 Result =
18482 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18483 // With PIC, the address is actually $g + Offset.
18484 if (OpFlag) {
18485 Result =
18486 DAG.getNode(ISD::ADD, DL, PtrVT,
18487 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18488 }
18489
18490 return Result;
18491}
18492
18493SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18494 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18495
18496 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18497 // global base reg.
18498 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18499
18500 auto PtrVT = getPointerTy(DAG.getDataLayout());
18501 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18502 SDLoc DL(JT);
18503 Result =
18504 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18505
18506 // With PIC, the address is actually $g + Offset.
18507 if (OpFlag)
18508 Result =
18509 DAG.getNode(ISD::ADD, DL, PtrVT,
18510 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18511
18512 return Result;
18513}
18514
18515SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18516 SelectionDAG &DAG) const {
18517 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18518}
18519
18520SDValue
18521X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18522 // Create the TargetBlockAddress node.
18523 unsigned char OpFlags =
18524 Subtarget.classifyBlockAddressReference();
18525 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18526 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18527 SDLoc dl(Op);
18528 auto PtrVT = getPointerTy(DAG.getDataLayout());
18529 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18530 Result =
18531 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18532
18533 // With PIC, the address is actually $g + Offset.
18534 if (isGlobalRelativeToPICBase(OpFlags)) {
18535 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18536 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18537 }
18538
18539 return Result;
18540}
18541
18542/// Creates target global address or external symbol nodes for calls or
18543/// other uses.
18544SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18545 bool ForCall) const {
18546 // Unpack the global address or external symbol.
18547 const SDLoc &dl = SDLoc(Op);
18548 const GlobalValue *GV = nullptr;
18549 int64_t Offset = 0;
18550 const char *ExternalSym = nullptr;
18551 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18552 GV = G->getGlobal();
18553 Offset = G->getOffset();
18554 } else {
18555 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18556 ExternalSym = ES->getSymbol();
18557 }
18558
18559 // Calculate some flags for address lowering.
18560 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18561 unsigned char OpFlags;
18562 if (ForCall)
18563 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18564 else
18565 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18566 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18567 bool NeedsLoad = isGlobalStubReference(OpFlags);
18568
18569 CodeModel::Model M = DAG.getTarget().getCodeModel();
18570 auto PtrVT = getPointerTy(DAG.getDataLayout());
18571 SDValue Result;
18572
18573 if (GV) {
18574 // Create a target global address if this is a global. If possible, fold the
18575 // offset into the global address reference. Otherwise, ADD it on later.
18576 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18577 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18578 // relocation will compute to a negative value, which is invalid.
18579 int64_t GlobalOffset = 0;
18580 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18581 X86::isOffsetSuitableForCodeModel(Offset, M)) {
18582 std::swap(GlobalOffset, Offset);
18583 }
18584 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18585 } else {
18586 // If this is not a global address, this must be an external symbol.
18587 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18588 }
18589
18590 // If this is a direct call, avoid the wrapper if we don't need to do any
18591 // loads or adds. This allows SDAG ISel to match direct calls.
18592 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18593 return Result;
18594
18595 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18596
18597 // With PIC, the address is actually $g + Offset.
18598 if (HasPICReg) {
18599 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18600 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18601 }
18602
18603 // For globals that require a load from a stub to get the address, emit the
18604 // load.
18605 if (NeedsLoad)
18606 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18607 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18608
18609 // If there was a non-zero offset that we didn't fold, create an explicit
18610 // addition for it.
18611 if (Offset != 0)
18612 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
18613 DAG.getConstant(Offset, dl, PtrVT));
18614
18615 return Result;
18616}
18617
18618SDValue
18619X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
18620 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18621}
18622
18623static SDValue
18624GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
18625 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
18626 unsigned char OperandFlags, bool LocalDynamic = false) {
18627 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18628 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18629 SDLoc dl(GA);
18630 SDValue TGA;
18631 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
18632 if (LocalDynamic && UseTLSDESC) {
18633 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
18634 auto UI = TGA->use_begin();
18635 // Reuse existing GetTLSADDR node if we can find it.
18636 if (UI != TGA->use_end())
18637 return SDValue(*UI->use_begin()->use_begin(), 0);
18638 } else {
18639 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18640 GA->getOffset(), OperandFlags);
18641 }
18642
18643 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
18644 : LocalDynamic ? X86ISD::TLSBASEADDR
18645 : X86ISD::TLSADDR;
18646
18647 if (InGlue) {
18648 SDValue Ops[] = { Chain, TGA, *InGlue };
18649 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18650 } else {
18651 SDValue Ops[] = { Chain, TGA };
18652 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
18653 }
18654
18655 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
18656 MFI.setAdjustsStack(true);
18657 MFI.setHasCalls(true);
18658
18659 SDValue Glue = Chain.getValue(1);
18660 SDValue Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
18661
18662 if (!UseTLSDESC)
18663 return Ret;
18664
18665 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
18666 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
18667
18668 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
18669 SDValue Offset =
18670 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18671 MachinePointerInfo(Ptr));
18672 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
18673}
18674
18675// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
18676static SDValue
18677LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18678 const EVT PtrVT) {
18679 SDValue InGlue;
18680 SDLoc dl(GA); // ? function entry point might be better
18681 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18682 DAG.getNode(X86ISD::GlobalBaseReg,
18683 SDLoc(), PtrVT), InGlue);
18684 InGlue = Chain.getValue(1);
18685
18686 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
18687}
18688
18689// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
18690static SDValue
18691LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18692 const EVT PtrVT) {
18693 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18694 X86::RAX, X86II::MO_TLSGD);
18695}
18696
18697// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
18698static SDValue
18699LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18700 const EVT PtrVT) {
18701 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
18702 X86::EAX, X86II::MO_TLSGD);
18703}
18704
18705static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
18706 SelectionDAG &DAG, const EVT PtrVT,
18707 bool Is64Bit, bool Is64BitLP64) {
18708 SDLoc dl(GA);
18709
18710 // Get the start address of the TLS block for this module.
18714
18715 SDValue Base;
18716 if (Is64Bit) {
18717 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
18718 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
18719 X86II::MO_TLSLD, /*LocalDynamic=*/true);
18720 } else {
18721 SDValue InGlue;
18722 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
18723 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
18724 InGlue = Chain.getValue(1);
18725 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
18726 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
18727 }
18728
18729 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
18730 // of Base.
18731
18732 // Build x@dtpoff.
18733 unsigned char OperandFlags = X86II::MO_DTPOFF;
18734 unsigned WrapperKind = X86ISD::Wrapper;
18735 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18736 GA->getValueType(0),
18737 GA->getOffset(), OperandFlags);
18738 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18739
18740 // Add x@dtpoff with the base.
18741 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
18742}
18743
18744// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
18745static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
18746 const EVT PtrVT, TLSModel::Model model,
18747 bool is64Bit, bool isPIC) {
18748 SDLoc dl(GA);
18749
18750 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
18751 Value *Ptr = Constant::getNullValue(
18752 PointerType::get(*DAG.getContext(), is64Bit ? 257 : 256));
18753
18754 SDValue ThreadPointer =
18755 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
18756 MachinePointerInfo(Ptr));
18757
18758 unsigned char OperandFlags = 0;
18759 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
18760 // initialexec.
18761 unsigned WrapperKind = X86ISD::Wrapper;
18762 if (model == TLSModel::LocalExec) {
18763 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
18764 } else if (model == TLSModel::InitialExec) {
18765 if (is64Bit) {
18766 OperandFlags = X86II::MO_GOTTPOFF;
18767 WrapperKind = X86ISD::WrapperRIP;
18768 } else {
18769 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
18770 }
18771 } else {
18772 llvm_unreachable("Unexpected model");
18773 }
18774
18775 // emit "addl x@ntpoff,%eax" (local exec)
18776 // or "addl x@indntpoff,%eax" (initial exec)
18777 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
18778 SDValue TGA =
18779 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
18780 GA->getOffset(), OperandFlags);
18781 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
18782
18783 if (model == TLSModel::InitialExec) {
18784 if (isPIC && !is64Bit) {
18785 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
18786 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18787 Offset);
18788 }
18789
18790 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
18791 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18792 }
18793
18794 // The address of the thread local variable is the add of the thread
18795 // pointer with the offset of the variable.
18796 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
18797}
18798
18799SDValue
18800X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
18801
18802 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
18803
18804 if (DAG.getTarget().useEmulatedTLS())
18805 return LowerToTLSEmulatedModel(GA, DAG);
18806
18807 const GlobalValue *GV = GA->getGlobal();
18808 auto PtrVT = getPointerTy(DAG.getDataLayout());
18809 bool PositionIndependent = isPositionIndependent();
18810
18811 if (Subtarget.isTargetELF()) {
18812 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
18813 switch (model) {
18814 case TLSModel::GeneralDynamic:
18815 if (Subtarget.is64Bit()) {
18816 if (Subtarget.isTarget64BitLP64())
18817 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
18818 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
18819 }
18820 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
18821 case TLSModel::LocalDynamic:
18822 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
18823 Subtarget.isTarget64BitLP64());
18824 case TLSModel::InitialExec:
18825 case TLSModel::LocalExec:
18826 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
18827 PositionIndependent);
18828 }
18829 llvm_unreachable("Unknown TLS model.");
18830 }
18831
18832 if (Subtarget.isTargetDarwin()) {
18833 // Darwin only has one model of TLS. Lower to that.
18834 unsigned char OpFlag = 0;
18835 unsigned WrapperKind = 0;
18836
18837 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18838 // global base reg.
18839 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
18840 if (PIC32) {
18841 OpFlag = X86II::MO_TLVP_PIC_BASE;
18842 WrapperKind = X86ISD::Wrapper;
18843 } else {
18844 OpFlag = X86II::MO_TLVP;
18845 WrapperKind = X86ISD::WrapperRIP;
18846 }
18847 SDLoc DL(Op);
18848 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
18849 GA->getValueType(0),
18850 GA->getOffset(), OpFlag);
18851 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
18852
18853 // With PIC32, the address is actually $g + Offset.
18854 if (PIC32)
18855 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
18856 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
18857 Offset);
18858
18859 // Lowering the machine isd will make sure everything is in the right
18860 // location.
18861 SDValue Chain = DAG.getEntryNode();
18862 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18863 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
18864 SDValue Args[] = { Chain, Offset };
18865 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
18866 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
18867
18868 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
18869 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
18870 MFI.setAdjustsStack(true);
18871
18872 // And our return value (tls address) is in the standard call return value
18873 // location.
18874 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
18875 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
18876 }
18877
18878 if (Subtarget.isOSWindows()) {
18879 // Just use the implicit TLS architecture
18880 // Need to generate something similar to:
18881 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
18882 // ; from TEB
18883 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
18884 // mov rcx, qword [rdx+rcx*8]
18885 // mov eax, .tls$:tlsvar
18886 // [rax+rcx] contains the address
18887 // Windows 64bit: gs:0x58
18888 // Windows 32bit: fs:__tls_array
18889
18890 SDLoc dl(GA);
18891 SDValue Chain = DAG.getEntryNode();
18892
18893 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
18894 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
18895 // use its literal value of 0x2C.
18896 Value *Ptr = Constant::getNullValue(
18897 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), 256)
18898 : PointerType::get(*DAG.getContext(), 257));
18899
18900 SDValue TlsArray = Subtarget.is64Bit()
18901 ? DAG.getIntPtrConstant(0x58, dl)
18902 : (Subtarget.isTargetWindowsGNU()
18903 ? DAG.getIntPtrConstant(0x2C, dl)
18904 : DAG.getExternalSymbol("_tls_array", PtrVT));
18905
18906 SDValue ThreadPointer =
18907 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
18908
18909 SDValue res;
18910 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
18911 res = ThreadPointer;
18912 } else {
18913 // Load the _tls_index variable
18914 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
18915 if (Subtarget.is64Bit())
18916 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
18917 MachinePointerInfo(), MVT::i32);
18918 else
18919 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
18920
18921 const DataLayout &DL = DAG.getDataLayout();
18922 SDValue Scale =
18923 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
18924 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
18925
18926 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
18927 }
18928
18929 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
18930
18931 // Get the offset of start of .tls section
18932 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
18933 GA->getValueType(0),
18934 GA->getOffset(), X86II::MO_SECREL);
18935 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
18936
18937 // The address of the thread local variable is the add of the thread
18938 // pointer with the offset of the variable.
18939 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
18940 }
18941
18942 llvm_unreachable("TLS not implemented for this target.");
18943}
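// Illustrative standalone sketch of the Windows implicit-TLS address
// computation that the isOSWindows() block above emits. The names
// (ThreadLocalStoragePointer, tls_index, var_secrel) are made up for
// illustration; they stand for the TEB slot loaded from gs:0x58 (or
// fs:__tls_array / fs:0x2C on MinGW), the value of _tls_index, and the
// variable's x@SECREL32 offset into the .tls section.
#include <cstddef>

static char *implicit_tls_address(char **ThreadLocalStoragePointer, // [gs:0x58]
                                  unsigned tls_index,               // [_tls_index]
                                  std::size_t var_secrel) {         // .tls$:tlsvar
  char *module_tls_base = ThreadLocalStoragePointer[tls_index]; // mov rcx, [rdx+rcx*8]
  return module_tls_base + var_secrel;                          // [rax+rcx]
}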
18944
18945bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18946 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18947 const TargetMachine &TM = getTargetMachine();
18948 TLSModel::Model Model = TM.getTLSModel(&GV);
18949 switch (Model) {
18950 case TLSModel::LocalExec:
18951 case TLSModel::InitialExec:
18952 // We can include the %fs segment register in addressing modes.
18953 return true;
18954 case TLSModel::GeneralDynamic:
18955 case TLSModel::LocalDynamic:
18956 // These models do not result in %fs-relative addresses unless
18957 // TLS descriptors are used.
18958 //
18959 // Even in the case of TLS descriptors, we currently have no way to model
18960 // the difference between the %fs access and the computations needed for
18961 // the offset, and returning `true` for TLS-desc currently duplicates
18962 // both, which is detrimental :-/
18963 return false;
18964 }
18965 }
18966 return false;
18967}
18968
18969/// Lower SRA_PARTS and friends, which return two i32 values
18970/// and take a 2 x i32 value to shift plus a shift amount.
18971/// TODO: Can this be moved to general expansion code?
18972static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
18973 SDValue Lo, Hi;
18974 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
18975 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
18976}
18977
18978// Try to use a packed vector operation to handle i64 on 32-bit targets when
18979// AVX512DQ is enabled.
18980static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
18981 SelectionDAG &DAG,
18982 const X86Subtarget &Subtarget) {
18983 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
18984 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
18985 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
18986 Op.getOpcode() == ISD::UINT_TO_FP) &&
18987 "Unexpected opcode!");
18988 bool IsStrict = Op->isStrictFPOpcode();
18989 unsigned OpNo = IsStrict ? 1 : 0;
18990 SDValue Src = Op.getOperand(OpNo);
18991 MVT SrcVT = Src.getSimpleValueType();
18992 MVT VT = Op.getSimpleValueType();
18993
18994 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
18995 (VT != MVT::f32 && VT != MVT::f64))
18996 return SDValue();
18997
18998 // Pack the i64 into a vector, do the operation and extract.
18999
19000 // Using 256-bit to ensure result is 128-bits for f32 case.
19001 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19002 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19003 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19004
19005 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19006 if (IsStrict) {
19007 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19008 {Op.getOperand(0), InVec});
19009 SDValue Chain = CvtVec.getValue(1);
19010 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19011 DAG.getIntPtrConstant(0, dl));
19012 return DAG.getMergeValues({Value, Chain}, dl);
19013 }
19014
19015 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19016
19017 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19018 DAG.getIntPtrConstant(0, dl));
19019}
19020
19021// Try to use a packed vector operation to handle i64 on 32-bit targets.
19022static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19023 const X86Subtarget &Subtarget) {
19024 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19025 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19026 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19027 Op.getOpcode() == ISD::UINT_TO_FP) &&
19028 "Unexpected opcode!");
19029 bool IsStrict = Op->isStrictFPOpcode();
19030 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19031 MVT SrcVT = Src.getSimpleValueType();
19032 MVT VT = Op.getSimpleValueType();
19033
19034 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19035 return SDValue();
19036
19037 // Pack the i64 into a vector, do the operation and extract.
19038
19039 assert(Subtarget.hasFP16() && "Expected FP16");
19040
19041 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19042 if (IsStrict) {
19043 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19044 {Op.getOperand(0), InVec});
19045 SDValue Chain = CvtVec.getValue(1);
19046 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19047 DAG.getIntPtrConstant(0, dl));
19048 return DAG.getMergeValues({Value, Chain}, dl);
19049 }
19050
19051 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19052
19053 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19054 DAG.getIntPtrConstant(0, dl));
19055}
19056
19057static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19058 const X86Subtarget &Subtarget) {
19059 switch (Opcode) {
19060 case ISD::SINT_TO_FP:
19061 // TODO: Handle wider types with AVX/AVX512.
19062 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19063 return false;
19064 // CVTDQ2PS or (V)CVTDQ2PD
19065 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19066
19067 case ISD::UINT_TO_FP:
19068 // TODO: Handle wider types and i64 elements.
19069 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19070 return false;
19071 // VCVTUDQ2PS or VCVTUDQ2PD
19072 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19073
19074 default:
19075 return false;
19076 }
19077}
19078
19079/// Given a scalar cast operation that is extracted from a vector, try to
19080/// vectorize the cast op followed by extraction. This will avoid an expensive
19081/// round-trip between XMM and GPR.
19082static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19083 SelectionDAG &DAG,
19084 const X86Subtarget &Subtarget) {
19085 // TODO: This could be enhanced to handle smaller integer types by peeking
19086 // through an extend.
19087 SDValue Extract = Cast.getOperand(0);
19088 MVT DestVT = Cast.getSimpleValueType();
19089 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19090 !isa<ConstantSDNode>(Extract.getOperand(1)))
19091 return SDValue();
19092
19093 // See if we have a 128-bit vector cast op for this type of cast.
19094 SDValue VecOp = Extract.getOperand(0);
19095 MVT FromVT = VecOp.getSimpleValueType();
19096 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19097 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19098 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19099 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19100 return SDValue();
19101
19102 // If we are extracting from a non-zero element, first shuffle the source
19103 // vector to allow extracting from element zero.
19104 if (!isNullConstant(Extract.getOperand(1))) {
19105 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19106 Mask[0] = Extract.getConstantOperandVal(1);
19107 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19108 }
19109 // If the source vector is wider than 128-bits, extract the low part. Do not
19110 // create an unnecessarily wide vector cast op.
19111 if (FromVT != Vec128VT)
19112 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19113
19114 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19115 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19116 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19117 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19118 DAG.getIntPtrConstant(0, DL));
19119}
19120
19121/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19122/// try to vectorize the cast ops. This will avoid an expensive round-trip
19123/// between XMM and GPR.
19124static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19125 SelectionDAG &DAG,
19126 const X86Subtarget &Subtarget) {
19127 // TODO: Allow FP_TO_UINT.
19128 SDValue CastToInt = CastToFP.getOperand(0);
19129 MVT VT = CastToFP.getSimpleValueType();
19130 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19131 return SDValue();
19132
19133 MVT IntVT = CastToInt.getSimpleValueType();
19134 SDValue X = CastToInt.getOperand(0);
19135 MVT SrcVT = X.getSimpleValueType();
19136 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19137 return SDValue();
19138
19139 // See if we have 128-bit vector cast instructions for this type of cast.
19140 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19141 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19142 IntVT != MVT::i32)
19143 return SDValue();
19144
19145 unsigned SrcSize = SrcVT.getSizeInBits();
19146 unsigned IntSize = IntVT.getSizeInBits();
19147 unsigned VTSize = VT.getSizeInBits();
19148 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19149 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19150 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19151
19152 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19153 unsigned ToIntOpcode =
19154 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19155 unsigned ToFPOpcode =
19156 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19157
19158 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19159 //
19160 // We are not defining the high elements (for example, zero them) because
19161 // that could nullify any performance advantage that we hoped to gain from
19162 // this vector op hack. We do not expect any adverse effects (like denorm
19163 // penalties) with cast ops.
19164 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19165 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19166 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19167 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19168 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19169}
19170
19171static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19172 SelectionDAG &DAG,
19173 const X86Subtarget &Subtarget) {
19174 bool IsStrict = Op->isStrictFPOpcode();
19175 MVT VT = Op->getSimpleValueType(0);
19176 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19177
19178 if (Subtarget.hasDQI()) {
19179 assert(!Subtarget.hasVLX() && "Unexpected features");
19180
19181 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19182 Src.getSimpleValueType() == MVT::v4i64) &&
19183 "Unsupported custom type");
19184
19185 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19186 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19187 "Unexpected VT!");
19188 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19189
19190 // Need to concat with zero vector for strict fp to avoid spurious
19191 // exceptions.
19192 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19193 : DAG.getUNDEF(MVT::v8i64);
19194 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19195 DAG.getIntPtrConstant(0, DL));
19196 SDValue Res, Chain;
19197 if (IsStrict) {
19198 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19199 {Op->getOperand(0), Src});
19200 Chain = Res.getValue(1);
19201 } else {
19202 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19203 }
19204
19205 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19206 DAG.getIntPtrConstant(0, DL));
19207
19208 if (IsStrict)
19209 return DAG.getMergeValues({Res, Chain}, DL);
19210 return Res;
19211 }
19212
19213 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19214 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19215 if (VT != MVT::v4f32 || IsSigned)
19216 return SDValue();
19217
19218 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19219 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19220 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19221 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19222 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19223 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19224 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19225 SmallVector<SDValue, 4> SignCvts(4);
19226 SmallVector<SDValue, 4> Chains(4);
19227 for (int i = 0; i != 4; ++i) {
19228 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19229 DAG.getIntPtrConstant(i, DL));
19230 if (IsStrict) {
19231 SignCvts[i] =
19232 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19233 {Op.getOperand(0), Elt});
19234 Chains[i] = SignCvts[i].getValue(1);
19235 } else {
19236 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19237 }
19238 }
19239 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19240
19241 SDValue Slow, Chain;
19242 if (IsStrict) {
19243 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19244 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19245 {Chain, SignCvt, SignCvt});
19246 Chain = Slow.getValue(1);
19247 } else {
19248 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19249 }
19250
19251 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19252 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19253
19254 if (IsStrict)
19255 return DAG.getMergeValues({Cvt, Chain}, DL);
19256
19257 return Cvt;
19258}
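// Scalar model of the unsigned v4i64 -> v4f32 path above, assuming IEEE-754
// float: when the sign bit is set, the value is halved with the low bit
// folded back in (Sign = (Src >> 1) | (Src & 1)), converted as signed, and
// then doubled (the FADD of SignCvt with itself). Folding the low bit in
// preserves the sticky information lost by the halving, so the doubled
// result is still correctly rounded. Helper and test names are illustrative.
#include <cstdint>
#include <cstdio>

static float u64_to_f32_via_signed(uint64_t x) {
  if ((int64_t)x >= 0)
    return (float)(int64_t)x;            // small enough for a signed convert
  uint64_t halved = (x >> 1) | (x & 1);  // Sign = (Src >> 1) | (Src & 1)
  float f = (float)(int64_t)halved;      // SignCvt
  return f + f;                          // Slow = SignCvt + SignCvt
}

int main() {
  const uint64_t tests[] = {0, 1, 0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL};
  for (uint64_t v : tests)
    std::printf("%llu -> %a\n", (unsigned long long)v, u64_to_f32_via_signed(v));
  return 0;
}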
19259
19260static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19261 SelectionDAG &DAG) {
19262 bool IsStrict = Op->isStrictFPOpcode();
19263 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19264 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19265 MVT VT = Op.getSimpleValueType();
19266 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19267
19268 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
19269 if (IsStrict)
19270 return DAG.getNode(
19271 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19272 {Chain,
19273 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19274 Rnd});
19275 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19276 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19277}
19278
19279static bool isLegalConversion(MVT VT, bool IsSigned,
19280 const X86Subtarget &Subtarget) {
19281 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19282 return true;
19283 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19284 return true;
19285 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19286 return true;
19287 if (Subtarget.useAVX512Regs()) {
19288 if (VT == MVT::v16i32)
19289 return true;
19290 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19291 return true;
19292 }
19293 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19294 (VT == MVT::v2i64 || VT == MVT::v4i64))
19295 return true;
19296 return false;
19297}
19298
19299SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19300 SelectionDAG &DAG) const {
19301 bool IsStrict = Op->isStrictFPOpcode();
19302 unsigned OpNo = IsStrict ? 1 : 0;
19303 SDValue Src = Op.getOperand(OpNo);
19304 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19305 MVT SrcVT = Src.getSimpleValueType();
19306 MVT VT = Op.getSimpleValueType();
19307 SDLoc dl(Op);
19308
19309 if (isSoftF16(VT, Subtarget))
19310 return promoteXINT_TO_FP(Op, dl, DAG);
19311 else if (isLegalConversion(SrcVT, true, Subtarget))
19312 return Op;
19313
19314 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19315 return LowerWin64_INT128_TO_FP(Op, DAG);
19316
19317 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19318 return Extract;
19319
19320 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19321 return R;
19322
19323 if (SrcVT.isVector()) {
19324 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19325 // Note: since v2f64 is a legal type, we don't need to zero extend the
19326 // source for strict FP.
19327 if (IsStrict)
19328 return DAG.getNode(
19329 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19330 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19331 DAG.getUNDEF(SrcVT))});
19332 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19333 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19334 DAG.getUNDEF(SrcVT)));
19335 }
19336 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19337 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19338
19339 return SDValue();
19340 }
19341
19342 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19343 "Unknown SINT_TO_FP to lower!");
19344
19345 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19346
19347 // These are really Legal; return the operand so the caller accepts it as
19348 // Legal.
19349 if (SrcVT == MVT::i32 && UseSSEReg)
19350 return Op;
19351 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19352 return Op;
19353
19354 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19355 return V;
19356 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19357 return V;
19358
19359 // SSE doesn't have an i16 conversion so we need to promote.
19360 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19361 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19362 if (IsStrict)
19363 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19364 {Chain, Ext});
19365
19366 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19367 }
19368
19369 if (VT == MVT::f128 || !Subtarget.hasX87())
19370 return SDValue();
19371
19372 SDValue ValueToStore = Src;
19373 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19374 // Bitcasting to f64 here allows us to do a single 64-bit store from
19375 // an SSE register, avoiding the store forwarding penalty that would come
19376 // with two 32-bit stores.
19377 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19378
19379 unsigned Size = SrcVT.getStoreSize();
19380 Align Alignment(Size);
19382 auto PtrVT = getPointerTy(MF.getDataLayout());
19383 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19384 MachinePointerInfo MPI =
19386 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19387 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19388 std::pair<SDValue, SDValue> Tmp =
19389 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19390
19391 if (IsStrict)
19392 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19393
19394 return Tmp.first;
19395}
19396
19397std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19398 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19399 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19400 // Build the FILD
19401 SDVTList Tys;
19402 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19403 if (useSSE)
19404 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19405 else
19406 Tys = DAG.getVTList(DstVT, MVT::Other);
19407
19408 SDValue FILDOps[] = {Chain, Pointer};
19409 SDValue Result =
19410 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19411 Alignment, MachineMemOperand::MOLoad);
19412 Chain = Result.getValue(1);
19413
19414 if (useSSE) {
19416 unsigned SSFISize = DstVT.getStoreSize();
19417 int SSFI =
19418 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19419 auto PtrVT = getPointerTy(MF.getDataLayout());
19420 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19421 Tys = DAG.getVTList(MVT::Other);
19422 SDValue FSTOps[] = {Chain, Result, StackSlot};
19425 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19426
19427 Chain =
19428 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19429 Result = DAG.getLoad(
19430 DstVT, DL, Chain, StackSlot,
19432 Chain = Result.getValue(1);
19433 }
19434
19435 return { Result, Chain };
19436}
19437
19438/// Horizontal vector math instructions may be slower than normal math with
19439/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19440/// implementation, and likely shuffle complexity of the alternate sequence.
19441static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19442 const X86Subtarget &Subtarget) {
19443 bool IsOptimizingSize = DAG.shouldOptForSize();
19444 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19445 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19446}
19447
19448/// 64-bit unsigned integer to double expansion.
19449static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19450 SelectionDAG &DAG,
19451 const X86Subtarget &Subtarget) {
19452 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19453 // when converting 0 while rounding toward negative infinity. The caller will
19454 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
19455 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19456 // This algorithm is not obvious. Here it is what we're trying to output:
19457 /*
19458 movq %rax, %xmm0
19459 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19460 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19461 #ifdef __SSE3__
19462 haddpd %xmm0, %xmm0
19463 #else
19464 pshufd $0x4e, %xmm0, %xmm1
19465 addpd %xmm1, %xmm0
19466 #endif
19467 */
19468
19470
19471 // Build some magic constants.
19472 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19474 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19475 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19476
19478 CV1.push_back(
19479 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19480 APInt(64, 0x4330000000000000ULL))));
19481 CV1.push_back(
19482 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19483 APInt(64, 0x4530000000000000ULL))));
19484 Constant *C1 = ConstantVector::get(CV1);
19485 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19486
19487 // Load the 64-bit value into an XMM register.
19488 SDValue XR1 =
19489 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19490 SDValue CLod0 = DAG.getLoad(
19491 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19493 SDValue Unpck1 =
19494 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19495
19496 SDValue CLod1 = DAG.getLoad(
19497 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19499 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19500 // TODO: Are there any fast-math-flags to propagate here?
19501 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19502 SDValue Result;
19503
19504 if (Subtarget.hasSSE3() &&
19505 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19506 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19507 } else {
19508 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19509 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19510 }
19511 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19512 DAG.getIntPtrConstant(0, dl));
19513 return Result;
19514}
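// Scalar model of the constant-bias trick above, assuming IEEE-754 doubles:
// each 32-bit half of the input is dropped into the mantissa of a biased
// double (2^52 from 0x43300000 for the low half, 2^84 from 0x45300000 for
// the high half), the biases are subtracted exactly (the subpd), and the two
// partial values are summed (the haddpd / shuffle+addpd), which is the only
// step that rounds. Helper names are illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double u64_to_double_bias_trick(uint64_t x) {
  uint64_t lo_bits = (0x43300000ULL << 32) | (uint32_t)x;         // 2^52 + lo
  uint64_t hi_bits = (0x45300000ULL << 32) | (uint32_t)(x >> 32); // 2^84 + hi * 2^32
  double lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof lo);
  std::memcpy(&hi, &hi_bits, sizeof hi);
  return (hi - 0x1.0p84) + (lo - 0x1.0p52);  // both subtractions are exact
}

int main() {
  const uint64_t tests[] = {0, 1, 0x00000001FFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL};
  for (uint64_t v : tests)
    std::printf("%llu -> %.17g (cast gives %.17g)\n", (unsigned long long)v,
                u64_to_double_bias_trick(v), (double)v);
  return 0;
}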
19515
19516/// 32-bit unsigned integer to float expansion.
19517static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19518 SelectionDAG &DAG,
19519 const X86Subtarget &Subtarget) {
19520 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19521 // FP constant to bias correct the final result.
19522 SDValue Bias = DAG.getConstantFP(
19523 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19524
19525 // Load the 32-bit value into an XMM register.
19526 SDValue Load =
19527 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19528
19529 // Zero out the upper parts of the register.
19530 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19531
19532 // Or the load with the bias.
19533 SDValue Or = DAG.getNode(
19534 ISD::OR, dl, MVT::v2i64,
19535 DAG.getBitcast(MVT::v2i64, Load),
19536 DAG.getBitcast(MVT::v2i64,
19537 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19538 Or =
19539 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19540 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19541
19542 if (Op.getNode()->isStrictFPOpcode()) {
19543 // Subtract the bias.
19544 // TODO: Are there any fast-math-flags to propagate here?
19545 SDValue Chain = Op.getOperand(0);
19546 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19547 {Chain, Or, Bias});
19548
19549 if (Op.getValueType() == Sub.getValueType())
19550 return Sub;
19551
19552 // Handle final rounding.
19553 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19554 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19555
19556 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19557 }
19558
19559 // Subtract the bias.
19560 // TODO: Are there any fast-math-flags to propagate here?
19561 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19562
19563 // Handle final rounding.
19564 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19565}
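// Scalar model of the single-bias variant above, assuming IEEE-754 doubles:
// OR-ing the 32-bit input into the low mantissa bits of 2^52 (the Bias
// constant 0x4330000000000000) yields exactly 2^52 + x, so subtracting the
// bias recovers x exactly; only the optional final round to f32 can round.
// Helper names are illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstring>

static double u32_to_double_bias_trick(uint32_t x) {
  uint64_t bits = 0x4330000000000000ULL | x;  // bit pattern of 2^52 + x
  double d;
  std::memcpy(&d, &bits, sizeof d);
  return d - 0x1.0p52;                        // exact
}

int main() {
  const uint32_t tests[] = {0, 1, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t v : tests)
    std::printf("%u -> %.17g\n", v, u32_to_double_bias_trick(v));
  return 0;
}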
19566
19567static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19568 SelectionDAG &DAG,
19569 const X86Subtarget &Subtarget) {
19570 if (Op.getSimpleValueType() != MVT::v2f64)
19571 return SDValue();
19572
19573 bool IsStrict = Op->isStrictFPOpcode();
19574
19575 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19576 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19577
19578 if (Subtarget.hasAVX512()) {
19579 if (!Subtarget.hasVLX()) {
19580 // Let generic type legalization widen this.
19581 if (!IsStrict)
19582 return SDValue();
19583 // Otherwise pad the integer input with 0s and widen the operation.
19584 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19585 DAG.getConstant(0, DL, MVT::v2i32));
19586 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19587 {Op.getOperand(0), N0});
19588 SDValue Chain = Res.getValue(1);
19589 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19590 DAG.getIntPtrConstant(0, DL));
19591 return DAG.getMergeValues({Res, Chain}, DL);
19592 }
19593
19594 // Legalize to v4i32 type.
19595 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19596 DAG.getUNDEF(MVT::v2i32));
19597 if (IsStrict)
19598 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
19599 {Op.getOperand(0), N0});
19600 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
19601 }
19602
19603 // Zero extend to 2i64, OR with the floating point representation of 2^52.
19604 // This gives us the floating point equivalent of 2^52 + the i32 integer
19605 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
19606 // point leaving just our i32 integers in double format.
19607 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
19608 SDValue VBias = DAG.getConstantFP(
19609 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
19610 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
19611 DAG.getBitcast(MVT::v2i64, VBias));
19612 Or = DAG.getBitcast(MVT::v2f64, Or);
19613
19614 if (IsStrict)
19615 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
19616 {Op.getOperand(0), Or, VBias});
19617 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
19618}
19619
19620static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
19621 SelectionDAG &DAG,
19622 const X86Subtarget &Subtarget) {
19623 bool IsStrict = Op->isStrictFPOpcode();
19624 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
19625 MVT VecIntVT = V.getSimpleValueType();
19626 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
19627 "Unsupported custom type");
19628
19629 if (Subtarget.hasAVX512()) {
19630 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
19631 assert(!Subtarget.hasVLX() && "Unexpected features");
19632 MVT VT = Op->getSimpleValueType(0);
19633
19634 // v8i32->v8f64 is legal with AVX512 so just return it.
19635 if (VT == MVT::v8f64)
19636 return Op;
19637
19638 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
19639 "Unexpected VT!");
19640 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
19641 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
19642 // Need to concat with zero vector for strict fp to avoid spurious
19643 // exceptions.
19644 SDValue Tmp =
19645 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
19646 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
19647 DAG.getIntPtrConstant(0, DL));
19648 SDValue Res, Chain;
19649 if (IsStrict) {
19650 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
19651 {Op->getOperand(0), V});
19652 Chain = Res.getValue(1);
19653 } else {
19654 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
19655 }
19656
19657 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19658 DAG.getIntPtrConstant(0, DL));
19659
19660 if (IsStrict)
19661 return DAG.getMergeValues({Res, Chain}, DL);
19662 return Res;
19663 }
19664
19665 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
19666 Op->getSimpleValueType(0) == MVT::v4f64) {
19667 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
19668 Constant *Bias = ConstantFP::get(
19669 *DAG.getContext(),
19670 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
19671 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19672 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
19673 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
19674 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
19675 SDValue VBias = DAG.getMemIntrinsicNode(
19676 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
19679
19680 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
19681 DAG.getBitcast(MVT::v4i64, VBias));
19682 Or = DAG.getBitcast(MVT::v4f64, Or);
19683
19684 if (IsStrict)
19685 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
19686 {Op.getOperand(0), Or, VBias});
19687 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
19688 }
19689
19690 // The algorithm is the following:
19691 // #ifdef __SSE4_1__
19692 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19693 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19694 // (uint4) 0x53000000, 0xaa);
19695 // #else
19696 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19697 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19698 // #endif
19699 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19700 // return (float4) lo + fhi;
19701
19702 bool Is128 = VecIntVT == MVT::v4i32;
19703 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
19704 // If we convert to something other than the supported type, e.g., to v4f64,
19705 // abort early.
19706 if (VecFloatVT != Op->getSimpleValueType(0))
19707 return SDValue();
19708
19709 // In the #ifdef/#else code, we have in common:
19710 // - The vector of constants:
19711 // -- 0x4b000000
19712 // -- 0x53000000
19713 // - A shift:
19714 // -- v >> 16
19715
19716 // Create the splat vector for 0x4b000000.
19717 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
19718 // Create the splat vector for 0x53000000.
19719 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
19720
19721 // Create the right shift.
19722 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
19723 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
19724
19725 SDValue Low, High;
19726 if (Subtarget.hasSSE41()) {
19727 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
19728 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
19729 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
19730 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
19731 // Low will be bitcasted right away, so do not bother bitcasting back to its
19732 // original type.
19733 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
19734 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19735 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
19736 // (uint4) 0x53000000, 0xaa);
19737 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
19738 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
19739 // High will be bitcasted right away, so do not bother bitcasting back to
19740 // its original type.
19741 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
19742 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
19743 } else {
19744 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
19745 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
19746 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
19747 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
19748
19749 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
19750 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
19751 }
19752
19753 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
19754 SDValue VecCstFSub = DAG.getConstantFP(
19755 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
19756
19757 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
19758 // NOTE: By using fsub of a positive constant instead of fadd of a negative
19759 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
19760 // enabled. See PR24512.
19761 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
19762 // TODO: Are there any fast-math-flags to propagate here?
19763 // (float4) lo;
19764 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
19765 // return (float4) lo + fhi;
19766 if (IsStrict) {
19767 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
19768 {Op.getOperand(0), HighBitcast, VecCstFSub});
19769 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
19770 {FHigh.getValue(1), LowBitcast, FHigh});
19771 }
19772
19773 SDValue FHigh =
19774 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
19775 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
19776}
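// Scalar model of the two-halves algorithm in the comments above, assuming
// IEEE-754 floats: the low and high 16-bit halves are embedded in the
// mantissas of 2^23 (0x4b000000) and 2^39 (0x53000000), one exact fsub
// removes both biases from the high part, and the final fadd is the only
// rounding step. Helper names are illustrative only.
#include <cstdint>
#include <cstdio>
#include <cstring>

static float u32_to_f32_two_halves(uint32_t v) {
  uint32_t lo_bits = (v & 0xffffu) | 0x4b000000u; // float 2^23 + (v & 0xffff)
  uint32_t hi_bits = (v >> 16) | 0x53000000u;     // float 2^39 + (v >> 16) * 2^16
  float lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof lo);
  std::memcpy(&hi, &hi_bits, sizeof hi);
  float fhi = hi - (0x1.0p39f + 0x1.0p23f); // exact: strips 2^39 and pre-cancels lo's 2^23
  return lo + fhi;                          // one rounding: (v & 0xffff) + (v >> 16) * 2^16
}

int main() {
  const uint32_t tests[] = {0, 1, 0x0001FFFFu, 0xFFFFFFFFu};
  for (uint32_t v : tests)
    std::printf("%u -> %a (cast gives %a)\n", v, u32_to_f32_two_halves(v), (float)v);
  return 0;
}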
19777
19778static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19779 const X86Subtarget &Subtarget) {
19780 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19781 SDValue N0 = Op.getOperand(OpNo);
19782 MVT SrcVT = N0.getSimpleValueType();
19783
19784 switch (SrcVT.SimpleTy) {
19785 default:
19786 llvm_unreachable("Custom UINT_TO_FP is not supported!");
19787 case MVT::v2i32:
19788 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
19789 case MVT::v4i32:
19790 case MVT::v8i32:
19791 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
19792 case MVT::v2i64:
19793 case MVT::v4i64:
19794 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19795 }
19796}
19797
19798SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
19799 SelectionDAG &DAG) const {
19800 bool IsStrict = Op->isStrictFPOpcode();
19801 unsigned OpNo = IsStrict ? 1 : 0;
19802 SDValue Src = Op.getOperand(OpNo);
19803 SDLoc dl(Op);
19804 auto PtrVT = getPointerTy(DAG.getDataLayout());
19805 MVT SrcVT = Src.getSimpleValueType();
19806 MVT DstVT = Op->getSimpleValueType(0);
19807 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
19808
19809 // Bail out when we don't have native conversion instructions.
19810 if (DstVT == MVT::f128)
19811 return SDValue();
19812
19813 if (isSoftF16(DstVT, Subtarget))
19814 return promoteXINT_TO_FP(Op, dl, DAG);
19815 else if (isLegalConversion(SrcVT, false, Subtarget))
19816 return Op;
19817
19818 if (DstVT.isVector())
19819 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
19820
19821 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19822 return LowerWin64_INT128_TO_FP(Op, DAG);
19823
19824 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19825 return Extract;
19826
19827 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
19828 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
19829 // Conversions from unsigned i32 to f32/f64 are legal,
19830 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
19831 return Op;
19832 }
19833
19834 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
19835 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
19836 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
19837 if (IsStrict)
19838 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
19839 {Chain, Src});
19840 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
19841 }
19842
19843 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19844 return V;
19845 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19846 return V;
19847
19848 // The transform for i64->f64 isn't correct for 0 when rounding to negative
19849 // infinity. It produces -0.0, so disable under strictfp.
19850 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
19851 !IsStrict)
19852 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
19853 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
19854 // negative infinity. So disable under strictfp. Using FILD instead.
19855 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
19856 !IsStrict)
19857 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
19858 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
19859 (DstVT == MVT::f32 || DstVT == MVT::f64))
19860 return SDValue();
19861
19862 // Make a 64-bit buffer, and use it to build an FILD.
19863 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
19864 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
19865 Align SlotAlign(8);
19866 MachinePointerInfo MPI =
19868 if (SrcVT == MVT::i32) {
19869 SDValue OffsetSlot =
19870 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
19871 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
19872 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
19873 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
19874 std::pair<SDValue, SDValue> Tmp =
19875 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
19876 if (IsStrict)
19877 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19878
19879 return Tmp.first;
19880 }
19881
19882 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
19883 SDValue ValueToStore = Src;
19884 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
19885 // Bitcasting to f64 here allows us to do a single 64-bit store from
19886 // an SSE register, avoiding the store forwarding penalty that would come
19887 // with two 32-bit stores.
19888 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19889 }
19890 SDValue Store =
19891 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
19892 // For i64 source, we need to add the appropriate power of 2 if the input
19893 // was negative. We must be careful to do the computation in x87 extended
19894 // precision, not in SSE.
19895 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
19896 SDValue Ops[] = {Store, StackSlot};
19897 SDValue Fild =
19898 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
19899 SlotAlign, MachineMemOperand::MOLoad);
19900 Chain = Fild.getValue(1);
19901
19902 // Check whether the sign bit is set.
19903 SDValue SignSet = DAG.getSetCC(
19904 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
19905 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
19906
19907 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
19908 APInt FF(64, 0x5F80000000000000ULL);
19909 SDValue FudgePtr =
19910 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
19911 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
19912
19913 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
19914 SDValue Zero = DAG.getIntPtrConstant(0, dl);
19915 SDValue Four = DAG.getIntPtrConstant(4, dl);
19916 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
19917 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
19918
19919 // Load the value out, extending it from f32 to f80.
19920 SDValue Fudge = DAG.getExtLoad(
19921 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
19923 CPAlignment);
19924 Chain = Fudge.getValue(1);
19925 // Extend everything to 80 bits to force it to be done on x87.
19926 // TODO: Are there any fast-math-flags to propagate here?
19927 if (IsStrict) {
19928 unsigned Opc = ISD::STRICT_FADD;
19929 // Windows needs the precision control changed to 80bits around this add.
19930 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19931 Opc = X86ISD::STRICT_FP80_ADD;
19932
19933 SDValue Add =
19934 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
19935 // STRICT_FP_ROUND can't handle equal types.
19936 if (DstVT == MVT::f80)
19937 return Add;
19938 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
19939 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
19940 }
19941 unsigned Opc = ISD::FADD;
19942 // Windows needs the precision control changed to 80bits around this add.
19943 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
19944 Opc = X86ISD::FP80_ADD;
19945
19946 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
19947 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
19948 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
19949}
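// Scalar model of the FILD-plus-fudge path above: the u64 is converted as a
// *signed* value (which is what FILD does), and 2^64 (the f32 fudge constant
// 0x5F800000 loaded from the constant pool) is added back when the sign bit
// was set. The real lowering does the add in x87 80-bit precision so only the
// final FP_ROUND rounds; long double stands in for that here, which is an
// assumption that holds on typical x86 hosts only.
#include <cstdint>
#include <cstdio>

static double u64_to_double_via_fild(uint64_t x) {
  long double v = (long double)(int64_t)x; // FILD interprets the bits as signed
  if ((int64_t)x < 0)
    v += 0x1.0p64L;                        // fudge: add 2^64 when the sign bit was set
  return (double)v;                        // the final FP_ROUND to the destination type
}

int main() {
  const uint64_t tests[] = {0, 0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL};
  for (uint64_t v : tests)
    std::printf("%llu -> %.17g\n", (unsigned long long)v, u64_to_double_via_fild(v));
  return 0;
}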
19950
19951// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
19952// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
19953// just return an SDValue().
19954// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
19955// to i16, i32 or i64, and we lower it to a legal sequence and return the
19956// result.
19957SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
19958 bool IsSigned,
19959 SDValue &Chain) const {
19960 bool IsStrict = Op->isStrictFPOpcode();
19961 SDLoc DL(Op);
19962
19963 EVT DstTy = Op.getValueType();
19964 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
19965 EVT TheVT = Value.getValueType();
19966 auto PtrVT = getPointerTy(DAG.getDataLayout());
19967
19968 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
19969 // f16 must be promoted before using the lowering in this routine.
19970 // fp128 does not use this lowering.
19971 return SDValue();
19972 }
19973
19974 // If using FIST to compute an unsigned i64, we'll need some fixup
19975 // to handle values above the maximum signed i64. A FIST is always
19976 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
19977 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
19978
19979 // FIXME: This does not generate an invalid exception if the input does not
19980 // fit in i32. PR44019
19981 if (!IsSigned && DstTy != MVT::i64) {
19982 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
19983 // The low 32 bits of the fist result will have the correct uint32 result.
19984 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
19985 DstTy = MVT::i64;
19986 }
19987
19988 assert(DstTy.getSimpleVT() <= MVT::i64 &&
19989 DstTy.getSimpleVT() >= MVT::i16 &&
19990 "Unknown FP_TO_INT to lower!");
19991
19992 // We lower FP->int64 into FISTP64 followed by a load from a temporary
19993 // stack slot.
19995 unsigned MemSize = DstTy.getStoreSize();
19996 int SSFI =
19997 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
19998 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19999
20000 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20001
20002 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20003
20004 if (UnsignedFixup) {
20005 //
20006 // Conversion to unsigned i64 is implemented with a select,
20007 // depending on whether the source value fits in the range
20008 // of a signed i64. Let Thresh be the FP equivalent of
20009 // 0x8000000000000000ULL.
20010 //
20011 // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20012 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20013 // FistSrc = (Value - FltOfs);
20014 // Fist-to-mem64 FistSrc
20015 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20016 // to XOR'ing the high 32 bits with Adjust.
20017 //
20018 // Being a power of 2, Thresh is exactly representable in all FP formats.
20019 // For X87 we'd like to use the smallest FP type for this constant, but
20020 // for DAG type consistency we have to match the FP operand type.
20021
20022 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20024 bool LosesInfo = false;
20025 if (TheVT == MVT::f64)
20026 // The rounding mode is irrelevant as the conversion should be exact.
20028 &LosesInfo);
20029 else if (TheVT == MVT::f80)
20030 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20031 APFloat::rmNearestTiesToEven, &LosesInfo);
20032
20033 assert(Status == APFloat::opOK && !LosesInfo &&
20034 "FP conversion should have been exact");
20035
20036 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20037
20038 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20039 *DAG.getContext(), TheVT);
20040 SDValue Cmp;
20041 if (IsStrict) {
20042 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20043 /*IsSignaling*/ true);
20044 Chain = Cmp.getValue(1);
20045 } else {
20046 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20047 }
20048
20049 // Our preferred lowering of
20050 //
20051 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20052 //
20053 // is
20054 //
20055 // (Value >= Thresh) << 63
20056 //
20057 // but since we can get here after LegalOperations, DAGCombine might do the
20058 // wrong thing if we create a select. So, directly create the preferred
20059 // version.
20060 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20061 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20062 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20063
20064 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20065 DAG.getConstantFP(0.0, DL, TheVT));
20066
20067 if (IsStrict) {
20068 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20069 { Chain, Value, FltOfs });
20070 Chain = Value.getValue(1);
20071 } else
20072 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20073 }
20074
20076
20077 // FIXME This causes a redundant load/store if the SSE-class value is already
20078 // in memory, such as if it is on the callstack.
20079 if (isScalarFPTypeInSSEReg(TheVT)) {
20080 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20081 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20082 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20083 SDValue Ops[] = { Chain, StackSlot };
20084
20085 unsigned FLDSize = TheVT.getStoreSize();
20086 assert(FLDSize <= MemSize && "Stack slot not big enough");
20088 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20089 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20090 Chain = Value.getValue(1);
20091 }
20092
20093 // Build the FP_TO_INT*_IN_MEM
20095 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20096 SDValue Ops[] = { Chain, Value, StackSlot };
20098 DAG.getVTList(MVT::Other),
20099 Ops, DstTy, MMO);
20100
20101 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
20102 Chain = Res.getValue(1);
20103
20104 // If we need an unsigned fixup, XOR the result with adjust.
20105 if (UnsignedFixup)
20106 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20107
20108 return Res;
20109}
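// Scalar model of the unsigned fixup above: inputs at or above 2^63 (Thresh)
// are shifted into signed range by subtracting FltOfs before the signed
// conversion (the FIST), and the sign bit is restored by XOR'ing Adjust into
// the integer result. Inputs are assumed non-NaN and within [0, 2^64); the
// helper name is illustrative only.
#include <cstdint>
#include <cstdio>

static uint64_t fp_to_u64_via_signed(double Value) {
  const double Thresh = 0x1.0p63;
  uint64_t Adjust = Value >= Thresh ? 0x8000000000000000ULL : 0; // result fixup
  double FltOfs   = Value >= Thresh ? Thresh : 0.0;              // subtracted pre-FIST
  int64_t Fist = (int64_t)(Value - FltOfs);                      // stands in for the FIST
  return (uint64_t)Fist ^ Adjust;
}

int main() {
  const double tests[] = {0.0, 1.5, 0x1.0p63, 0x1.fffffffffffffp63};
  for (double d : tests)
    std::printf("%.17g -> %llu\n", d, (unsigned long long)fp_to_u64_via_signed(d));
  return 0;
}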
20110
20111static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
20112 const X86Subtarget &Subtarget) {
20113 MVT VT = Op.getSimpleValueType();
20114 SDValue In = Op.getOperand(0);
20115 MVT InVT = In.getSimpleValueType();
20116 SDLoc dl(Op);
20117 unsigned Opc = Op.getOpcode();
20118
20119 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20120 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20121 "Unexpected extension opcode");
20122 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20123 "Expected same number of elements");
20124 assert((VT.getVectorElementType() == MVT::i16 ||
20125 VT.getVectorElementType() == MVT::i32 ||
20126 VT.getVectorElementType() == MVT::i64) &&
20127 "Unexpected element type");
20128 assert((InVT.getVectorElementType() == MVT::i8 ||
20129 InVT.getVectorElementType() == MVT::i16 ||
20130 InVT.getVectorElementType() == MVT::i32) &&
20131 "Unexpected element type");
20132
20133 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20134
20135 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20136 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20137 return splitVectorIntUnary(Op, DAG, dl);
20138 }
20139
20140 if (Subtarget.hasInt256())
20141 return Op;
20142
20143 // Optimize vectors in AVX mode:
20144 //
20145 // v8i16 -> v8i32
20146 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
20147 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20148 // Concat upper and lower parts.
20149 //
20150 // v4i32 -> v4i64
20151 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
20152 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20153 // Concat upper and lower parts.
20154 //
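// e.g. on AVX1, (zero_extend (v8i16 In) to v8i32) becomes
// (concat_vectors (zero_extend_vector_inreg In),
// (bitcast (unpckh In, zero))),
// matching the lower/upper halves built below.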
20155 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20156 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20157
20158 // Short-circuit if we can determine that each 128-bit half is the same value.
20159 // Otherwise, this is difficult to match and optimize.
20160 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20161 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20162 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20163
20164 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20165 SDValue Undef = DAG.getUNDEF(InVT);
20166 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20167 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20168 OpHi = DAG.getBitcast(HalfVT, OpHi);
20169
20170 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20171}
20172
20173// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20174static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20175 const SDLoc &dl, SelectionDAG &DAG) {
20176 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20177 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20178 DAG.getIntPtrConstant(0, dl));
20179 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20180 DAG.getIntPtrConstant(8, dl));
20181 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20182 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20183 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20185}
20186
20187 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
20188 const X86Subtarget &Subtarget,
20189 SelectionDAG &DAG) {
20190 MVT VT = Op->getSimpleValueType(0);
20191 SDValue In = Op->getOperand(0);
20192 MVT InVT = In.getSimpleValueType();
20193 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20194 SDLoc DL(Op);
20195 unsigned NumElts = VT.getVectorNumElements();
20196
20197 // For all vectors except vXi8 we can just emit a sign_extend and a shift. This
20198 // avoids a constant pool load.
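// e.g. (zero_extend v16i1 %k to v16i32) becomes
// (srl (sign_extend v16i1 %k to v16i32), 31).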
20199 if (VT.getVectorElementType() != MVT::i8) {
20200 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20201 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20202 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20203 }
20204
20205 // Extend VT if BWI is not supported.
20206 MVT ExtVT = VT;
20207 if (!Subtarget.hasBWI()) {
20208 // If v16i32 is to be avoided, we'll need to split and concatenate.
20209 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20210 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20211
20212 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20213 }
20214
20215 // Widen to 512-bits if VLX is not supported.
20216 MVT WideVT = ExtVT;
20217 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20218 NumElts *= 512 / ExtVT.getSizeInBits();
20219 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20220 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20221 In, DAG.getIntPtrConstant(0, DL));
20222 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20223 NumElts);
20224 }
20225
20226 SDValue One = DAG.getConstant(1, DL, WideVT);
20227 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20228
20229 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20230
20231 // Truncate if we had to extend above.
20232 if (VT != ExtVT) {
20233 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20234 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20235 }
20236
20237 // Extract back to 128/256-bit if we widened.
20238 if (WideVT != VT)
20239 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20240 DAG.getIntPtrConstant(0, DL));
20241
20242 return SelectedVal;
20243}
20244
20245 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20246 SelectionDAG &DAG) {
20247 SDValue In = Op.getOperand(0);
20248 MVT SVT = In.getSimpleValueType();
20249
20250 if (SVT.getVectorElementType() == MVT::i1)
20251 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
20252
20253 assert(Subtarget.hasAVX() && "Expected AVX support");
20254 return LowerAVXExtend(Op, DAG, Subtarget);
20255}
20256
20257/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20258/// It makes use of the fact that vectors with enough leading sign/zero bits
20259/// prevent the PACKSS/PACKUS from saturating the results.
20260/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20261/// within each 128-bit lane.
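/// e.g. with at least 17 sign bits per i32 element, v8i32 -> v8i16 becomes a
/// single PACKSSDW of the two 128-bit halves, while v16i32 -> v16i8 packs
/// recursively (to v16i16 first, then to v16i8).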
20262static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20263 const SDLoc &DL, SelectionDAG &DAG,
20264 const X86Subtarget &Subtarget) {
20265 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20266 "Unexpected PACK opcode");
20267 assert(DstVT.isVector() && "VT not a vector?");
20268
20269 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20270 if (!Subtarget.hasSSE2())
20271 return SDValue();
20272
20273 EVT SrcVT = In.getValueType();
20274
20275 // No truncation required, we might get here due to recursive calls.
20276 if (SrcVT == DstVT)
20277 return In;
20278
20279 unsigned NumElems = SrcVT.getVectorNumElements();
20280 if (NumElems < 2 || !isPowerOf2_32(NumElems) )
20281 return SDValue();
20282
20283 unsigned DstSizeInBits = DstVT.getSizeInBits();
20284 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20285 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20286 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20287
20288 LLVMContext &Ctx = *DAG.getContext();
20289 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20290 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20291
20292 // Pack to the largest type possible:
20293 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20294 EVT InVT = MVT::i16, OutVT = MVT::i8;
20295 if (SrcVT.getScalarSizeInBits() > 16 &&
20296 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20297 InVT = MVT::i32;
20298 OutVT = MVT::i16;
20299 }
20300
20301 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20302 // On pre-AVX512, pack the src in both halves to help value tracking.
20303 if (SrcSizeInBits <= 128) {
20304 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20305 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20306 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20307 SDValue LHS = DAG.getBitcast(InVT, In);
20308 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20309 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20310 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20311 Res = DAG.getBitcast(PackedVT, Res);
20312 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20313 }
20314
20315 // Split lower/upper subvectors.
20316 SDValue Lo, Hi;
20317 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20318
20319 // If Hi is undef, then don't bother packing it and widen the result instead.
20320 if (Hi.isUndef()) {
20321 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20322 if (SDValue Res =
20323 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20324 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20325 }
20326
20327 unsigned SubSizeInBits = SrcSizeInBits / 2;
20328 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20329 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20330
20331 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20332 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20333 Lo = DAG.getBitcast(InVT, Lo);
20334 Hi = DAG.getBitcast(InVT, Hi);
20335 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20336 return DAG.getBitcast(DstVT, Res);
20337 }
20338
20339 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20340 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20341 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20342 Lo = DAG.getBitcast(InVT, Lo);
20343 Hi = DAG.getBitcast(InVT, Hi);
20344 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20345
20346 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20347 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20348 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
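// e.g. for OutVT == v16i16, Scale == 4 and the scaled mask becomes
// <0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15>.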
20349 SmallVector<int, 64> Mask;
20350 int Scale = 64 / OutVT.getScalarSizeInBits();
20351 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20352 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20353
20354 if (DstVT.is256BitVector())
20355 return DAG.getBitcast(DstVT, Res);
20356
20357 // If 512bit -> 128bit truncate another stage.
20358 Res = DAG.getBitcast(PackedVT, Res);
20359 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20360 }
20361
20362 // Recursively pack lower/upper subvectors, concat result and pack again.
20363 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20364
20365 if (PackedVT.is128BitVector()) {
20366 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20367 // type legalization.
20368 SDValue Res =
20369 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20370 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20371 }
20372
20373 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20374 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20375 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20376 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20377 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20378}
20379
20380/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20381/// e.g. trunc <8 x i32> X to <8 x i16> -->
20382/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20383/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20384 static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20385 const X86Subtarget &Subtarget,
20386 SelectionDAG &DAG) {
20387 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20388 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20389}
20390
20391/// Truncate using inreg sign extension and X86ISD::PACKSS.
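/// e.g. trunc <8 x i32> X to <8 x i16> -->
/// SextX = sext_in_reg X, i16 (fill the high bits with the sign bit to prevent saturation)
/// packss (extract_subv SextX, 0), (extract_subv SextX, 1)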
20392 static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20393 const X86Subtarget &Subtarget,
20394 SelectionDAG &DAG) {
20395 EVT SrcVT = In.getValueType();
20396 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20397 DAG.getValueType(DstVT));
20398 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20399}
20400
20401/// Helper to determine if \p In truncated to \p DstVT has the necessary
20402/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20403/// possibly by converting a SRL node to SRA for sign extension.
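/// e.g. for a v8i32 -> v8i16 truncation of (srl X, 16), the srl can be
/// rebuilt as (sra X, 16): the kept low 16 bits are identical and the value
/// then has enough sign bits for PACKSS.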
20404static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20405 SDValue In, const SDLoc &DL,
20406 SelectionDAG &DAG,
20407 const X86Subtarget &Subtarget) {
20408 // Requires SSE2.
20409 if (!Subtarget.hasSSE2())
20410 return SDValue();
20411
20412 EVT SrcVT = In.getValueType();
20413 EVT DstSVT = DstVT.getVectorElementType();
20414 EVT SrcSVT = SrcVT.getVectorElementType();
20415 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20416 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20417
20418 // Check we have a truncation suited for PACKSS/PACKUS.
20419 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20420 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20421 return SDValue();
20422
20423 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20424 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20425
20426 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20427 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20428 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20429 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20430 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20431 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20432 return SDValue();
20433
20434 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20435 // split this for packing.
20436 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20437 !isFreeToSplitVector(In.getNode(), DAG) &&
20438 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20439 return SDValue();
20440
20441 // Don't truncate on AVX512 targets with multiple PACK node stages.
20442 if (Subtarget.hasAVX512() && NumStages > 1)
20443 return SDValue();
20444
20445 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20446 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20447
20448 // Truncate with PACKUS if we are truncating a vector with leading zero
20449 // bits that extend all the way to the packed/truncated value.
20450 // e.g. Masks, zext_in_reg, etc.
20451 // Pre-SSE41 we can only use PACKUSWB.
20452 KnownBits Known = DAG.computeKnownBits(In);
20453 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20454 PackOpcode = X86ISD::PACKUS;
20455 return In;
20456 }
20457
20458 // Truncate with PACKSS if we are truncating a vector with sign-bits
20459 // that extend all the way to the packed/truncated value.
20460 // e.g. Comparison result, sext_in_reg, etc.
20461 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20462
20463 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20464 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20465 // see through BITCASTs later on and combines/simplifications can't then use
20466 // it.
20467 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20468 !Subtarget.hasAVX512())
20469 return SDValue();
20470
20471 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20472 if (MinSignBits < NumSignBits) {
20473 PackOpcode = X86ISD::PACKSS;
20474 return In;
20475 }
20476
20477 // If we have a srl that only generates signbits that we will discard in
20478 // the truncation then we can use PACKSS by converting the srl to a sra.
20479 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
20480 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20481 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(In)) {
20482 if (*ShAmt == MinSignBits) {
20483 PackOpcode = X86ISD::PACKSS;
20484 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20485 }
20486 }
20487
20488 return SDValue();
20489}
20490
20491/// This function lowers a vector truncation of 'extended sign-bits' or
20492/// 'extended zero-bits' values.
20493/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
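/// e.g. a vXi32 comparison result is all-ones or all-zeros per element, so it
/// already fits in i16 and can be truncated with PACKSS without saturating.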
20494 static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20495 const SDLoc &DL,
20496 const X86Subtarget &Subtarget,
20497 SelectionDAG &DAG) {
20498 MVT SrcVT = In.getSimpleValueType();
20499 MVT DstSVT = DstVT.getVectorElementType();
20500 MVT SrcSVT = SrcVT.getVectorElementType();
20501 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20502 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20503 return SDValue();
20504
20505 // If the upper half of the source is undef, then attempt to split and
20506 // only truncate the lower half.
20507 if (DstVT.getSizeInBits() >= 128) {
20508 SmallVector<SDValue> LowerOps;
20509 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20510 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20511 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20512 Subtarget, DAG))
20513 return widenSubVector(Res, false, Subtarget, DAG, DL,
20514 DstVT.getSizeInBits());
20515 }
20516 }
20517
20518 unsigned PackOpcode;
20519 if (SDValue Src =
20520 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20521 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20522
20523 return SDValue();
20524}
20525
20526/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20527/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20528 static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20529 const X86Subtarget &Subtarget,
20530 SelectionDAG &DAG) {
20531 MVT SrcVT = In.getSimpleValueType();
20532 MVT DstSVT = DstVT.getVectorElementType();
20533 MVT SrcSVT = SrcVT.getVectorElementType();
20534 unsigned NumElems = DstVT.getVectorNumElements();
20535 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20536 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20537 NumElems >= 8))
20538 return SDValue();
20539
20540 // SSSE3's pshufb results in fewer instructions in the cases below.
20541 if (Subtarget.hasSSSE3() && NumElems == 8) {
20542 if (SrcSVT == MVT::i16)
20543 return SDValue();
20544 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20545 return SDValue();
20546 }
20547
20548 // If the upper half of the source is undef, then attempt to split and
20549 // only truncate the lower half.
20550 if (DstVT.getSizeInBits() >= 128) {
20551 SmallVector<SDValue> LowerOps;
20552 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20553 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20554 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20555 return widenSubVector(Res, false, Subtarget, DAG, DL,
20556 DstVT.getSizeInBits());
20557 }
20558 }
20559
20560 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20561 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20562 // truncate 2 x v4i32 to v8i16.
20563 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20564 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20565
20566 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20567 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20568
20569 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20570 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20571 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20572 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20573 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20574 }
20575
20576 return SDValue();
20577}
20578
20579 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
20580 const X86Subtarget &Subtarget) {
20581
20582 SDLoc DL(Op);
20583 MVT VT = Op.getSimpleValueType();
20584 SDValue In = Op.getOperand(0);
20585 MVT InVT = In.getSimpleValueType();
20586
20587 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20588
20589 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
20590 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20591 if (InVT.getScalarSizeInBits() <= 16) {
20592 if (Subtarget.hasBWI()) {
20593 // legal, will go to VPMOVB2M, VPMOVW2M
20594 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20595 // We need to shift to get the lsb into sign position.
20596 // Shift packed bytes not supported natively, bitcast to word
20597 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20598 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20599 DAG.getBitcast(ExtVT, In),
20600 DAG.getConstant(ShiftInx, DL, ExtVT));
20601 In = DAG.getBitcast(InVT, In);
20602 }
20603 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
20604 In, ISD::SETGT);
20605 }
20606 // Use TESTD/Q, extended vector to packed dword/qword.
20607 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
20608 "Unexpected vector type.");
20609 unsigned NumElts = InVT.getVectorNumElements();
20610 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
20611 // We need to change to a wider element type that we have support for.
20612 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
20613 // For 16 element vectors we extend to v16i32 unless we are explicitly
20614 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
20615 // we need to split into two 8 element vectors which we can extend to v8i32,
20616 // truncate and concat the results. There's an additional complication if
20617 // the original type is v16i8. In that case we can't split the v16i8
20618 // directly, so we need to shuffle high elements to low and use
20619 // sign_extend_vector_inreg.
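// e.g. for v16i8 -> v16i1 without 512-bit ops: sign_extend_vector_inreg
// extends the low 8 bytes to v8i32, a shuffle moves bytes 8..15 down so they
// can be extended the same way, and the two v8i1 truncates are concatenated.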
20620 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
20621 SDValue Lo, Hi;
20622 if (InVT == MVT::v16i8) {
20623 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
20624 Hi = DAG.getVectorShuffle(
20625 InVT, DL, In, In,
20626 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
20627 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
20628 } else {
20629 assert(InVT == MVT::v16i16 && "Unexpected VT!");
20630 Lo = extract128BitVector(In, 0, DAG, DL);
20631 Hi = extract128BitVector(In, 8, DAG, DL);
20632 }
20633 // We're split now, just emit two truncates and a concat. The two
20634 // truncates will trigger legalization to come back to this function.
20635 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
20636 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
20637 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20638 }
20639 // We either have 8 elements or we're allowed to use 512-bit vectors.
20640 // If we have VLX, we want to use the narrowest vector that can get the
20641 // job done so we use vXi32.
20642 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
20643 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
20644 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
20645 InVT = ExtVT;
20646 ShiftInx = InVT.getScalarSizeInBits() - 1;
20647 }
20648
20649 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20650 // We need to shift to get the lsb into sign position.
20651 In = DAG.getNode(ISD::SHL, DL, InVT, In,
20652 DAG.getConstant(ShiftInx, DL, InVT));
20653 }
20654 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
20655 if (Subtarget.hasDQI())
20656 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
20657 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
20658}
20659
20660SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
20661 SDLoc DL(Op);
20662 MVT VT = Op.getSimpleValueType();
20663 SDValue In = Op.getOperand(0);
20664 MVT InVT = In.getSimpleValueType();
20666 "Invalid TRUNCATE operation");
20667
20668 // If we're called by the type legalizer, handle a few cases.
20669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20670 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
20671 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
20672 VT.is128BitVector() && Subtarget.hasAVX512()) {
20673 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
20674 "Unexpected subtarget!");
20675 // The default behavior is to truncate one step, concatenate, and then
20676 // truncate the remainder. We'd rather produce two 64-bit results and
20677 // concatenate those.
20678 SDValue Lo, Hi;
20679 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
20680
20681 EVT LoVT, HiVT;
20682 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
20683
20684 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
20685 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
20686 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
20687 }
20688
20689 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
20690 if (!Subtarget.hasAVX512() ||
20691 (InVT.is512BitVector() && VT.is256BitVector()))
20692 if (SDValue SignPack =
20693 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20694 return SignPack;
20695
20696 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
20697 if (!Subtarget.hasAVX512())
20698 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
20699
20700 // Otherwise let default legalization handle it.
20701 return SDValue();
20702 }
20703
20704 if (VT.getVectorElementType() == MVT::i1)
20705 return LowerTruncateVecI1(Op, DAG, Subtarget);
20706
20707 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
20708 // concat from subvectors to use VPTRUNC etc.
20709 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
20710 if (SDValue SignPack =
20711 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
20712 return SignPack;
20713
20714 // vpmovqb/w/d, vpmovdb/w, vpmovwb
20715 if (Subtarget.hasAVX512()) {
20716 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
20717 assert(VT == MVT::v32i8 && "Unexpected VT!");
20718 return splitVectorIntUnary(Op, DAG, DL);
20719 }
20720
20721 // Word to byte only under BWI. Otherwise we have to promote to v16i32
20722 // and then truncate that. But we should only do that if we haven't been
20723 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
20724 // handled by isel patterns.
20725 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
20726 Subtarget.canExtendTo512DQ())
20727 return Op;
20728 }
20729
20730 // Handle truncation of V256 to V128 using shuffles.
20731 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
20732
20733 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
20734 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
20735 if (Subtarget.hasInt256()) {
20736 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
20737 In = DAG.getBitcast(MVT::v8i32, In);
20738 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
20739 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
20740 DAG.getIntPtrConstant(0, DL));
20741 }
20742
20743 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20744 DAG.getIntPtrConstant(0, DL));
20745 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20746 DAG.getIntPtrConstant(2, DL));
20747 static const int ShufMask[] = {0, 2, 4, 6};
20748 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
20749 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
20750 }
20751
20752 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
20753 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
20754 if (Subtarget.hasInt256()) {
20755 // The PSHUFB mask:
20756 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
20757 -1, -1, -1, -1, -1, -1, -1, -1,
20758 16, 17, 20, 21, 24, 25, 28, 29,
20759 -1, -1, -1, -1, -1, -1, -1, -1 };
20760 In = DAG.getBitcast(MVT::v32i8, In);
20761 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
20762 In = DAG.getBitcast(MVT::v4i64, In);
20763
20764 static const int ShufMask2[] = {0, 2, -1, -1};
20765 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
20766 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
20767 DAG.getIntPtrConstant(0, DL));
20768 return DAG.getBitcast(MVT::v8i16, In);
20769 }
20770
20771 return Subtarget.hasSSE41()
20772 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
20773 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
20774 }
20775
20776 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
20777 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
20778
20779 llvm_unreachable("All 256->128 cases should have been handled above!");
20780}
20781
20782// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
20783// behaves on out of range inputs to generate optimized conversions.
20784 static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
20785 SelectionDAG &DAG,
20786 const X86Subtarget &Subtarget) {
20787 MVT SrcVT = Src.getSimpleValueType();
20788 unsigned DstBits = VT.getScalarSizeInBits();
20789 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
20790
20791 // Calculate the converted result for values in the range 0 to
20792 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
20793 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
20794 SDValue Big =
20795 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
20796 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
20797 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
20798
20799 // The "CVTTP2SI" instruction conveniently sets the sign bit if
20800 // and only if the value was out of range. So we can use that
20801 // as our indicator that we should use "Big" instead of "Small".
20802 //
20803 // Use "Small" if "IsOverflown" has all bits cleared
20804 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
20805
20806 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
20807 // use the slightly slower blendv select instead.
20808 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
20809 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
20810 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
20811 }
20812
20813 SDValue IsOverflown =
20814 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
20815 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
20816 return DAG.getNode(ISD::OR, dl, VT, Small,
20817 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
20818}
20819
20820SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
20821 bool IsStrict = Op->isStrictFPOpcode();
20822 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
20823 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
20824 MVT VT = Op->getSimpleValueType(0);
20825 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
20826 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
20827 MVT SrcVT = Src.getSimpleValueType();
20828 SDLoc dl(Op);
20829
20830 SDValue Res;
20831 if (isSoftF16(SrcVT, Subtarget)) {
20832 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
20833 if (IsStrict)
20834 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
20835 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
20836 {NVT, MVT::Other}, {Chain, Src})});
20837 return DAG.getNode(Op.getOpcode(), dl, VT,
20838 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
20839 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
20840 return Op;
20841 }
20842
20843 if (VT.isVector()) {
20844 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
20845 MVT ResVT = MVT::v4i32;
20846 MVT TruncVT = MVT::v4i1;
20847 unsigned Opc;
20848 if (IsStrict)
20849 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
20850 else
20851 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
20852
20853 if (!IsSigned && !Subtarget.hasVLX()) {
20854 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
20855 // Widen to 512-bits.
20856 ResVT = MVT::v8i32;
20857 TruncVT = MVT::v8i1;
20858 Opc = Op.getOpcode();
20859 // Need to concat with zero vector for strict fp to avoid spurious
20860 // exceptions.
20861 // TODO: Should we just do this for non-strict as well?
20862 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
20863 : DAG.getUNDEF(MVT::v8f64);
20864 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
20865 DAG.getIntPtrConstant(0, dl));
20866 }
20867 if (IsStrict) {
20868 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
20869 Chain = Res.getValue(1);
20870 } else {
20871 Res = DAG.getNode(Opc, dl, ResVT, Src);
20872 }
20873
20874 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
20875 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
20876 DAG.getIntPtrConstant(0, dl));
20877 if (IsStrict)
20878 return DAG.getMergeValues({Res, Chain}, dl);
20879 return Res;
20880 }
20881
20882 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
20883 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
20884 return Op;
20885
20886 MVT ResVT = VT;
20887 MVT EleVT = VT.getVectorElementType();
20888 if (EleVT != MVT::i64)
20889 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
20890
20891 if (SrcVT != MVT::v8f16) {
20892 SDValue Tmp =
20893 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
20894 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
20895 Ops[0] = Src;
20896 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
20897 }
20898
20899 if (IsStrict) {
20900 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
20901 : X86ISD::STRICT_CVTTP2UI,
20902 dl, {ResVT, MVT::Other}, {Chain, Src});
20903 Chain = Res.getValue(1);
20904 } else {
20905 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
20906 ResVT, Src);
20907 }
20908
20909 // TODO: Need to add exception check code for strict FP.
20910 if (EleVT.getSizeInBits() < 16) {
20911 ResVT = MVT::getVectorVT(EleVT, 8);
20912 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
20913 }
20914
20915 if (ResVT != VT)
20916 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20917 DAG.getIntPtrConstant(0, dl));
20918
20919 if (IsStrict)
20920 return DAG.getMergeValues({Res, Chain}, dl);
20921 return Res;
20922 }
20923
20924 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
20925 if (VT.getVectorElementType() == MVT::i16) {
20926 assert((SrcVT.getVectorElementType() == MVT::f32 ||
20927 SrcVT.getVectorElementType() == MVT::f64) &&
20928 "Expected f32/f64 vector!");
20929 MVT NVT = VT.changeVectorElementType(MVT::i32);
20930 if (IsStrict) {
20931 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
20932 : ISD::STRICT_FP_TO_UINT,
20933 dl, {NVT, MVT::Other}, {Chain, Src});
20934 Chain = Res.getValue(1);
20935 } else {
20936 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
20937 NVT, Src);
20938 }
20939
20940 // TODO: Need to add exception check code for strict FP.
20941 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20942
20943 if (IsStrict)
20944 return DAG.getMergeValues({Res, Chain}, dl);
20945 return Res;
20946 }
20947
20948 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
20949 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
20950 assert(!IsSigned && "Expected unsigned conversion!");
20951 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
20952 return Op;
20953 }
20954
20955 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
20956 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
20957 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
20958 Subtarget.useAVX512Regs()) {
20959 assert(!IsSigned && "Expected unsigned conversion!");
20960 assert(!Subtarget.hasVLX() && "Unexpected features!");
20961 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20962 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20963 // Need to concat with zero vector for strict fp to avoid spurious
20964 // exceptions.
20965 // TODO: Should we just do this for non-strict as well?
20966 SDValue Tmp =
20967 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20968 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20969 DAG.getIntPtrConstant(0, dl));
20970
20971 if (IsStrict) {
20972 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
20973 {Chain, Src});
20974 Chain = Res.getValue(1);
20975 } else {
20976 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
20977 }
20978
20979 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
20980 DAG.getIntPtrConstant(0, dl));
20981
20982 if (IsStrict)
20983 return DAG.getMergeValues({Res, Chain}, dl);
20984 return Res;
20985 }
20986
20987 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
20988 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
20989 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
20990 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
20991 assert(!Subtarget.hasVLX() && "Unexpected features!");
20992 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
20993 // Need to concat with zero vector for strict fp to avoid spurious
20994 // exceptions.
20995 // TODO: Should we just do this for non-strict as well?
20996 SDValue Tmp =
20997 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
20998 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
20999 DAG.getIntPtrConstant(0, dl));
21000
21001 if (IsStrict) {
21002 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21003 {Chain, Src});
21004 Chain = Res.getValue(1);
21005 } else {
21006 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21007 }
21008
21009 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21010 DAG.getIntPtrConstant(0, dl));
21011
21012 if (IsStrict)
21013 return DAG.getMergeValues({Res, Chain}, dl);
21014 return Res;
21015 }
21016
21017 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21018 if (!Subtarget.hasVLX()) {
21019 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21020 // legalizer and then widened again by vector op legalization.
21021 if (!IsStrict)
21022 return SDValue();
21023
21024 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21025 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21026 {Src, Zero, Zero, Zero});
21027 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21028 {Chain, Tmp});
21029 SDValue Chain = Tmp.getValue(1);
21030 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21031 DAG.getIntPtrConstant(0, dl));
21032 return DAG.getMergeValues({Tmp, Chain}, dl);
21033 }
21034
21035 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21036 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21037 DAG.getUNDEF(MVT::v2f32));
21038 if (IsStrict) {
21039 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21040 : X86ISD::STRICT_CVTTP2UI;
21041 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21042 }
21043 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21044 return DAG.getNode(Opc, dl, VT, Tmp);
21045 }
21046
21047 // Generate optimized instructions for pre AVX512 unsigned conversions from
21048 // vXf32 to vXi32.
21049 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21050 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21051 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21052 assert(!IsSigned && "Expected unsigned conversion!");
21053 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21054 }
21055
21056 return SDValue();
21057 }
21058
21059 assert(!VT.isVector());
21060
21061 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21062
21063 if (!IsSigned && UseSSEReg) {
21064 // Conversions from f32/f64 with AVX512 should be legal.
21065 if (Subtarget.hasAVX512())
21066 return Op;
21067
21068 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21069 // behaves on out of range inputs to generate optimized conversions.
21070 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21071 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21072 unsigned DstBits = VT.getScalarSizeInBits();
21073 APInt UIntLimit = APInt::getSignMask(DstBits);
21074 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21075 DAG.getConstant(UIntLimit, dl, VT));
21076 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21077
21078 // Calculate the converted result for values in the range:
21079 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21080 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21081 SDValue Small =
21082 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21083 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21084 SDValue Big = DAG.getNode(
21085 X86ISD::CVTTS2SI, dl, VT,
21086 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21087 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21088
21089 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21090 // and only if the value was out of range. So we can use that
21091 // as our indicator that we should use "Big" instead of "Small".
21092 //
21093 // Use "Small" if "IsOverflown" has all bits cleared
21094 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21095 SDValue IsOverflown = DAG.getNode(
21096 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21097 return DAG.getNode(ISD::OR, dl, VT, Small,
21098 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21099 }
21100
21101 // Use default expansion for i64.
21102 if (VT == MVT::i64)
21103 return SDValue();
21104
21105 assert(VT == MVT::i32 && "Unexpected VT!");
21106
21107 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21108 // FIXME: This does not generate an invalid exception if the input does not
21109 // fit in i32. PR44019
21110 if (Subtarget.is64Bit()) {
21111 if (IsStrict) {
21112 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21113 {Chain, Src});
21114 Chain = Res.getValue(1);
21115 } else
21116 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21117
21118 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21119 if (IsStrict)
21120 return DAG.getMergeValues({Res, Chain}, dl);
21121 return Res;
21122 }
21123
21124 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21125 // use fisttp which will be handled later.
21126 if (!Subtarget.hasSSE3())
21127 return SDValue();
21128 }
21129
21130 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21131 // FIXME: This does not generate an invalid exception if the input does not
21132 // fit in i16. PR44019
21133 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21134 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21135 if (IsStrict) {
21136 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21137 {Chain, Src});
21138 Chain = Res.getValue(1);
21139 } else
21140 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21141
21142 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21143 if (IsStrict)
21144 return DAG.getMergeValues({Res, Chain}, dl);
21145 return Res;
21146 }
21147
21148 // If this is a FP_TO_SINT using SSEReg we're done.
21149 if (UseSSEReg && IsSigned)
21150 return Op;
21151
21152 // fp128 needs to use a libcall.
21153 if (SrcVT == MVT::f128) {
21154 RTLIB::Libcall LC;
21155 if (IsSigned)
21156 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21157 else
21158 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21159
21160 MakeLibCallOptions CallOptions;
21161 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
21162 SDLoc(Op), Chain);
21163
21164 if (IsStrict)
21165 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21166
21167 return Tmp.first;
21168 }
21169
21170 // Fall back to X87.
21171 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21172 if (IsStrict)
21173 return DAG.getMergeValues({V, Chain}, dl);
21174 return V;
21175 }
21176
21177 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21178}
21179
21180SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21181 SelectionDAG &DAG) const {
21182 SDValue Src = Op.getOperand(0);
21183 EVT DstVT = Op.getSimpleValueType();
21184 MVT SrcVT = Src.getSimpleValueType();
21185
21186 if (SrcVT.isVector())
21187 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21188
21189 if (SrcVT == MVT::f16)
21190 return SDValue();
21191
21192 // If the source is in an SSE register, the node is Legal.
21193 if (isScalarFPTypeInSSEReg(SrcVT))
21194 return Op;
21195
21196 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21197}
21198
21199SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21200 SelectionDAG &DAG) const {
21201 EVT DstVT = N->getValueType(0);
21202 SDValue Src = N->getOperand(0);
21203 EVT SrcVT = Src.getValueType();
21204
21205 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21206 // f16 must be promoted before using the lowering in this routine.
21207 // fp128 does not use this lowering.
21208 return SDValue();
21209 }
21210
21211 SDLoc DL(N);
21212 SDValue Chain = DAG.getEntryNode();
21213
21214 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21215
21216 // If we're converting from SSE, the stack slot needs to hold both types.
21217 // Otherwise it only needs to hold the DstVT.
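// e.g. for llrint(double) with SSE2 enabled: the f64 is stored to the slot,
// reloaded into x87 via FLD, and FIST then writes the i64 result back to the
// same slot, where it is finally loaded as the integer result.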
21218 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21219 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21220 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21221 MachinePointerInfo MPI =
21222 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21223 
21224 if (UseSSE) {
21225 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21226 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21227 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21228 SDValue Ops[] = { Chain, StackPtr };
21229
21230 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21231 /*Align*/ std::nullopt,
21232 MachineMemOperand::MOLoad);
21233 Chain = Src.getValue(1);
21234 }
21235
21236 SDValue StoreOps[] = { Chain, Src, StackPtr };
21237 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21238 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21239 MachineMemOperand::MOStore);
21240 
21241 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21242}
21243
21244SDValue
21245X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21246 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21247 // but making use of X86 specifics to produce better instruction sequences.
21248 SDNode *Node = Op.getNode();
21249 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
21250 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
21251 SDLoc dl(SDValue(Node, 0));
21252 SDValue Src = Node->getOperand(0);
21253
21254 // There are three types involved here: SrcVT is the source floating point
21255 // type, DstVT is the type of the result, and TmpVT is the result of the
21256 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
21257 // DstVT).
21258 EVT SrcVT = Src.getValueType();
21259 EVT DstVT = Node->getValueType(0);
21260 EVT TmpVT = DstVT;
21261
21262 // This code is only for floats and doubles. Fall back to generic code for
21263 // anything else.
21264 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftF16(SrcVT, Subtarget))
21265 return SDValue();
21266
21267 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
21268 unsigned SatWidth = SatVT.getScalarSizeInBits();
21269 unsigned DstWidth = DstVT.getScalarSizeInBits();
21270 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
21271 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
21272 "Expected saturation width smaller than result width");
21273
21274 // Promote result of FP_TO_*INT to at least 32 bits.
21275 if (TmpWidth < 32) {
21276 TmpVT = MVT::i32;
21277 TmpWidth = 32;
21278 }
21279
21280 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
21281 // us to use a native signed conversion instead.
21282 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
21283 TmpVT = MVT::i64;
21284 TmpWidth = 64;
21285 }
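// e.g. llvm.fptoui.sat.i32.f64 on a 64-bit target uses TmpVT == i64, so the
// clamped value below can use a native signed conversion and a truncate.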
21286
21287 // If the saturation width is smaller than the size of the temporary result,
21288 // we can always use signed conversion, which is native.
21289 if (SatWidth < TmpWidth)
21290 FpToIntOpcode = ISD::FP_TO_SINT;
21291
21292 // Determine minimum and maximum integer values and their corresponding
21293 // floating-point values.
21294 APInt MinInt, MaxInt;
21295 if (IsSigned) {
21296 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
21297 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
21298 } else {
21299 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
21300 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
21301 }
21302
21303 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21304 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
21305
21306 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
21307 MinInt, IsSigned, APFloat::rmTowardZero);
21308 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
21309 MaxInt, IsSigned, APFloat::rmTowardZero);
21310 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
21311 && !(MaxStatus & APFloat::opStatus::opInexact);
21312
21313 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
21314 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
21315
21316 // If the integer bounds are exactly representable as floats, emit a
21317 // min+max+fptoi sequence. Otherwise use comparisons and selects.
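// e.g. for f64 -> i32 both bounds (-2^31 and 2^31-1) are exact in f64, so the
// FMAX/FMIN clamp is used; for f32 -> i32 the upper bound is inexact in f32,
// so the compare+select sequence further below is used instead.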
21318 if (AreExactFloatBounds) {
21319 if (DstVT != TmpVT) {
21320 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
21321 SDValue MinClamped = DAG.getNode(
21322 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
21323 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
21324 SDValue BothClamped = DAG.getNode(
21325 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
21326 // Convert clamped value to integer.
21327 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
21328
21329 // NaN will become INDVAL, with the top bit set and the rest zero.
21330 // Truncation will discard the top bit, resulting in zero.
21331 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21332 }
21333
21334 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
21335 SDValue MinClamped = DAG.getNode(
21336 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
21337 // Clamp by MaxFloat from above. NaN cannot occur.
21338 SDValue BothClamped = DAG.getNode(
21339 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
21340 // Convert clamped value to integer.
21341 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
21342
21343 if (!IsSigned) {
21344 // In the unsigned case we're done, because we mapped NaN to MinFloat,
21345 // which is zero.
21346 return FpToInt;
21347 }
21348
21349 // Otherwise, select zero if Src is NaN.
21350 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21351 return DAG.getSelectCC(
21352 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
21353 }
21354
21355 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
21356 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
21357
21358 // Result of direct conversion, which may be selected away.
21359 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
21360
21361 if (DstVT != TmpVT) {
21362 // NaN will become INDVAL, with the top bit set and the rest zero.
21363 // Truncation will discard the top bit, resulting in zero.
21364 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
21365 }
21366
21367 SDValue Select = FpToInt;
21368 // For signed conversions where we saturate to the same size as the
21369 // result type of the fptoi instructions, INDVAL coincides with integer
21370 // minimum, so we don't need to explicitly check it.
21371 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
21372 // If Src ULT MinFloat, select MinInt. In particular, this also selects
21373 // MinInt if Src is NaN.
21374 Select = DAG.getSelectCC(
21375 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
21376 }
21377
21378 // If Src OGT MaxFloat, select MaxInt.
21379 Select = DAG.getSelectCC(
21380 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
21381
21382 // In the unsigned case we are done, because we mapped NaN to MinInt, which
21383 // is already zero. The promoted case was already handled above.
21384 if (!IsSigned || DstVT != TmpVT) {
21385 return Select;
21386 }
21387
21388 // Otherwise, select 0 if Src is NaN.
21389 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
21390 return DAG.getSelectCC(
21391 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
21392}
21393
21394SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21395 bool IsStrict = Op->isStrictFPOpcode();
21396
21397 SDLoc DL(Op);
21398 MVT VT = Op.getSimpleValueType();
21399 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21400 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21401 MVT SVT = In.getSimpleValueType();
21402
21403 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
21404 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
21405 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
21406 !Subtarget.getTargetTriple().isOSDarwin()))
21407 return SDValue();
21408
21409 if ((SVT == MVT::v8f16 && Subtarget.hasF16C()) ||
21410 (SVT == MVT::v16f16 && Subtarget.useAVX512Regs()))
21411 return Op;
21412
21413 if (SVT == MVT::f16) {
21414 if (Subtarget.hasFP16())
21415 return Op;
21416
21417 if (VT != MVT::f32) {
21418 if (IsStrict)
21419 return DAG.getNode(
21420 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
21421 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
21422 {MVT::f32, MVT::Other}, {Chain, In})});
21423
21424 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
21425 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
21426 }
21427
21428 if (!Subtarget.hasF16C()) {
21429 if (!Subtarget.getTargetTriple().isOSDarwin())
21430 return SDValue();
21431
21432 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
21433
21434 // Need a libcall, but ABI for f16 is soft-float on MacOS.
21435 TargetLowering::CallLoweringInfo CLI(DAG);
21436 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21437 
21438 In = DAG.getBitcast(MVT::i16, In);
21439 TargetLowering::ArgListTy Args;
21440 TargetLowering::ArgListEntry Entry;
21441 Entry.Node = In;
21442 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
21443 Entry.IsSExt = false;
21444 Entry.IsZExt = true;
21445 Args.push_back(Entry);
21446
21447 SDValue Callee = DAG.getExternalSymbol(
21448 getLibcallName(RTLIB::FPEXT_F16_F32),
21449 getPointerTy(DAG.getDataLayout()));
21450 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21451 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
21452 std::move(Args));
21453
21454 SDValue Res;
21455 std::tie(Res,Chain) = LowerCallTo(CLI);
21456 if (IsStrict)
21457 Res = DAG.getMergeValues({Res, Chain}, DL);
21458
21459 return Res;
21460 }
21461
21462 In = DAG.getBitcast(MVT::i16, In);
21463 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
21464 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
21465 DAG.getIntPtrConstant(0, DL));
21466 SDValue Res;
21467 if (IsStrict) {
21468 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
21469 {Chain, In});
21470 Chain = Res.getValue(1);
21471 } else {
21472 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
21473 DAG.getTargetConstant(4, DL, MVT::i32));
21474 }
21475 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
21476 DAG.getIntPtrConstant(0, DL));
21477 if (IsStrict)
21478 return DAG.getMergeValues({Res, Chain}, DL);
21479 return Res;
21480 }
21481
21482 if (!SVT.isVector() || SVT.getVectorElementType() == MVT::bf16)
21483 return Op;
21484
21485 if (SVT.getVectorElementType() == MVT::f16) {
21486 if (Subtarget.hasFP16() && isTypeLegal(SVT))
21487 return Op;
21488 assert(Subtarget.hasF16C() && "Unexpected features!");
21489 if (SVT == MVT::v2f16)
21490 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
21491 DAG.getUNDEF(MVT::v2f16));
21492 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
21493 DAG.getUNDEF(MVT::v4f16));
21494 if (IsStrict)
21495 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21496 {Op->getOperand(0), Res});
21497 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21498 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
21499 return Op;
21500 }
21501
21502 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
21503
21504 SDValue Res =
21505 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
21506 if (IsStrict)
21507 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
21508 {Op->getOperand(0), Res});
21509 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
21510}
21511
21512SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21513 bool IsStrict = Op->isStrictFPOpcode();
21514
21515 SDLoc DL(Op);
21516 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21517 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
21518 MVT VT = Op.getSimpleValueType();
21519 MVT SVT = In.getSimpleValueType();
21520
21521 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
21522 return SDValue();
21523
21524 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
21525 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
21526 if (!Subtarget.getTargetTriple().isOSDarwin())
21527 return SDValue();
21528
21529 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
21530 TargetLowering::CallLoweringInfo CLI(DAG);
21531 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21532 
21533 TargetLowering::ArgListTy Args;
21534 TargetLowering::ArgListEntry Entry;
21535 Entry.Node = In;
21536 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
21537 Entry.IsSExt = false;
21538 Entry.IsZExt = true;
21539 Args.push_back(Entry);
21540
21541 SDValue Callee = DAG.getExternalSymbol(
21542 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
21543 : RTLIB::FPROUND_F32_F16),
21544 getPointerTy(DAG.getDataLayout()));
21545 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
21546 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
21547 std::move(Args));
21548
21549 SDValue Res;
21550 std::tie(Res, Chain) = LowerCallTo(CLI);
21551
21552 Res = DAG.getBitcast(MVT::f16, Res);
21553
21554 if (IsStrict)
21555 Res = DAG.getMergeValues({Res, Chain}, DL);
21556
21557 return Res;
21558 }
21559
21560 if (VT.getScalarType() == MVT::bf16) {
21561 if (SVT.getScalarType() == MVT::f32 &&
21562 ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21563 Subtarget.hasAVXNECONVERT()))
21564 return Op;
21565 return SDValue();
21566 }
21567
21568 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
21569 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
21570 return SDValue();
21571
21572 if (VT.isVector())
21573 return Op;
21574
21575 SDValue Res;
21576 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
21577 MVT::i32);
21578 if (IsStrict) {
21579 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
21580 DAG.getConstantFP(0, DL, MVT::v4f32), In,
21581 DAG.getIntPtrConstant(0, DL));
21582 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
21583 {Chain, Res, Rnd});
21584 Chain = Res.getValue(1);
21585 } else {
21586 // FIXME: Should we use zeros for upper elements for non-strict?
21587 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
21588 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
21589 }
21590
21591 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21592 DAG.getIntPtrConstant(0, DL));
21593 Res = DAG.getBitcast(MVT::f16, Res);
21594
21595 if (IsStrict)
21596 return DAG.getMergeValues({Res, Chain}, DL);
21597
21598 return Res;
21599 }
21600
21601 return Op;
21602}
21603
21604 static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
21605 bool IsStrict = Op->isStrictFPOpcode();
21606 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21607 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
21608 "Unexpected VT!");
21609
21610 SDLoc dl(Op);
21611 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
21612 DAG.getConstant(0, dl, MVT::v8i16), Src,
21613 DAG.getIntPtrConstant(0, dl));
21614
21615 SDValue Chain;
21616 if (IsStrict) {
21617 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
21618 {Op.getOperand(0), Res});
21619 Chain = Res.getValue(1);
21620 } else {
21621 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
21622 }
21623
21624 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
21625 DAG.getIntPtrConstant(0, dl));
21626
21627 if (IsStrict)
21628 return DAG.getMergeValues({Res, Chain}, dl);
21629
21630 return Res;
21631}
21632
21633 static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
21634 bool IsStrict = Op->isStrictFPOpcode();
21635 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21636 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
21637 "Unexpected VT!");
21638
21639 SDLoc dl(Op);
21640 SDValue Res, Chain;
21641 if (IsStrict) {
21642 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
21643 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
21644 DAG.getIntPtrConstant(0, dl));
21645 Res = DAG.getNode(
21646 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
21647 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
21648 Chain = Res.getValue(1);
21649 } else {
21650 // FIXME: Should we use zeros for upper elements for non-strict?
21651 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
21652 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
21653 DAG.getTargetConstant(4, dl, MVT::i32));
21654 }
21655
21656 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
21657 DAG.getIntPtrConstant(0, dl));
21658
21659 if (IsStrict)
21660 return DAG.getMergeValues({Res, Chain}, dl);
21661
21662 return Res;
21663}
21664
21665SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
21666 SelectionDAG &DAG) const {
21667 SDLoc DL(Op);
21668
21669 MVT SVT = Op.getOperand(0).getSimpleValueType();
21670 if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
21671 Subtarget.hasAVXNECONVERT())) {
21672 SDValue Res;
21673 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
21674 Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);
21675 Res = DAG.getBitcast(MVT::v8i16, Res);
21676 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
21677 DAG.getIntPtrConstant(0, DL));
21678 }
21679
21680 MakeLibCallOptions CallOptions;
21681 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
21682 SDValue Res =
21683 makeLibCall(DAG, LC, MVT::f16, Op.getOperand(0), CallOptions, DL).first;
21684 return DAG.getBitcast(MVT::i16, Res);
21685}
21686
21687/// Depending on uarch and/or optimizing for size, we might prefer to use a
21688/// vector operation in place of the typical scalar operation.
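/// For example, adding two f32 elements extracted from the same v4f32 source
/// can be emitted as a single HADDPS plus one element extract, instead of two
/// extracts and a scalar ADDSS.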
21689 static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL,
21690 SelectionDAG &DAG,
21691 const X86Subtarget &Subtarget) {
21692 // If both operands have other uses, this is probably not profitable.
21693 SDValue LHS = Op.getOperand(0);
21694 SDValue RHS = Op.getOperand(1);
21695 if (!LHS.hasOneUse() && !RHS.hasOneUse())
21696 return Op;
21697
21698 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
21699 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
21700 if (IsFP && !Subtarget.hasSSE3())
21701 return Op;
21702 if (!IsFP && !Subtarget.hasSSSE3())
21703 return Op;
21704
21705 // Extract from a common vector.
21706 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21707 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21708 LHS.getOperand(0) != RHS.getOperand(0) ||
21709 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
21710 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
21711 !shouldUseHorizontalOp(true, DAG, Subtarget))
21712 return Op;
21713
21714 // Allow commuted 'hadd' ops.
21715 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
21716 unsigned HOpcode;
21717 switch (Op.getOpcode()) {
21718 // clang-format off
21719 case ISD::ADD: HOpcode = X86ISD::HADD; break;
21720 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
21721 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
21722 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
21723 default:
21724 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
21725 // clang-format on
21726 }
21727 unsigned LExtIndex = LHS.getConstantOperandVal(1);
21728 unsigned RExtIndex = RHS.getConstantOperandVal(1);
21729 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
21730 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
21731 std::swap(LExtIndex, RExtIndex);
21732
21733 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
21734 return Op;
21735
21736 SDValue X = LHS.getOperand(0);
21737 EVT VecVT = X.getValueType();
21738 unsigned BitWidth = VecVT.getSizeInBits();
21739 unsigned NumLanes = BitWidth / 128;
21740 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
21741 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
21742 "Not expecting illegal vector widths here");
21743
21744 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
21745 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
21746 if (BitWidth == 256 || BitWidth == 512) {
21747 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
21748 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
21749 LExtIndex %= NumEltsPerLane;
21750 }
21751
21752 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
21753 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
21754 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
21755 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
21756 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
21757 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
21758 DAG.getIntPtrConstant(LExtIndex / 2, DL));
21759}
21760
21761/// Depending on uarch and/or optimizing for size, we might prefer to use a
21762/// vector operation in place of the typical scalar operation.
21763SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
21764 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
21765 "Only expecting float/double");
21766 return lowerAddSubToHorizontalOp(Op, SDLoc(Op), DAG, Subtarget);
21767}
21768
21769/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
21770/// This mode isn't supported in hardware on X86. But as long as we aren't
21771/// compiling with trapping math, we can emulate this with
21772/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
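/// For example, for X = 0.49999997f (the largest float below 0.5), adding an
/// exact 0.5 would round the FADD result up to 1.0 and FROUND would wrongly
/// return 1.0; adding nextafter(0.5, 0.0) keeps the sum at 0.99999994f, so the
/// final truncation correctly yields 0.0.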
21773 static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
21774 SDValue N0 = Op.getOperand(0);
21775 SDLoc dl(Op);
21776 MVT VT = Op.getSimpleValueType();
21777
21778 // N0 += copysign(nextafter(0.5, 0.0), N0)
21779 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21780 bool Ignored;
21781 APFloat Point5Pred = APFloat(0.5f);
21782 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
21783 Point5Pred.next(/*nextDown*/true);
21784
21785 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
21786 DAG.getConstantFP(Point5Pred, dl, VT), N0);
21787 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
21788
21789 // Truncate the result to remove fraction.
21790 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
21791}
21792
21793/// The only differences between FABS and FNEG are the mask and the logic op.
21794/// FNEG also has a folding opportunity for FNEG(FABS(x)).
21795 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
21796 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
21797 "Wrong opcode for lowering FABS or FNEG.");
21798
21799 bool IsFABS = (Op.getOpcode() == ISD::FABS);
21800
21801 // If this is a FABS and it has an FNEG user, bail out to fold the combination
21802 // into an FNABS. We'll lower the FABS after that if it is still in use.
21803 if (IsFABS)
21804 for (SDNode *User : Op->uses())
21805 if (User->getOpcode() == ISD::FNEG)
21806 return Op;
21807
21808 SDLoc dl(Op);
21809 MVT VT = Op.getSimpleValueType();
21810
21811 bool IsF128 = (VT == MVT::f128);
21812 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21814 "Unexpected type in LowerFABSorFNEG");
21815
21816 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOptLevel to
21817 // decide if we should generate a 16-byte constant mask when we only need 4 or
21818 // 8 bytes for the scalar case.
21819
21820 // There are no scalar bitwise logical SSE/AVX instructions, so we
21821 // generate a 16-byte vector constant and logic op even for the scalar case.
21822 // Using a 16-byte mask allows folding the load of the mask with
21823 // the logic op, so it can save (~4 bytes) on code size.
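// For example, a scalar f32 FABS is emitted as ANDPS with a constant-pool
// splat of 0x7fffffff, and a scalar f32 FNEG as XORPS with a splat of
// 0x80000000.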
21824 bool IsFakeVector = !VT.isVector() && !IsF128;
21825 MVT LogicVT = VT;
21826 if (IsFakeVector)
21827 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21828 : (VT == MVT::f32) ? MVT::v4f32
21829 : MVT::v8f16;
21830
21831 unsigned EltBits = VT.getScalarSizeInBits();
21832 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
21833 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
21834 APInt::getSignMask(EltBits);
21835 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
21836 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
21837
21838 SDValue Op0 = Op.getOperand(0);
21839 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
21840 unsigned LogicOp = IsFABS ? X86ISD::FAND :
21841 IsFNABS ? X86ISD::FOR :
21842 X86ISD::FXOR;
21843 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
21844
21845 if (VT.isVector() || IsF128)
21846 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21847
21848 // For the scalar case extend to a 128-bit vector, perform the logic op,
21849 // and extract the scalar result back out.
21850 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
21851 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
21852 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
21853 DAG.getIntPtrConstant(0, dl));
21854}
21855
21856 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
21857 SDValue Mag = Op.getOperand(0);
21858 SDValue Sign = Op.getOperand(1);
21859 SDLoc dl(Op);
21860
21861 // If the sign operand is smaller, extend it first.
21862 MVT VT = Op.getSimpleValueType();
21863 if (Sign.getSimpleValueType().bitsLT(VT))
21864 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
21865
21866 // And if it is bigger, shrink it first.
21867 if (Sign.getSimpleValueType().bitsGT(VT))
21868 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
21869 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
21870
21871 // At this point the operands and the result should have the same
21872 // type, and that won't be f80 since that is not custom lowered.
21873 bool IsF128 = (VT == MVT::f128);
21874 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
21876 "Unexpected type in LowerFCOPYSIGN");
21877
21879
21880 // Perform all scalar logic operations as 16-byte vectors because there are no
21881 // scalar FP logic instructions in SSE.
21882 // TODO: This isn't necessary. If we used scalar types, we might avoid some
21883 // unnecessary splats, but we might miss load folding opportunities. Should
21884 // this decision be based on OptimizeForSize?
21885 bool IsFakeVector = !VT.isVector() && !IsF128;
21886 MVT LogicVT = VT;
21887 if (IsFakeVector)
21888 LogicVT = (VT == MVT::f64) ? MVT::v2f64
21889 : (VT == MVT::f32) ? MVT::v4f32
21890 : MVT::v8f16;
21891
21892 // The mask constants are automatically splatted for vector types.
21893 unsigned EltSizeInBits = VT.getScalarSizeInBits();
21894 SDValue SignMask = DAG.getConstantFP(
21895 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
21896 SDValue MagMask = DAG.getConstantFP(
21897 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
21898
21899 // First, clear all bits but the sign bit from the second operand (sign).
21900 if (IsFakeVector)
21901 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
21902 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
21903
21904 // Next, clear the sign bit from the first operand (magnitude).
21905 // TODO: If we had general constant folding for FP logic ops, this check
21906 // wouldn't be necessary.
21907 SDValue MagBits;
21908 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
21909 APFloat APF = Op0CN->getValueAPF();
21910 APF.clearSign();
21911 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
21912 } else {
21913 // If the magnitude operand wasn't a constant, we need to AND out the sign.
21914 if (IsFakeVector)
21915 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
21916 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
21917 }
21918
21919 // OR the magnitude value with the sign bit.
21920 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
21921 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
21922 DAG.getIntPtrConstant(0, dl));
21923}
21924
21925 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
21926 SDValue N0 = Op.getOperand(0);
21927 SDLoc dl(Op);
21928 MVT VT = Op.getSimpleValueType();
21929
21930 MVT OpVT = N0.getSimpleValueType();
21931 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
21932 "Unexpected type for FGETSIGN");
21933
21934 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
21935 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
21936 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
21937 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
21938 Res = DAG.getZExtOrTrunc(Res, dl, VT);
21939 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
21940 return Res;
21941}
21942
21943/// Helper for attempting to create a X86ISD::BT node.
21944static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
21945 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
21946 // instruction. Since the shift amount is in-range-or-undefined, we know
21947 // that doing a bittest on the i32 value is ok. We extend to i32 because
21948 // the encoding for the i16 version is larger than the i32 version.
21949 // Also promote i16 to i32 for performance / code size reasons.
21950 if (Src.getValueType().getScalarSizeInBits() < 32)
21951 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
21952
21953 // No legal type found, give up.
21954 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
21955 return SDValue();
21956
21957 // See if we can use the 32-bit instruction instead of the 64-bit one for a
21958 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
21959 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
21960 // known to be zero.
21961 if (Src.getValueType() == MVT::i64 &&
21962 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
21963 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
21964
21965 // If the operand types disagree, extend the shift amount to match. Since
21966 // BT ignores high bits (like shifts) we can use anyextend.
21967 if (Src.getValueType() != BitNo.getValueType()) {
21968 // Peek through a mask/modulo operation.
21969 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
21970 // we probably need a better IsDesirableToPromoteOp to handle this as well.
21971 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
21972 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
21973 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21974 BitNo.getOperand(0)),
21975 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
21976 BitNo.getOperand(1)));
21977 else
21978 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
21979 }
21980
21981 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
21982}
21983
21984/// Helper for creating a X86ISD::SETCC node.
21985 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
21986 SelectionDAG &DAG) {
21987 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
21988 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
21989}
21990
21991/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
21992/// recognizable memcmp expansion.
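/// For example, (or (xor a, b), (xor c, d)) is the tree produced for a
/// two-chunk equality test: the whole expression is zero iff a == b && c == d.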
21993static bool isOrXorXorTree(SDValue X, bool Root = true) {
21994 if (X.getOpcode() == ISD::OR)
21995 return isOrXorXorTree(X.getOperand(0), false) &&
21996 isOrXorXorTree(X.getOperand(1), false);
21997 if (Root)
21998 return false;
21999 return X.getOpcode() == ISD::XOR;
22000}
22001
22002/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
22003/// expansion.
22004template <typename F>
22005 static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
22006 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
22007 SDValue Op0 = X.getOperand(0);
22008 SDValue Op1 = X.getOperand(1);
22009 if (X.getOpcode() == ISD::OR) {
22010 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22011 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
22012 if (VecVT != CmpVT)
22013 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
22014 if (HasPT)
22015 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
22016 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
22017 }
22018 if (X.getOpcode() == ISD::XOR) {
22019 SDValue A = SToV(Op0);
22020 SDValue B = SToV(Op1);
22021 if (VecVT != CmpVT)
22022 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
22023 if (HasPT)
22024 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
22025 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
22026 }
22027 llvm_unreachable("Impossible");
22028}
22029
22030/// Try to map a 128-bit or larger integer comparison to vector instructions
22031/// before type legalization splits it up into chunks.
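/// For example, an i128 equality compare can become PCMPEQB + PMOVMSKB + a
/// compare against 0xFFFF on SSE2, or PXOR + PTEST on SSE4.1, instead of two
/// 64-bit scalar compares after legalization.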
22032 static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
22033 ISD::CondCode CC,
22034 const SDLoc &DL,
22035 SelectionDAG &DAG,
22036 const X86Subtarget &Subtarget) {
22037 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
22038
22039 // We're looking for an oversized integer equality comparison.
22040 EVT OpVT = X.getValueType();
22041 unsigned OpSize = OpVT.getSizeInBits();
22042 if (!OpVT.isScalarInteger() || OpSize < 128)
22043 return SDValue();
22044
22045 // Ignore a comparison with zero because that gets special treatment in
22046 // EmitTest(). But make an exception for the special case of a pair of
22047 // logically-combined vector-sized operands compared to zero. This pattern may
22048 // be generated by the memcmp expansion pass with oversized integer compares
22049 // (see PR33325).
22050 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
22051 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
22052 return SDValue();
22053
22054 // Don't perform this combine if constructing the vector will be expensive.
22055 auto IsVectorBitCastCheap = [](SDValue X) {
22056 X = peekThroughBitcasts(X);
22057 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
22058 X.getOpcode() == ISD::LOAD;
22059 };
22060 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
22061 !IsOrXorXorTreeCCZero)
22062 return SDValue();
22063
22064 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
22065 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
22066 // Otherwise use PCMPEQ (plus AND) and mask testing.
22067 bool NoImplicitFloatOps =
22068 DAG.getMachineFunction().getFunction().hasFnAttribute(
22069 Attribute::NoImplicitFloat);
22070 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
22071 ((OpSize == 128 && Subtarget.hasSSE2()) ||
22072 (OpSize == 256 && Subtarget.hasAVX()) ||
22073 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
22074 bool HasPT = Subtarget.hasSSE41();
22075
22076 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
22077 // vector registers are essentially free. (Technically, widening registers
22078 // prevents load folding, but the tradeoff is worth it.)
22079 bool PreferKOT = Subtarget.preferMaskRegisters();
22080 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
22081
22082 EVT VecVT = MVT::v16i8;
22083 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
22084 if (OpSize == 256) {
22085 VecVT = MVT::v32i8;
22086 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
22087 }
22088 EVT CastVT = VecVT;
22089 bool NeedsAVX512FCast = false;
22090 if (OpSize == 512 || NeedZExt) {
22091 if (Subtarget.hasBWI()) {
22092 VecVT = MVT::v64i8;
22093 CmpVT = MVT::v64i1;
22094 if (OpSize == 512)
22095 CastVT = VecVT;
22096 } else {
22097 VecVT = MVT::v16i32;
22098 CmpVT = MVT::v16i1;
22099 CastVT = OpSize == 512 ? VecVT
22100 : OpSize == 256 ? MVT::v8i32
22101 : MVT::v4i32;
22102 NeedsAVX512FCast = true;
22103 }
22104 }
22105
22106 auto ScalarToVector = [&](SDValue X) -> SDValue {
22107 bool TmpZext = false;
22108 EVT TmpCastVT = CastVT;
22109 if (X.getOpcode() == ISD::ZERO_EXTEND) {
22110 SDValue OrigX = X.getOperand(0);
22111 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
22112 if (OrigSize < OpSize) {
22113 if (OrigSize == 128) {
22114 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
22115 X = OrigX;
22116 TmpZext = true;
22117 } else if (OrigSize == 256) {
22118 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
22119 X = OrigX;
22120 TmpZext = true;
22121 }
22122 }
22123 }
22124 X = DAG.getBitcast(TmpCastVT, X);
22125 if (!NeedZExt && !TmpZext)
22126 return X;
22127 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
22128 DAG.getConstant(0, DL, VecVT), X,
22129 DAG.getVectorIdxConstant(0, DL));
22130 };
22131
22132 SDValue Cmp;
22133 if (IsOrXorXorTreeCCZero) {
22134 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
22135 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
22136 // Use 2 vector equality compares and 'and' the results before doing a
22137 // MOVMSK.
22138 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
22139 } else {
22140 SDValue VecX = ScalarToVector(X);
22141 SDValue VecY = ScalarToVector(Y);
22142 if (VecVT != CmpVT) {
22143 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
22144 } else if (HasPT) {
22145 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
22146 } else {
22147 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
22148 }
22149 }
22150 // AVX512 should emit a setcc that will lower to kortest.
22151 if (VecVT != CmpVT) {
22152 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
22153 : CmpVT == MVT::v32i1 ? MVT::i32
22154 : MVT::i16;
22155 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
22156 DAG.getConstant(0, DL, KRegVT), CC);
22157 }
22158 if (HasPT) {
22159 SDValue BCCmp =
22160 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
22161 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
22162 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
22163 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
22164 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
22165 }
22166 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
22167 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
22168 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
22169 assert(Cmp.getValueType() == MVT::v16i8 &&
22170 "Non 128-bit vector on pre-SSE41 target");
22171 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
22172 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
22173 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
22174 }
22175
22176 return SDValue();
22177}
22178
22179/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
22180/// style scalarized (associative) reduction patterns. Partial reductions
22181/// are supported when the pointer SrcMask is non-null.
22182/// TODO - move this to SelectionDAG?
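/// For example, or(extract(v,0), or(extract(v,1), or(extract(v,2),
/// extract(v,3)))) matches with v as the single source and all elements used.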
22183 static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
22184 SmallVectorImpl<SDValue> &SrcOps,
22185 SmallVectorImpl<APInt> *SrcMask = nullptr) {
22186 SmallVector<SDValue, 8> Opnds;
22187 DenseMap<SDValue, APInt> SrcOpMap;
22188 EVT VT = MVT::Other;
22189
22190 // Recognize a special case where a vector is casted into wide integer to
22191 // test all 0s.
22192 assert(Op.getOpcode() == unsigned(BinOp) &&
22193 "Unexpected bit reduction opcode");
22194 Opnds.push_back(Op.getOperand(0));
22195 Opnds.push_back(Op.getOperand(1));
22196
22197 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
22198 SDValue I = Opnds[Slot];
22199 // BFS traverse all BinOp operands.
22200 if (I->getOpcode() == unsigned(BinOp)) {
22201 Opnds.push_back(I->getOperand(0));
22202 Opnds.push_back(I->getOperand(1));
22203 // Re-evaluate the number of nodes to be traversed.
22204 e += 2; // 2 more nodes (LHS and RHS) are pushed.
22205 continue;
22206 }
22207
22208 // Quit if a non-EXTRACT_VECTOR_ELT
22209 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
22210 return false;
22211
22212 // Quit if without a constant index.
22213 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
22214 if (!Idx)
22215 return false;
22216
22217 SDValue Src = I->getOperand(0);
22218 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
22219 if (M == SrcOpMap.end()) {
22220 VT = Src.getValueType();
22221 // Quit if not the same type.
22222 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
22223 return false;
22224 unsigned NumElts = VT.getVectorNumElements();
22225 APInt EltCount = APInt::getZero(NumElts);
22226 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
22227 SrcOps.push_back(Src);
22228 }
22229
22230 // Quit if element already used.
22231 unsigned CIdx = Idx->getZExtValue();
22232 if (M->second[CIdx])
22233 return false;
22234 M->second.setBit(CIdx);
22235 }
22236
22237 if (SrcMask) {
22238 // Collect the source partial masks.
22239 for (SDValue &SrcOp : SrcOps)
22240 SrcMask->push_back(SrcOpMap[SrcOp]);
22241 } else {
22242 // Quit if not all elements are used.
22243 for (const auto &I : SrcOpMap)
22244 if (!I.second.isAllOnes())
22245 return false;
22246 }
22247
22248 return true;
22249}
22250
22251// Helper function for comparing all bits of two vectors.
22252 static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
22253 ISD::CondCode CC, const APInt &OriginalMask,
22254 const X86Subtarget &Subtarget,
22255 SelectionDAG &DAG, X86::CondCode &X86CC) {
22256 EVT VT = LHS.getValueType();
22257 unsigned ScalarSize = VT.getScalarSizeInBits();
22258 if (OriginalMask.getBitWidth() != ScalarSize) {
22259 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
22260 return SDValue();
22261 }
22262
22263 // Quit if not convertible to a legal scalar or 128/256-bit vector.
22264 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22265 return SDValue();
22266
22267 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
22268 if (VT.isFloatingPoint())
22269 return SDValue();
22270
22271 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22272 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
22273
22274 APInt Mask = OriginalMask;
22275
22276 auto MaskBits = [&](SDValue Src) {
22277 if (Mask.isAllOnes())
22278 return Src;
22279 EVT SrcVT = Src.getValueType();
22280 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
22281 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
22282 };
22283
22284 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
22285 if (VT.getSizeInBits() < 128) {
22286 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
22287 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
22288 if (IntVT != MVT::i64)
22289 return SDValue();
22290 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
22291 MVT::i32, MVT::i32);
22292 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
22293 MVT::i32, MVT::i32);
22294 SDValue Lo =
22295 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
22296 SDValue Hi =
22297 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
22298 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22299 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
22300 DAG.getConstant(0, DL, MVT::i32));
22301 }
22302 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
22303 DAG.getBitcast(IntVT, MaskBits(LHS)),
22304 DAG.getBitcast(IntVT, MaskBits(RHS)));
22305 }
22306
22307 // Without PTEST, a masked v2i64 or-reduction is not faster than
22308 // scalarization.
22309 bool UseKORTEST = Subtarget.useAVX512Regs();
22310 bool UsePTEST = Subtarget.hasSSE41();
22311 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
22312 return SDValue();
22313
22314 // Split down to 128/256/512-bit vector.
22315 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
22316
22317 // If the input vector has vector elements wider than the target test size,
22318 // then cast to <X x i64> so it will safely split.
22319 if (ScalarSize > TestSize) {
22320 if (!Mask.isAllOnes())
22321 return SDValue();
22322 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
22323 LHS = DAG.getBitcast(VT, LHS);
22324 RHS = DAG.getBitcast(VT, RHS);
22325 Mask = APInt::getAllOnes(64);
22326 }
22327
22328 if (VT.getSizeInBits() > TestSize) {
22329 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
22330 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
22331 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
22332 while (VT.getSizeInBits() > TestSize) {
22333 auto Split = DAG.SplitVector(LHS, DL);
22334 VT = Split.first.getValueType();
22335 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22336 }
22337 RHS = DAG.getAllOnesConstant(DL, VT);
22338 } else if (!UsePTEST && !KnownRHS.isZero()) {
22339 // MOVMSK Special Case:
22340 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
22341 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
22342 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
22343 LHS = DAG.getBitcast(VT, MaskBits(LHS));
22344 RHS = DAG.getBitcast(VT, MaskBits(RHS));
22345 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
22346 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
22347 V = DAG.getSExtOrTrunc(V, DL, VT);
22348 while (VT.getSizeInBits() > TestSize) {
22349 auto Split = DAG.SplitVector(V, DL);
22350 VT = Split.first.getValueType();
22351 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
22352 }
22353 V = DAG.getNOT(DL, V, VT);
22354 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22355 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22356 DAG.getConstant(0, DL, MVT::i32));
22357 } else {
22358 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
22359 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
22360 while (VT.getSizeInBits() > TestSize) {
22361 auto Split = DAG.SplitVector(V, DL);
22362 VT = Split.first.getValueType();
22363 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
22364 }
22365 LHS = V;
22366 RHS = DAG.getConstant(0, DL, VT);
22367 }
22368 }
22369
22370 if (UseKORTEST && VT.is512BitVector()) {
22371 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22372 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
22373 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22374 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22375 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
22376 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
22377 }
22378
22379 if (UsePTEST) {
22380 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
22381 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
22382 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
22383 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
22384 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
22385 }
22386
22387 assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
22388 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
22389 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
22390 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
22391 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
22392 V = DAG.getNOT(DL, V, MaskVT);
22393 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
22394 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
22395 DAG.getConstant(0, DL, MVT::i32));
22396}
22397
22398 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
22399// to CMP(MOVMSK(PCMPEQB(X,Y))).
22400 static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
22401 ISD::CondCode CC, const SDLoc &DL,
22402 const X86Subtarget &Subtarget,
22403 SelectionDAG &DAG,
22404 X86::CondCode &X86CC) {
22405 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
22406
22407 bool CmpNull = isNullConstant(RHS);
22408 bool CmpAllOnes = isAllOnesConstant(RHS);
22409 if (!CmpNull && !CmpAllOnes)
22410 return SDValue();
22411
22412 SDValue Op = LHS;
22413 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
22414 return SDValue();
22415
22416 // Check whether we're masking/truncating an OR-reduction result, in which
22417 // case track the masked bits.
22418 // TODO: Add CmpAllOnes support.
22419 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
22420 if (CmpNull) {
22421 switch (Op.getOpcode()) {
22422 case ISD::TRUNCATE: {
22423 SDValue Src = Op.getOperand(0);
22424 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
22425 Op.getScalarValueSizeInBits());
22426 Op = Src;
22427 break;
22428 }
22429 case ISD::AND: {
22430 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
22431 Mask = Cst->getAPIntValue();
22432 Op = Op.getOperand(0);
22433 }
22434 break;
22435 }
22436 }
22437 }
22438
22439 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
22440
22441 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
22442 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
22443 SmallVector<SDValue, 8> VecIns;
22444 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
22445 EVT VT = VecIns[0].getValueType();
22446 assert(llvm::all_of(VecIns,
22447 [VT](SDValue V) { return VT == V.getValueType(); }) &&
22448 "Reduction source vector mismatch");
22449
22450 // Quit if not splittable to scalar/128/256/512-bit vector.
22451 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
22452 return SDValue();
22453
22454 // If more than one full vector is evaluated, AND/OR them first before
22455 // PTEST.
22456 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
22457 Slot += 2, e += 1) {
22458 // Each iteration will AND/OR 2 nodes and append the result until there is
22459 // only 1 node left, i.e. the final value of all vectors.
22460 SDValue LHS = VecIns[Slot];
22461 SDValue RHS = VecIns[Slot + 1];
22462 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
22463 }
22464
22465 return LowerVectorAllEqual(DL, VecIns.back(),
22466 CmpNull ? DAG.getConstant(0, DL, VT)
22467 : DAG.getAllOnesConstant(DL, VT),
22468 CC, Mask, Subtarget, DAG, X86CC);
22469 }
22470
22471 // Match icmp(reduce_or(X),0) anyof reduction patterns.
22472 // Match icmp(reduce_and(X),-1) allof reduction patterns.
22473 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
22474 ISD::NodeType BinOp;
22475 if (SDValue Match =
22476 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
22477 EVT MatchVT = Match.getValueType();
22478 return LowerVectorAllEqual(DL, Match,
22479 CmpNull ? DAG.getConstant(0, DL, MatchVT)
22480 : DAG.getAllOnesConstant(DL, MatchVT),
22481 CC, Mask, Subtarget, DAG, X86CC);
22482 }
22483 }
22484
22485 if (Mask.isAllOnes()) {
22486 assert(!Op.getValueType().isVector() &&
22487 "Illegal vector type for reduction pattern");
22488 SDValue Src = peekThroughBitcasts(Op);
22489 if (Src.getValueType().isFixedLengthVector() &&
22490 Src.getValueType().getScalarType() == MVT::i1) {
22491 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
22492 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
22493 if (Src.getOpcode() == ISD::SETCC) {
22494 SDValue LHS = Src.getOperand(0);
22495 SDValue RHS = Src.getOperand(1);
22496 EVT LHSVT = LHS.getValueType();
22497 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
22498 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
22499 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
22500 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
22501 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
22502 X86CC);
22503 }
22504 }
22505 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
22506 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
22507 // Peek through truncation, mask the LSB and compare against zero/LSB.
22508 if (Src.getOpcode() == ISD::TRUNCATE) {
22509 SDValue Inner = Src.getOperand(0);
22510 EVT InnerVT = Inner.getValueType();
22511 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
22512 unsigned BW = InnerVT.getScalarSizeInBits();
22513 APInt SrcMask = APInt(BW, 1);
22514 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
22515 return LowerVectorAllEqual(DL, Inner,
22516 DAG.getConstant(Cmp, DL, InnerVT), CC,
22517 SrcMask, Subtarget, DAG, X86CC);
22518 }
22519 }
22520 }
22521 }
22522
22523 return SDValue();
22524}
22525
22526/// return true if \c Op has a use that doesn't just read flags.
22527 static bool hasNonFlagsUse(SDValue Op) {
22528 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
22529 ++UI) {
22530 SDNode *User = *UI;
22531 unsigned UOpNo = UI.getOperandNo();
22532 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
22533 // Look past the truncate.
22534 UOpNo = User->use_begin().getOperandNo();
22535 User = *User->use_begin();
22536 }
22537
22538 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
22539 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
22540 return true;
22541 }
22542 return false;
22543}
22544
22545// Transform to an x86-specific ALU node with flags if there is a chance of
22546// using an RMW op or only the flags are used. Otherwise, leave
22547// the node alone and emit a 'cmp' or 'test' instruction.
22548 static bool isProfitableToUseFlagOp(SDValue Op) {
22549 for (SDNode *U : Op->uses())
22550 if (U->getOpcode() != ISD::CopyToReg &&
22551 U->getOpcode() != ISD::SETCC &&
22552 U->getOpcode() != ISD::STORE)
22553 return false;
22554
22555 return true;
22556}
22557
22558/// Emit nodes that will be selected as "test Op0,Op0", or something
22559/// equivalent.
22560static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
22561 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
22562 // CF and OF aren't always set the way we want. Determine which
22563 // of these we need.
22564 bool NeedCF = false;
22565 bool NeedOF = false;
22566 switch (X86CC) {
22567 default: break;
22568 case X86::COND_A: case X86::COND_AE:
22569 case X86::COND_B: case X86::COND_BE:
22570 NeedCF = true;
22571 break;
22572 case X86::COND_G: case X86::COND_GE:
22573 case X86::COND_L: case X86::COND_LE:
22574 case X86::COND_O: case X86::COND_NO: {
22575 // Check if we really need to set the
22576 // Overflow flag. If NoSignedWrap is present
22577 // that is not actually needed.
22578 switch (Op->getOpcode()) {
22579 case ISD::ADD:
22580 case ISD::SUB:
22581 case ISD::MUL:
22582 case ISD::SHL:
22583 if (Op.getNode()->getFlags().hasNoSignedWrap())
22584 break;
22585 [[fallthrough]];
22586 default:
22587 NeedOF = true;
22588 break;
22589 }
22590 break;
22591 }
22592 }
22593 // See if we can use the EFLAGS value from the operand instead of
22594 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
22595 // we prove that the arithmetic won't overflow, we can't use OF or CF.
22596 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
22597 // Emit a CMP with 0, which is the TEST pattern.
22598 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22599 DAG.getConstant(0, dl, Op.getValueType()));
22600 }
22601 unsigned Opcode = 0;
22602 unsigned NumOperands = 0;
22603
22604 SDValue ArithOp = Op;
22605
22606 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
22607 // which may be the result of a CAST. We use the variable 'Op', which is the
22608 // non-casted variable when we check for possible users.
22609 switch (ArithOp.getOpcode()) {
22610 case ISD::AND:
22611 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
22612 // because a TEST instruction will be better.
22613 if (!hasNonFlagsUse(Op))
22614 break;
22615
22616 [[fallthrough]];
22617 case ISD::ADD:
22618 case ISD::SUB:
22619 case ISD::OR:
22620 case ISD::XOR:
22621 if (!isProfitableToUseFlagOp(Op))
22622 break;
22623
22624 // Otherwise use a regular EFLAGS-setting instruction.
22625 switch (ArithOp.getOpcode()) {
22626 // clang-format off
22627 default: llvm_unreachable("unexpected operator!");
22628 case ISD::ADD: Opcode = X86ISD::ADD; break;
22629 case ISD::SUB: Opcode = X86ISD::SUB; break;
22630 case ISD::XOR: Opcode = X86ISD::XOR; break;
22631 case ISD::AND: Opcode = X86ISD::AND; break;
22632 case ISD::OR: Opcode = X86ISD::OR; break;
22633 // clang-format on
22634 }
22635
22636 NumOperands = 2;
22637 break;
22638 case X86ISD::ADD:
22639 case X86ISD::SUB:
22640 case X86ISD::OR:
22641 case X86ISD::XOR:
22642 case X86ISD::AND:
22643 return SDValue(Op.getNode(), 1);
22644 case ISD::SSUBO:
22645 case ISD::USUBO: {
22646 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
22647 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22648 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
22649 Op->getOperand(1)).getValue(1);
22650 }
22651 default:
22652 break;
22653 }
22654
22655 if (Opcode == 0) {
22656 // Emit a CMP with 0, which is the TEST pattern.
22657 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
22658 DAG.getConstant(0, dl, Op.getValueType()));
22659 }
22660 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
22661 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
22662
22663 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
22664 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
22665 return SDValue(New.getNode(), 1);
22666}
22667
22668/// Emit nodes that will be selected as "cmp Op0,Op1", or something
22669/// equivalent.
22670static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
22671 const SDLoc &dl, SelectionDAG &DAG,
22672 const X86Subtarget &Subtarget) {
22673 if (isNullConstant(Op1))
22674 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
22675
22676 EVT CmpVT = Op0.getValueType();
22677
22678 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
22679 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
22680
22681 // Only promote the compare up to I32 if it is a 16 bit operation
22682 // with an immediate. 16 bit immediates are to be avoided.
22683 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
22684 !DAG.getMachineFunction().getFunction().hasMinSize()) {
22685 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
22686 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
22687 // Don't do this if the immediate can fit in 8-bits.
22688 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
22689 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
22690 unsigned ExtendOp =
22691 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22692 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
22693 // For equality comparisons try to use SIGN_EXTEND if the input was
22694 // truncate from something with enough sign bits.
22695 if (Op0.getOpcode() == ISD::TRUNCATE) {
22696 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
22697 ExtendOp = ISD::SIGN_EXTEND;
22698 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
22699 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
22700 ExtendOp = ISD::SIGN_EXTEND;
22701 }
22702 }
22703
22704 CmpVT = MVT::i32;
22705 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
22706 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
22707 }
22708 }
22709
22710 // Try to shrink i64 compares if the input has enough zero bits.
22711 // FIXME: Do this for non-constant compares for constant on LHS?
22712 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
22713 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
22714 Op1->getAsAPIntVal().getActiveBits() <= 32 &&
22715 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
22716 CmpVT = MVT::i32;
22717 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
22718 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
22719 }
22720
22721 // 0-x == y --> x+y == 0
22722 // 0-x != y --> x+y != 0
22723 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
22724 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22725 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22726 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
22727 return Add.getValue(1);
22728 }
22729
22730 // x == 0-y --> x+y == 0
22731 // x != 0-y --> x+y != 0
22732 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
22733 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
22734 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22735 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
22736 return Add.getValue(1);
22737 }
22738
22739 // Use SUB instead of CMP to enable CSE between SUB and CMP.
22740 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
22741 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
22742 return Sub.getValue(1);
22743}
22744
22745 bool X86TargetLowering::isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond,
22746 EVT VT) const {
22747 return !VT.isVector() || Cond != ISD::CondCode::SETEQ;
22748}
22749
22750bool X86TargetLowering::optimizeFMulOrFDivAsShiftAddBitcast(
22751 SDNode *N, SDValue, SDValue IntPow2) const {
22752 if (N->getOpcode() == ISD::FDIV)
22753 return true;
22754
22755 EVT FPVT = N->getValueType(0);
22756 EVT IntVT = IntPow2.getValueType();
22757
22758 // This indicates a non-free bitcast.
22759 // TODO: This is probably overly conservative as we will need to scale the
22760 // integer vector anyways for the int->fp cast.
22761 if (FPVT.isVector() &&
22762 FPVT.getScalarSizeInBits() != IntVT.getScalarSizeInBits())
22763 return false;
22764
22765 return true;
22766}
22767
22768/// Check if replacement of SQRT with RSQRT should be disabled.
22769bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
22770 EVT VT = Op.getValueType();
22771
22772 // We don't need to replace SQRT with RSQRT for half type.
22773 if (VT.getScalarType() == MVT::f16)
22774 return true;
22775
22776 // We never want to use both SQRT and RSQRT instructions for the same input.
22777 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
22778 return false;
22779
22780 if (VT.isVector())
22781 return Subtarget.hasFastVectorFSQRT();
22782 return Subtarget.hasFastScalarFSQRT();
22783}
22784
22785/// The minimum architected relative accuracy is 2^-12. We need one
22786/// Newton-Raphson step to have a good float result (24 bits of precision).
22787SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
22788 SelectionDAG &DAG, int Enabled,
22789 int &RefinementSteps,
22790 bool &UseOneConstNR,
22791 bool Reciprocal) const {
22792 SDLoc DL(Op);
22793 EVT VT = Op.getValueType();
22794
22795 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
22796 // It is likely not profitable to do this for f64 because a double-precision
22797 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
22798 // instructions: convert to single, rsqrtss, convert back to double, refine
22799 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
22800 // along with FMA, this could be a throughput win.
22801 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
22802 // after legalize types.
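// One Newton-Raphson step refines an estimate E of 1/sqrt(A) as
// E' = E * (1.5 - 0.5 * A * E * E), roughly doubling the number of accurate
// bits (2^-12 -> roughly 2^-24 for f32).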
22803 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22804 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
22805 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
22806 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22807 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22808 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22809 RefinementSteps = 1;
22810
22811 UseOneConstNR = false;
22812 // There is no FSQRT for 512-bits, but there is RSQRT14.
22813 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
22814 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
22815 if (RefinementSteps == 0 && !Reciprocal)
22816 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
22817 return Estimate;
22818 }
22819
22820 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22821 Subtarget.hasFP16()) {
22822 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
22823 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22824 RefinementSteps = 0;
22825
22826 if (VT == MVT::f16) {
22827 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22828 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22829 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22830 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
22831 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22832 }
22833
22834 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
22835 }
22836 return SDValue();
22837}
22838
22839/// The minimum architected relative accuracy is 2^-12. We need one
22840/// Newton-Raphson step to have a good float result (24 bits of precision).
22841SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
22842 int Enabled,
22843 int &RefinementSteps) const {
22844 SDLoc DL(Op);
22845 EVT VT = Op.getValueType();
22846
22847 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
22848 // It is likely not profitable to do this for f64 because a double-precision
22849 // reciprocal estimate with refinement on x86 prior to FMA requires
22850 // 15 instructions: convert to single, rcpss, convert back to double, refine
22851 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
22852 // along with FMA, this could be a throughput win.
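// One Newton-Raphson step refines an estimate E of 1/A as E' = E * (2 - A * E),
// again roughly doubling the number of accurate bits per step.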
22853
22854 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
22855 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
22856 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
22857 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
22858 // Enable estimate codegen with 1 refinement step for vector division.
22859 // Scalar division estimates are disabled because they break too much
22860 // real-world code. These defaults are intended to match GCC behavior.
22861 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
22862 return SDValue();
22863
22864 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22865 RefinementSteps = 1;
22866
22867 // There is no FSQRT for 512-bits, but there is RCP14.
22868 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
22869 return DAG.getNode(Opcode, DL, VT, Op);
22870 }
22871
22872 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
22873 Subtarget.hasFP16()) {
22874 if (RefinementSteps == ReciprocalEstimate::Unspecified)
22875 RefinementSteps = 0;
22876
22877 if (VT == MVT::f16) {
22878 SDValue Zero = DAG.getIntPtrConstant(0, DL);
22879 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
22880 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
22881 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
22882 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
22883 }
22884
22885 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
22886 }
22887 return SDValue();
22888}
22889
22890/// If we have at least two divisions that use the same divisor, convert to
22891/// multiplication by a reciprocal. This may need to be adjusted for a given
22892/// CPU if a division's cost is not at least twice the cost of a multiplication.
22893/// This is because we still need one division to calculate the reciprocal and
22894/// then we need two multiplies by that reciprocal as replacements for the
22895/// original divisions.
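/// For example, given a/d and b/d, we emit r = 1.0/d and then a*r and b*r:
/// one division and two multiplications instead of two divisions.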
22896unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
22897 return 2;
22898}
22899
22900SDValue
22901X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
22902 SelectionDAG &DAG,
22903 SmallVectorImpl<SDNode *> &Created) const {
22904 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
22905 if (isIntDivCheap(N->getValueType(0), Attr))
22906 return SDValue(N,0); // Lower SDIV as SDIV
22907
22908 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
22909 "Unexpected divisor!");
22910
22911 // Only perform this transform if CMOV is supported otherwise the select
22912 // below will become a branch.
22913 if (!Subtarget.canUseCMOV())
22914 return SDValue();
22915
22916 // fold (sdiv X, pow2)
22917 EVT VT = N->getValueType(0);
22918 // FIXME: Support i8.
22919 if (VT != MVT::i16 && VT != MVT::i32 &&
22920 !(Subtarget.is64Bit() && VT == MVT::i64))
22921 return SDValue();
22922
22923 // If the divisor is 2 or -2, the default expansion is better.
22924 if (Divisor == 2 ||
22925 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
22926 return SDValue();
22927
22928 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
22929}
22930
22931/// Result of 'and' is compared against zero. Change to a BT node if possible.
22932/// Returns the BT node and the condition code needed to use it.
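/// For example, (and (srl %x, %c), 1) != 0 becomes BT %x, %c with X86::COND_B,
/// while the == 0 form uses X86::COND_AE (carry clear means the bit was zero).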
22933 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
22934 SelectionDAG &DAG, X86::CondCode &X86CC) {
22935 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
22936 SDValue Op0 = And.getOperand(0);
22937 SDValue Op1 = And.getOperand(1);
22938 if (Op0.getOpcode() == ISD::TRUNCATE)
22939 Op0 = Op0.getOperand(0);
22940 if (Op1.getOpcode() == ISD::TRUNCATE)
22941 Op1 = Op1.getOperand(0);
22942
22943 SDValue Src, BitNo;
22944 if (Op1.getOpcode() == ISD::SHL)
22945 std::swap(Op0, Op1);
22946 if (Op0.getOpcode() == ISD::SHL) {
22947 if (isOneConstant(Op0.getOperand(0))) {
22948 // If we looked past a truncate, check that it's only truncating away
22949 // known zeros.
22950 unsigned BitWidth = Op0.getValueSizeInBits();
22951 unsigned AndBitWidth = And.getValueSizeInBits();
22952 if (BitWidth > AndBitWidth) {
22953 KnownBits Known = DAG.computeKnownBits(Op0);
22954 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
22955 return SDValue();
22956 }
22957 Src = Op1;
22958 BitNo = Op0.getOperand(1);
22959 }
22960 } else if (Op1.getOpcode() == ISD::Constant) {
22961 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
22962 uint64_t AndRHSVal = AndRHS->getZExtValue();
22963 SDValue AndLHS = Op0;
22964
22965 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
22966 Src = AndLHS.getOperand(0);
22967 BitNo = AndLHS.getOperand(1);
22968 } else {
22969 // Use BT if the immediate can't be encoded in a TEST instruction or we
22970 // are optimizing for size and the immediate won't fit in a byte.
22971 bool OptForSize = DAG.shouldOptForSize();
22972 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
22973 isPowerOf2_64(AndRHSVal)) {
22974 Src = AndLHS;
22975 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
22976 Src.getValueType());
22977 }
22978 }
22979 }
22980
22981 // No patterns found, give up.
22982 if (!Src.getNode())
22983 return SDValue();
22984
22985 // Remove any bit flip.
22986 if (isBitwiseNot(Src)) {
22987 Src = Src.getOperand(0);
22988 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
22989 }
22990
22991 // Attempt to create the X86ISD::BT node.
22992 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
22993 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
22994 return BT;
22995 }
22996
22997 return SDValue();
22998}
22999
23000// Check if pre-AVX condcode can be performed by a single FCMP op.
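// SETONE and SETUEQ have no single CMPPS/CMPSS immediate before AVX; they need
// a pair of compares (e.g. CMPNEQ + CMPORD combined for ONE), so they are not
// considered cheap here.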
23001static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
23002 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
23003}
23004
23005/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
23006/// CMPs.
23007static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
23008 SDValue &Op1, bool &IsAlwaysSignaling) {
23009 unsigned SSECC;
23010 bool Swap = false;
23011
23012 // SSE Condition code mapping:
23013 // 0 - EQ
23014 // 1 - LT
23015 // 2 - LE
23016 // 3 - UNORD
23017 // 4 - NEQ
23018 // 5 - NLT
23019 // 6 - NLE
23020 // 7 - ORD
23021 switch (SetCCOpcode) {
23022 // clang-format off
23023 default: llvm_unreachable("Unexpected SETCC condition");
23024 case ISD::SETOEQ:
23025 case ISD::SETEQ: SSECC = 0; break;
23026 case ISD::SETOGT:
23027 case ISD::SETGT: Swap = true; [[fallthrough]];
23028 case ISD::SETLT:
23029 case ISD::SETOLT: SSECC = 1; break;
23030 case ISD::SETOGE:
23031 case ISD::SETGE: Swap = true; [[fallthrough]];
23032 case ISD::SETLE:
23033 case ISD::SETOLE: SSECC = 2; break;
23034 case ISD::SETUO: SSECC = 3; break;
23035 case ISD::SETUNE:
23036 case ISD::SETNE: SSECC = 4; break;
23037 case ISD::SETULE: Swap = true; [[fallthrough]];
23038 case ISD::SETUGE: SSECC = 5; break;
23039 case ISD::SETULT: Swap = true; [[fallthrough]];
23040 case ISD::SETUGT: SSECC = 6; break;
23041 case ISD::SETO: SSECC = 7; break;
23042 case ISD::SETUEQ: SSECC = 8; break;
23043 case ISD::SETONE: SSECC = 12; break;
23044 // clang-format on
23045 }
23046 if (Swap)
23047 std::swap(Op0, Op1);
23048
23049 switch (SetCCOpcode) {
23050 default:
23051 IsAlwaysSignaling = true;
23052 break;
23053 case ISD::SETEQ:
23054 case ISD::SETOEQ:
23055 case ISD::SETUEQ:
23056 case ISD::SETNE:
23057 case ISD::SETONE:
23058 case ISD::SETUNE:
23059 case ISD::SETO:
23060 case ISD::SETUO:
23061 IsAlwaysSignaling = false;
23062 break;
23063 }
23064
23065 return SSECC;
23066}
23067
23068/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
23069/// concatenate the result back.
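/// For example, on a target without AVX2 a v32i8 compare is lowered as two
/// v16i8 compares whose results are concatenated back into a v32i8 vector.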
23070static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
23071                              ISD::CondCode Cond, SelectionDAG &DAG,
23072                              const SDLoc &dl) {
23073 assert(VT.isInteger() && VT == LHS.getValueType() &&
23074 VT == RHS.getValueType() && "Unsupported VTs!");
23075
23076 SDValue CC = DAG.getCondCode(Cond);
23077
23078 // Extract the LHS Lo/Hi vectors
23079 SDValue LHS1, LHS2;
23080 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
23081
23082 // Extract the RHS Lo/Hi vectors
23083 SDValue RHS1, RHS2;
23084 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
23085
23086 // Issue the operation on the smaller types and concatenate the result back
23087 EVT LoVT, HiVT;
23088 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
23089 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
23090 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
23091 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
23092}
23093
23094static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
23095
23096 SDValue Op0 = Op.getOperand(0);
23097 SDValue Op1 = Op.getOperand(1);
23098 SDValue CC = Op.getOperand(2);
23099 MVT VT = Op.getSimpleValueType();
23100 SDLoc dl(Op);
23101
23102 assert(VT.getVectorElementType() == MVT::i1 &&
23103 "Cannot set masked compare for this operation");
23104
23105 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
23106
23107 // Prefer SETGT over SETLT.
23108 if (SetCCOpcode == ISD::SETLT) {
23109 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
23110 std::swap(Op0, Op1);
23111 }
23112
23113 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
23114}
23115
23116/// Given a buildvector constant, return a new vector constant with each element
23117/// incremented or decremented. If incrementing or decrementing would result in
23118/// unsigned overflow or underflow or this is not a simple vector constant,
23119/// return an empty value.
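/// For example, incrementing <i32 1, i32 2, i32 3, i32 4> yields
/// <i32 2, i32 3, i32 4, i32 5>, while decrementing a vector that contains a
/// zero element returns an empty SDValue (unsigned underflow).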
23120static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
23121                                    bool NSW) {
23122 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
23123 if (!BV || !V.getValueType().isSimple())
23124 return SDValue();
23125
23126 MVT VT = V.getSimpleValueType();
23127 MVT EltVT = VT.getVectorElementType();
23128 unsigned NumElts = VT.getVectorNumElements();
23129  SmallVector<SDValue, 8> NewVecC;
23130  SDLoc DL(V);
23131 for (unsigned i = 0; i < NumElts; ++i) {
23132 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
23133 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
23134 return SDValue();
23135
23136 // Avoid overflow/underflow.
23137 const APInt &EltC = Elt->getAPIntValue();
23138 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
23139 return SDValue();
23140 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
23141 (!IsInc && EltC.isMinSignedValue())))
23142 return SDValue();
23143
23144 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
23145 }
23146
23147 return DAG.getBuildVector(VT, DL, NewVecC);
23148}
23149
23150/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
23151/// Op0 u<= Op1:
23152/// t = psubus Op0, Op1
23153/// pcmpeq t, <0..0>
23154static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
23155                                    ISD::CondCode Cond, const SDLoc &dl,
23156 const X86Subtarget &Subtarget,
23157 SelectionDAG &DAG) {
23158 if (!Subtarget.hasSSE2())
23159 return SDValue();
23160
23161 MVT VET = VT.getVectorElementType();
23162 if (VET != MVT::i8 && VET != MVT::i16)
23163 return SDValue();
23164
23165 switch (Cond) {
23166 default:
23167 return SDValue();
23168 case ISD::SETULT: {
23169    // If the comparison is against a constant we can turn this into a
23170    // setule. With psubus, setule does not require a swap. This is
23171    // beneficial because the constant in the register is no longer
23172    // clobbered as the destination, so it can be hoisted out of a loop.
23173    // Only do this pre-AVX, since with AVX vpcmp* is no longer destructive.
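    // For example, 'X u< 10' becomes 'X u<= 9', which is then emitted below
    // as pcmpeq (psubus X, 9), 0.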
23174 if (Subtarget.hasAVX())
23175 return SDValue();
23176 SDValue ULEOp1 =
23177 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
23178 if (!ULEOp1)
23179 return SDValue();
23180 Op1 = ULEOp1;
23181 break;
23182 }
23183 case ISD::SETUGT: {
23184 // If the comparison is against a constant, we can turn this into a setuge.
23185 // This is beneficial because materializing a constant 0 for the PCMPEQ is
23186 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
23187 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
23188 SDValue UGEOp1 =
23189 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
23190 if (!UGEOp1)
23191 return SDValue();
23192 Op1 = Op0;
23193 Op0 = UGEOp1;
23194 break;
23195 }
23196 // Psubus is better than flip-sign because it requires no inversion.
23197 case ISD::SETUGE:
23198 std::swap(Op0, Op1);
23199 break;
23200 case ISD::SETULE:
23201 break;
23202 }
23203
23204 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
23205 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
23206 DAG.getConstant(0, dl, VT));
23207}
23208
23209static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
23210 SelectionDAG &DAG) {
23211 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23212 Op.getOpcode() == ISD::STRICT_FSETCCS;
23213 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23214 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23215 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
23216 MVT VT = Op->getSimpleValueType(0);
23217 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
23218 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
23219 SDLoc dl(Op);
23220
23221 if (isFP) {
23222    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
23223    assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
23224 if (isSoftF16(EltVT, Subtarget))
23225 return SDValue();
23226
23227 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23228 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23229
23230 // If we have a strict compare with a vXi1 result and the input is 128/256
23231 // bits we can't use a masked compare unless we have VLX. If we use a wider
23232 // compare like we do for non-strict, we might trigger spurious exceptions
23233    // from the upper elements. Instead emit an AVX compare and convert to mask.
23234 unsigned Opc;
23235 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
23236 (!IsStrict || Subtarget.hasVLX() ||
23237         Op0.getSimpleValueType().is512BitVector())) {
23238#ifndef NDEBUG
23239 unsigned Num = VT.getVectorNumElements();
23240 assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
23241#endif
23242 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
23243 } else {
23244 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
23245 // The SSE/AVX packed FP comparison nodes are defined with a
23246 // floating-point vector result that matches the operand type. This allows
23247 // them to work with an SSE1 target (integer vector types are not legal).
23248 VT = Op0.getSimpleValueType();
23249 }
23250
23251 SDValue Cmp;
23252 bool IsAlwaysSignaling;
23253 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
23254 if (!Subtarget.hasAVX()) {
23255 // TODO: We could use following steps to handle a quiet compare with
23256 // signaling encodings.
23257 // 1. Get ordered masks from a quiet ISD::SETO
23258 // 2. Use the masks to mask potential unordered elements in operand A, B
23259 // 3. Get the compare results of masked A, B
23260      // 4. Calculate the final result using the mask and the result from 3
23261 // But currently, we just fall back to scalar operations.
23262 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
23263 return SDValue();
23264
23265      // Insert an extra signaling instruction to raise an exception.
23266 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
23267 SDValue SignalCmp = DAG.getNode(
23268 Opc, dl, {VT, MVT::Other},
23269 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
23270 // FIXME: It seems we need to update the flags of all new strict nodes.
23271 // Otherwise, mayRaiseFPException in MI will return false due to
23272 // NoFPExcept = false by default. However, I didn't find it in other
23273 // patches.
23274 SignalCmp->setFlags(Op->getFlags());
23275 Chain = SignalCmp.getValue(1);
23276 }
23277
23278 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
23279 // emit two comparisons and a logic op to tie them together.
23280 if (!cheapX86FSETCC_SSE(Cond)) {
23281 // LLVM predicate is SETUEQ or SETONE.
23282 unsigned CC0, CC1;
23283 unsigned CombineOpc;
23284 if (Cond == ISD::SETUEQ) {
23285 CC0 = 3; // UNORD
23286 CC1 = 0; // EQ
23287 CombineOpc = X86ISD::FOR;
23288 } else {
23289        assert(Cond == ISD::SETONE);
23290        CC0 = 7; // ORD
23291 CC1 = 4; // NEQ
23292 CombineOpc = X86ISD::FAND;
23293 }
23294
23295 SDValue Cmp0, Cmp1;
23296 if (IsStrict) {
23297 Cmp0 = DAG.getNode(
23298 Opc, dl, {VT, MVT::Other},
23299 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
23300 Cmp1 = DAG.getNode(
23301 Opc, dl, {VT, MVT::Other},
23302 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
23303 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
23304 Cmp1.getValue(1));
23305 } else {
23306 Cmp0 = DAG.getNode(
23307 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
23308 Cmp1 = DAG.getNode(
23309 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
23310 }
23311 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
23312 } else {
23313 if (IsStrict) {
23314 Cmp = DAG.getNode(
23315 Opc, dl, {VT, MVT::Other},
23316 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23317 Chain = Cmp.getValue(1);
23318 } else
23319 Cmp = DAG.getNode(
23320 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23321 }
23322 } else {
23323 // Handle all other FP comparisons here.
23324 if (IsStrict) {
23325        // Flip already-signaling CCs before setting bit 4 of the AVX CC.
23326 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
23327 Cmp = DAG.getNode(
23328 Opc, dl, {VT, MVT::Other},
23329 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
23330 Chain = Cmp.getValue(1);
23331 } else
23332 Cmp = DAG.getNode(
23333 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
23334 }
23335
23336 if (VT.getFixedSizeInBits() >
23337 Op.getSimpleValueType().getFixedSizeInBits()) {
23338 // We emitted a compare with an XMM/YMM result. Finish converting to a
23339 // mask register using a vptestm.
23340      MVT CastVT = VT.changeVectorElementTypeToInteger();
23341      Cmp = DAG.getBitcast(CastVT, Cmp);
23342 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
23343 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
23344 } else {
23345 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
23346 // the result type of SETCC. The bitcast is expected to be optimized
23347 // away during combining/isel.
23348 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
23349 }
23350
23351 if (IsStrict)
23352 return DAG.getMergeValues({Cmp, Chain}, dl);
23353
23354 return Cmp;
23355 }
23356
23357 assert(!IsStrict && "Strict SETCC only handles FP operands.");
23358
23359 MVT VTOp0 = Op0.getSimpleValueType();
23360 (void)VTOp0;
23361 assert(VTOp0 == Op1.getSimpleValueType() &&
23362 "Expected operands with same type!");
23363  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
23364         "Invalid number of packed elements for source and destination!");
23365
23366 // The non-AVX512 code below works under the assumption that source and
23367 // destination types are the same.
23368 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
23369 "Value types for source and destination must be the same!");
23370
23371 // The result is boolean, but operands are int/float
23372 if (VT.getVectorElementType() == MVT::i1) {
23373    // In the AVX-512 architecture, setcc returns a mask with i1 elements,
23374    // but there is no compare instruction for i8 and i16 elements in KNL.
23375 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
23376 "Unexpected operand type");
23377 return LowerIntVSETCC_AVX512(Op, DAG);
23378 }
23379
23380 // Lower using XOP integer comparisons.
23381 if (VT.is128BitVector() && Subtarget.hasXOP()) {
23382 // Translate compare code to XOP PCOM compare mode.
23383 unsigned CmpMode = 0;
23384 switch (Cond) {
23385 // clang-format off
23386 default: llvm_unreachable("Unexpected SETCC condition");
23387 case ISD::SETULT:
23388 case ISD::SETLT: CmpMode = 0x00; break;
23389 case ISD::SETULE:
23390 case ISD::SETLE: CmpMode = 0x01; break;
23391 case ISD::SETUGT:
23392 case ISD::SETGT: CmpMode = 0x02; break;
23393 case ISD::SETUGE:
23394 case ISD::SETGE: CmpMode = 0x03; break;
23395 case ISD::SETEQ: CmpMode = 0x04; break;
23396 case ISD::SETNE: CmpMode = 0x05; break;
23397 // clang-format on
23398 }
23399
23400 // Are we comparing unsigned or signed integers?
23401  unsigned Opc =
23402      ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
23403
23404 return DAG.getNode(Opc, dl, VT, Op0, Op1,
23405 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
23406 }
23407
23408 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
23409 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
23410  if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
23411    SDValue BC0 = peekThroughBitcasts(Op0);
23412 if (BC0.getOpcode() == ISD::AND) {
23413 APInt UndefElts;
23414 SmallVector<APInt, 64> EltBits;
23415      if (getTargetConstantBitsFromNode(
23416              BC0.getOperand(1), VT.getScalarSizeInBits(), UndefElts, EltBits,
23417 /*AllowWholeUndefs*/ false, /*AllowPartialUndefs*/ false)) {
23418 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
23419 Cond = ISD::SETEQ;
23420 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
23421 }
23422 }
23423 }
23424 }
23425
23426 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
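  // For example, with v4i32 elements and C == 8 this becomes
  // sra (shl X, 28), 31: all-ones iff bit 3 of X is set.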
23427 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
23428 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
23429    ConstantSDNode *C1 = isConstOrConstSplat(Op1);
23430    if (C1 && C1->getAPIntValue().isPowerOf2()) {
23431 unsigned BitWidth = VT.getScalarSizeInBits();
23432 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
23433
23434 SDValue Result = Op0.getOperand(0);
23435 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
23436 DAG.getConstant(ShiftAmt, dl, VT));
23437 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
23438 DAG.getConstant(BitWidth - 1, dl, VT));
23439 return Result;
23440 }
23441 }
23442
23443 // Break 256-bit integer vector compare into smaller ones.
23444 if (VT.is256BitVector() && !Subtarget.hasInt256())
23445 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23446
23447 // Break 512-bit integer vector compare into smaller ones.
23448 // TODO: Try harder to use VPCMPx + VPMOV2x?
23449 if (VT.is512BitVector())
23450 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
23451
23452 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
23453 // not-of-PCMPEQ:
23454 // X != INT_MIN --> X >s INT_MIN
23455 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
23456 // +X != 0 --> +X >s 0
23457 APInt ConstValue;
23458 if (Cond == ISD::SETNE &&
23459 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
23460 if (ConstValue.isMinSignedValue())
23461 Cond = ISD::SETGT;
23462 else if (ConstValue.isMaxSignedValue())
23463 Cond = ISD::SETLT;
23464 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
23465 Cond = ISD::SETGT;
23466 }
23467
23468 // If both operands are known non-negative, then an unsigned compare is the
23469 // same as a signed compare and there's no need to flip signbits.
23470 // TODO: We could check for more general simplifications here since we're
23471 // computing known bits.
23472 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
23473 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
23474
23475 // Special case: Use min/max operations for unsigned compares.
23476 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23477  if (ISD::isUnsignedIntSetCC(Cond) &&
23478      (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
23479 TLI.isOperationLegal(ISD::UMIN, VT)) {
23480 // If we have a constant operand, increment/decrement it and change the
23481 // condition to avoid an invert.
23482 if (Cond == ISD::SETUGT) {
23483 // X > C --> X >= (C+1) --> X == umax(X, C+1)
23484 if (SDValue UGTOp1 =
23485 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
23486 Op1 = UGTOp1;
23487 Cond = ISD::SETUGE;
23488 }
23489 }
23490 if (Cond == ISD::SETULT) {
23491 // X < C --> X <= (C-1) --> X == umin(X, C-1)
23492 if (SDValue ULTOp1 =
23493 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
23494 Op1 = ULTOp1;
23495 Cond = ISD::SETULE;
23496 }
23497 }
23498 bool Invert = false;
23499 unsigned Opc;
23500 switch (Cond) {
23501 // clang-format off
23502 default: llvm_unreachable("Unexpected condition code");
23503 case ISD::SETUGT: Invert = true; [[fallthrough]];
23504 case ISD::SETULE: Opc = ISD::UMIN; break;
23505 case ISD::SETULT: Invert = true; [[fallthrough]];
23506 case ISD::SETUGE: Opc = ISD::UMAX; break;
23507 // clang-format on
23508 }
23509
23510 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23511 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
23512
23513 // If the logical-not of the result is required, perform that now.
23514 if (Invert)
23515 Result = DAG.getNOT(dl, Result, VT);
23516
23517 return Result;
23518 }
23519
23520 // Try to use SUBUS and PCMPEQ.
23521 if (FlipSigns)
23522 if (SDValue V =
23523 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
23524 return V;
23525
23526 // We are handling one of the integer comparisons here. Since SSE only has
23527 // GT and EQ comparisons for integer, swapping operands and multiple
23528 // operations may be required for some comparisons.
23529  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
23530                                                            : X86ISD::PCMPGT;
23531  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
23532              Cond == ISD::SETGE || Cond == ISD::SETUGE;
23533  bool Invert = Cond == ISD::SETNE ||
23534                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
23535
23536 if (Swap)
23537 std::swap(Op0, Op1);
23538
23539 // Check that the operation in question is available (most are plain SSE2,
23540 // but PCMPGTQ and PCMPEQQ have different requirements).
23541 if (VT == MVT::v2i64) {
23542 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
23543 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
23544
23545 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
23546 // the odd elements over the even elements.
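      // E.g. for 'setgt 0, X' the v4i32 pcmpgt computes a per-32-bit-lane sign
      // test, and the {1, 1, 3, 3} shuffle copies each i64's high-half result
      // over its low half so the whole i64 lane reflects its sign bit.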
23547 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
23548 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
23549 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23550
23551 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23552 static const int MaskHi[] = { 1, 1, 3, 3 };
23553 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23554
23555 return DAG.getBitcast(VT, Result);
23556 }
23557
23558 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
23559 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23560 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
23561
23562 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23563 static const int MaskHi[] = { 1, 1, 3, 3 };
23564 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23565
23566 return DAG.getBitcast(VT, Result);
23567 }
23568
23569 // If the i64 elements are sign-extended enough to be representable as i32
23570 // then we can compare the lower i32 bits and splat.
23571 if (!FlipSigns && !Invert && DAG.ComputeNumSignBits(Op0) > 32 &&
23572 DAG.ComputeNumSignBits(Op1) > 32) {
23573 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23574 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23575
23576 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23577 static const int MaskLo[] = {0, 0, 2, 2};
23578 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23579
23580 return DAG.getBitcast(VT, Result);
23581 }
23582
23583 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23584 // bits of the inputs before performing those operations. The lower
23585 // compare is always unsigned.
23586 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
23587 : 0x0000000080000000ULL,
23588 dl, MVT::v2i64);
23589
23590 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
23591 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
23592
23593 // Cast everything to the right type.
23594 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23595 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23596
23597 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
23598 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
23599 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
23600
23601 // Create masks for only the low parts/high parts of the 64 bit integers.
23602 static const int MaskHi[] = { 1, 1, 3, 3 };
23603 static const int MaskLo[] = { 0, 0, 2, 2 };
23604 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
23605 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
23606 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
23607
23608 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
23609 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
23610
23611 if (Invert)
23612 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23613
23614 return DAG.getBitcast(VT, Result);
23615 }
23616
23617 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
23618 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
23619 // pcmpeqd + pshufd + pand.
23620 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
23621
23622 // First cast everything to the right type.
23623 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
23624 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
23625
23626 // Do the compare.
23627 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
23628
23629 // Make sure the lower and upper halves are both all-ones.
23630 static const int Mask[] = { 1, 0, 3, 2 };
23631 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
23632 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
23633
23634 if (Invert)
23635 Result = DAG.getNOT(dl, Result, MVT::v4i32);
23636
23637 return DAG.getBitcast(VT, Result);
23638 }
23639 }
23640
23641 // Since SSE has no unsigned integer comparisons, we need to flip the sign
23642 // bits of the inputs before performing those operations.
23643 if (FlipSigns) {
23644 MVT EltVT = VT.getVectorElementType();
23645    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
23646                                 VT);
23647 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
23648 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
23649 }
23650
23651 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
23652
23653 // If the logical-not of the result is required, perform that now.
23654 if (Invert)
23655 Result = DAG.getNOT(dl, Result, VT);
23656
23657 return Result;
23658}
23659
23660// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
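// KORTEST sets ZF when the OR of the two masks is all zeros and CF when it is
// all ones, so an equality compare of a vXi1 value against 0 or -1 maps onto
// the E/NE and B/AE condition codes used below.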
23661static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
23662                              const SDLoc &dl, SelectionDAG &DAG,
23663 const X86Subtarget &Subtarget,
23664 SDValue &X86CC) {
23665 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23666
23667 // Must be a bitcast from vXi1.
23668 if (Op0.getOpcode() != ISD::BITCAST)
23669 return SDValue();
23670
23671 Op0 = Op0.getOperand(0);
23672 MVT VT = Op0.getSimpleValueType();
23673 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
23674 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
23675 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
23676 return SDValue();
23677
23678 X86::CondCode X86Cond;
23679 if (isNullConstant(Op1)) {
23680 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
23681 } else if (isAllOnesConstant(Op1)) {
23682 // C flag is set for all ones.
23683 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
23684 } else
23685 return SDValue();
23686
23687  // If the input is an AND, we can combine its operands into the KTEST.
23688 bool KTestable = false;
23689 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
23690 KTestable = true;
23691 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
23692 KTestable = true;
23693 if (!isNullConstant(Op1))
23694 KTestable = false;
23695 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
23696 SDValue LHS = Op0.getOperand(0);
23697 SDValue RHS = Op0.getOperand(1);
23698 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23699 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
23700 }
23701
23702  // If the input is an OR, we can combine its operands into the KORTEST.
23703 SDValue LHS = Op0;
23704 SDValue RHS = Op0;
23705 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
23706 LHS = Op0.getOperand(0);
23707 RHS = Op0.getOperand(1);
23708 }
23709
23710 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
23711 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
23712}
23713
23714/// Emit flags for the given setcc condition and operands. Also returns the
23715/// corresponding X86 condition code constant in X86CC.
23716SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
23717 ISD::CondCode CC, const SDLoc &dl,
23718 SelectionDAG &DAG,
23719 SDValue &X86CC) const {
23720 // Equality Combines.
23721 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
23722 X86::CondCode X86CondCode;
23723
23724 // Optimize to BT if possible.
23725 // Lower (X & (1 << N)) == 0 to BT(X, N).
23726 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
23727 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
23728 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
23729 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
23730 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23731 return BT;
23732 }
23733 }
23734
23735 // Try to use PTEST/PMOVMSKB for a tree AND/ORs equality compared with -1/0.
23736 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
23737 X86CondCode)) {
23738 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23739 return CmpZ;
23740 }
23741
23742 // Try to lower using KORTEST or KTEST.
23743 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
23744 return Test;
23745
23746 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
23747 // of these.
23748 if (isOneConstant(Op1) || isNullConstant(Op1)) {
23749 // If the input is a setcc, then reuse the input setcc or use a new one
23750 // with the inverted condition.
23751 if (Op0.getOpcode() == X86ISD::SETCC) {
23752 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
23753
23754 X86CC = Op0.getOperand(0);
23755 if (Invert) {
23756 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
23757 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
23758 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23759 }
23760
23761 return Op0.getOperand(1);
23762 }
23763 }
23764
23765    // Try to use the carry flag from the add in place of a separate CMP for:
23766 // (seteq (add X, -1), -1). Similar for setne.
23767 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
23768 Op0.getOperand(1) == Op1) {
23769 if (isProfitableToUseFlagOp(Op0)) {
23770 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
23771
23772 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
23773 Op0.getOperand(1));
23774 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
23775 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
23776 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
23777 return SDValue(New.getNode(), 1);
23778 }
23779 }
23780 }
23781
23782  X86::CondCode CondCode =
23783      TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
23784 assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
23785
23786 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
23787 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23788 return EFLAGS;
23789}
23790
23791SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
23792
23793 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
23794 Op.getOpcode() == ISD::STRICT_FSETCCS;
23795 MVT VT = Op->getSimpleValueType(0);
23796
23797 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
23798
23799 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
23800 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23801 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
23802 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
23803 SDLoc dl(Op);
23804  ISD::CondCode CC =
23805      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
23806
23807 if (isSoftF16(Op0.getValueType(), Subtarget))
23808 return SDValue();
23809
23810 // Handle f128 first, since one possible outcome is a normal integer
23811 // comparison which gets handled by emitFlagsForSetcc.
23812 if (Op0.getValueType() == MVT::f128) {
23813 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
23814 Op.getOpcode() == ISD::STRICT_FSETCCS);
23815
23816 // If softenSetCCOperands returned a scalar, use it.
23817 if (!Op1.getNode()) {
23818 assert(Op0.getValueType() == Op.getValueType() &&
23819 "Unexpected setcc expansion!");
23820 if (IsStrict)
23821 return DAG.getMergeValues({Op0, Chain}, dl);
23822 return Op0;
23823 }
23824 }
23825
23826 if (Op0.getSimpleValueType().isInteger()) {
23827    // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
23828    // reduces the number of EFLAGS bits read (the GE conditions don't read ZF);
23829    // this may translate to fewer uops depending on the uarch implementation. The
23830    // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
23831    // canonicalize to that CondCode.
23832    // NOTE: Only do this if incrementing the constant doesn't increase the bit
23833    // encoding size - so it must either already be an i8 or i32 immediate, or it
23834    // shrinks down to that. We don't do this for any i64's to avoid additional
23835    // constant materializations.
23836 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
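    // For example, 'setgt X, 9' becomes 'setge X, 10', and the GE/UGE
    // conditions do not read ZF.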
23837 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
23838 const APInt &Op1Val = Op1C->getAPIntValue();
23839 if (!Op1Val.isZero()) {
23840 // Ensure the constant+1 doesn't overflow.
23841 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
23842 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
23843 APInt Op1ValPlusOne = Op1Val + 1;
23844 if (Op1ValPlusOne.isSignedIntN(32) &&
23845 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
23846 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
23847            CC = (CC == ISD::CondCode::SETGT) ? ISD::CondCode::SETGE
23848                                              : ISD::CondCode::SETUGE;
23849          }
23850 }
23851 }
23852 }
23853
23854 SDValue X86CC;
23855 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
23856 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23857 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23858 }
23859
23860 // Handle floating point.
23861 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
23862 if (CondCode == X86::COND_INVALID)
23863 return SDValue();
23864
23865 SDValue EFLAGS;
23866 if (IsStrict) {
23867 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
23868 EFLAGS =
23869        DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
23870                    dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
23871 Chain = EFLAGS.getValue(1);
23872 } else {
23873 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
23874 }
23875
23876 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
23877 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
23878 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
23879}
23880
23881SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
23882 SDValue LHS = Op.getOperand(0);
23883 SDValue RHS = Op.getOperand(1);
23884 SDValue Carry = Op.getOperand(2);
23885 SDValue Cond = Op.getOperand(3);
23886 SDLoc DL(Op);
23887
23888 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
23889 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
23890
23891 // Recreate the carry if needed.
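  // Adding all-ones to a nonzero carry value wraps to zero and sets CF, while
  // adding it to zero leaves CF clear; the SBB below then consumes that flag.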
23892 EVT CarryVT = Carry.getValueType();
23893 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23894 Carry, DAG.getAllOnesConstant(DL, CarryVT));
23895
23896 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
23897 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
23898 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
23899}
23900
23901// This function returns two values: the arithmetic computation itself
23902// (Value) and an EFLAGS result (Overflow). It also sets the condition code
23903// (Cond); together the flags and the condition code identify the case in
23904// which the arithmetic computation overflows.
23905static std::pair<SDValue, SDValue>
23906getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
23907  assert(Op.getResNo() == 0 && "Unexpected result number!");
23908 SDValue Value, Overflow;
23909 SDValue LHS = Op.getOperand(0);
23910 SDValue RHS = Op.getOperand(1);
23911 unsigned BaseOp = 0;
23912 SDLoc DL(Op);
23913 switch (Op.getOpcode()) {
23914 default: llvm_unreachable("Unknown ovf instruction!");
23915 case ISD::SADDO:
23916 BaseOp = X86ISD::ADD;
23917 Cond = X86::COND_O;
23918 break;
23919 case ISD::UADDO:
23920 BaseOp = X86ISD::ADD;
23921    Cond = X86::COND_B;
23922    break;
23923 case ISD::SSUBO:
23924 BaseOp = X86ISD::SUB;
23925 Cond = X86::COND_O;
23926 break;
23927 case ISD::USUBO:
23928 BaseOp = X86ISD::SUB;
23929 Cond = X86::COND_B;
23930 break;
23931 case ISD::SMULO:
23932 BaseOp = X86ISD::SMUL;
23933 Cond = X86::COND_O;
23934 break;
23935 case ISD::UMULO:
23936 BaseOp = X86ISD::UMUL;
23937 Cond = X86::COND_O;
23938 break;
23939 }
23940
23941 if (BaseOp) {
23942 // Also sets EFLAGS.
23943 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
23944 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23945 Overflow = Value.getValue(1);
23946 }
23947
23948 return std::make_pair(Value, Overflow);
23949}
23950
23951static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23952  // Lower the "add/sub/mul with overflow" instruction into a regular
23953  // instruction plus a "setcc" instruction that checks the overflow flag. The
23954  // "brcond" lowering looks for this combo and may remove the "setcc"
23955  // instruction if the "setcc" has only one use.
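  // For example, 'uaddo i32 %a, %b' becomes an X86ISD::ADD producing both the
  // sum and EFLAGS, with the overflow result read back via COND_B (SETB).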
23956 SDLoc DL(Op);
23957  X86::CondCode Cond;
23958  SDValue Value, Overflow;
23959 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
23960
23961 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
23962 assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
23963 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
23964}
23965
23966/// Return true if opcode is a X86 logical comparison.
23967static bool isX86LogicalCmp(SDValue Op) {
23968  unsigned Opc = Op.getOpcode();
23969 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
23970 Opc == X86ISD::FCMP)
23971 return true;
23972 if (Op.getResNo() == 1 &&
23973 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
23974 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
23975 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
23976 return true;
23977
23978 return false;
23979}
23980
23981static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
23982  if (V.getOpcode() != ISD::TRUNCATE)
23983 return false;
23984
23985 SDValue VOp0 = V.getOperand(0);
23986 unsigned InBits = VOp0.getValueSizeInBits();
23987 unsigned Bits = V.getValueSizeInBits();
23988 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
23989}
23990
23991SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
23992 bool AddTest = true;
23993 SDValue Cond = Op.getOperand(0);
23994 SDValue Op1 = Op.getOperand(1);
23995 SDValue Op2 = Op.getOperand(2);
23996 SDLoc DL(Op);
23997 MVT VT = Op1.getSimpleValueType();
23998 SDValue CC;
23999
24000 if (isSoftF16(VT, Subtarget)) {
24001 MVT NVT = VT.changeTypeToInteger();
24002 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
24003 DAG.getBitcast(NVT, Op1),
24004 DAG.getBitcast(NVT, Op2)));
24005 }
24006
24007 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
24008 // are available or VBLENDV if AVX is available.
24009 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
24010 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
24011 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
24012 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
24013 bool IsAlwaysSignaling;
24014 unsigned SSECC =
24015 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
24016 CondOp0, CondOp1, IsAlwaysSignaling);
24017
24018 if (Subtarget.hasAVX512()) {
24019 SDValue Cmp =
24020 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
24021 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24022 assert(!VT.isVector() && "Not a scalar type?");
24023 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24024 }
24025
24026 if (SSECC < 8 || Subtarget.hasAVX()) {
24027 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
24028 DAG.getTargetConstant(SSECC, DL, MVT::i8));
24029
24030 // If we have AVX, we can use a variable vector select (VBLENDV) instead
24031 // of 3 logic instructions for size savings and potentially speed.
24032 // Unfortunately, there is no scalar form of VBLENDV.
24033
24034 // If either operand is a +0.0 constant, don't try this. We can expect to
24035 // optimize away at least one of the logic instructions later in that
24036 // case, so that sequence would be faster than a variable blend.
24037
24038 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
24039 // uses XMM0 as the selection register. That may need just as many
24040 // instructions as the AND/ANDN/OR sequence due to register moves, so
24041 // don't bother.
24042 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
24043 !isNullFPConstant(Op2)) {
24044 // Convert to vectors, do a VSELECT, and convert back to scalar.
24045 // All of the conversions should be optimized away.
24046 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
24047 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
24048 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
24049 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
24050
24051 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
24052 VCmp = DAG.getBitcast(VCmpVT, VCmp);
24053
24054 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
24055
24056 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
24057 VSel, DAG.getIntPtrConstant(0, DL));
24058 }
24059 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
24060 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
24061 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
24062 }
24063 }
24064
24065 // AVX512 fallback is to lower selects of scalar floats to masked moves.
24066 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
24067 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
24068 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
24069 }
24070
24071 if (Cond.getOpcode() == ISD::SETCC &&
24072 !isSoftF16(Cond.getOperand(0).getSimpleValueType(), Subtarget)) {
24073 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
24074 Cond = NewCond;
24075 // If the condition was updated, it's possible that the operands of the
24076 // select were also updated (for example, EmitTest has a RAUW). Refresh
24077 // the local references to the select operands in case they got stale.
24078 Op1 = Op.getOperand(1);
24079 Op2 = Op.getOperand(2);
24080 }
24081 }
24082
24083 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
24084 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
24085 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
24086 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
24087 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
24088 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
24089 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24090 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24091 if (Cond.getOpcode() == X86ISD::SETCC &&
24092 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
24093 isNullConstant(Cond.getOperand(1).getOperand(1))) {
24094 SDValue Cmp = Cond.getOperand(1);
24095 SDValue CmpOp0 = Cmp.getOperand(0);
24096 unsigned CondCode = Cond.getConstantOperandVal(0);
24097
24098 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
24099 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
24100      // handling to keep the CMP with 0. This should be removed by
24101 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
24102 // cttz_zero_undef.
24103 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
24104 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
24105 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
24106 };
24107 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
24108 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
24109 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
24110 // Keep Cmp.
24111 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24112 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
24113 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
24114 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
24115
24116 // 'X - 1' sets the carry flag if X == 0.
24117 // '0 - X' sets the carry flag if X != 0.
24118 // Convert the carry flag to a -1/0 mask with sbb:
24119 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
24120 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
24121 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
24122 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
24123 SDValue Sub;
24124 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
24125 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
24126 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
24127 } else {
24128 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
24129 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
24130 }
24131      SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
24132                                DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
24133 Sub.getValue(1));
24134 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
24135 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
24136 CmpOp0.getOpcode() == ISD::AND &&
24137 isOneConstant(CmpOp0.getOperand(1))) {
24138 SDValue Src1, Src2;
24139      // True if Op2 is an XOR or OR operator and one of its operands
24140      // is equal to Op1:
24141      // (a, a op b) || (b, a op b)
24142 auto isOrXorPattern = [&]() {
24143 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
24144 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
24145 Src1 =
24146 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
24147 Src2 = Op1;
24148 return true;
24149 }
24150 return false;
24151 };
24152
24153 if (isOrXorPattern()) {
24154 SDValue Neg;
24155 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
24156        // We need a mask of all zeros or all ones with the same size as
24157        // the other operands.
24158 if (CmpSz > VT.getSizeInBits())
24159 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
24160 else if (CmpSz < VT.getSizeInBits())
24161 Neg = DAG.getNode(ISD::AND, DL, VT,
24162 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
24163 DAG.getConstant(1, DL, VT));
24164 else
24165 Neg = CmpOp0;
24166 SDValue Mask = DAG.getNegative(Neg, DL, VT); // -(and (x, 0x1))
24167 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
24168 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
24169 }
24170 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
24171 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
24172 ((CondCode == X86::COND_S) || // smin(x, 0)
24173 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
24174 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
24175 //
24176 // If the comparison is testing for a positive value, we have to invert
24177 // the sign bit mask, so only do that transform if the target has a
24178 // bitwise 'and not' instruction (the invert is free).
24179 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
24180 unsigned ShCt = VT.getSizeInBits() - 1;
24181 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
24182 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
24183 if (CondCode == X86::COND_G)
24184 Shift = DAG.getNOT(DL, Shift, VT);
24185 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
24186 }
24187 }
24188
24189 // Look past (and (setcc_carry (cmp ...)), 1).
24190 if (Cond.getOpcode() == ISD::AND &&
24191 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
24192 isOneConstant(Cond.getOperand(1)))
24193 Cond = Cond.getOperand(0);
24194
24195 // If condition flag is set by a X86ISD::CMP, then use it as the condition
24196 // setting operand in place of the X86ISD::SETCC.
24197 unsigned CondOpcode = Cond.getOpcode();
24198 if (CondOpcode == X86ISD::SETCC ||
24199 CondOpcode == X86ISD::SETCC_CARRY) {
24200 CC = Cond.getOperand(0);
24201
24202 SDValue Cmp = Cond.getOperand(1);
24203 bool IllegalFPCMov = false;
24204 if (VT.isFloatingPoint() && !VT.isVector() &&
24205 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
24206 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
24207
24208 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
24209 Cmp.getOpcode() == X86ISD::BT) { // FIXME
24210 Cond = Cmp;
24211 AddTest = false;
24212 }
24213 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
24214 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
24215 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
24216 SDValue Value;
24217 X86::CondCode X86Cond;
24218 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24219
24220 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
24221 AddTest = false;
24222 }
24223
24224 if (AddTest) {
24225 // Look past the truncate if the high bits are known zero.
24226    if (isTruncWithZeroHighBitsInput(Cond, DAG))
24227      Cond = Cond.getOperand(0);
24228
24229 // We know the result of AND is compared against zero. Try to match
24230 // it to BT.
24231 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
24232 X86::CondCode X86CondCode;
24233 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
24234 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
24235 Cond = BT;
24236 AddTest = false;
24237 }
24238 }
24239 }
24240
24241 if (AddTest) {
24242 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
24243 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
24244 }
24245
24246 // a < b ? -1 : 0 -> RES = ~setcc_carry
24247 // a < b ? 0 : -1 -> RES = setcc_carry
24248 // a >= b ? -1 : 0 -> RES = setcc_carry
24249 // a >= b ? 0 : -1 -> RES = ~setcc_carry
24250 if (Cond.getOpcode() == X86ISD::SUB) {
24251 unsigned CondCode = CC->getAsZExtVal();
24252
24253 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
24254 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
24255 (isNullConstant(Op1) || isNullConstant(Op2))) {
24256 SDValue Res =
24257 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
24258 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
24259 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
24260 return DAG.getNOT(DL, Res, Res.getValueType());
24261 return Res;
24262 }
24263 }
24264
24265 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
24266 // widen the cmov and push the truncate through. This avoids introducing a new
24267 // branch during isel and doesn't add any extensions.
24268 if (Op.getValueType() == MVT::i8 &&
24269 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
24270 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
24271 if (T1.getValueType() == T2.getValueType() &&
24272 // Exclude CopyFromReg to avoid partial register stalls.
24273 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
24274 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
24275 CC, Cond);
24276 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24277 }
24278 }
24279
24280 // Or finally, promote i8 cmovs if we have CMOV,
24281 // or i16 cmovs if it won't prevent folding a load.
24282  // FIXME: we should not limit promotion of the i8 case to only when the CMOV
24283  // is legal, but EmitLoweredSelect() cannot deal with these extensions
24284  // being inserted between two CMOVs (the same applies to the i16 case).
24285  // https://bugs.llvm.org/show_bug.cgi?id=40974
24286 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
24287 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
24288 !X86::mayFoldLoad(Op2, Subtarget))) {
24289 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
24290 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
24291 SDValue Ops[] = { Op2, Op1, CC, Cond };
24292 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
24293 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
24294 }
24295
24296 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
24297 // condition is true.
24298 SDValue Ops[] = { Op2, Op1, CC, Cond };
24299 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops, Op->getFlags());
24300}
24301
24302static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
24303                                     const X86Subtarget &Subtarget,
24304 SelectionDAG &DAG) {
24305 MVT VT = Op->getSimpleValueType(0);
24306 SDValue In = Op->getOperand(0);
24307 MVT InVT = In.getSimpleValueType();
24308 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
24309 MVT VTElt = VT.getVectorElementType();
24310 SDLoc dl(Op);
24311
24312 unsigned NumElts = VT.getVectorNumElements();
24313
24314 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
24315 MVT ExtVT = VT;
24316 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
24317 // If v16i32 is to be avoided, we'll need to split and concatenate.
24318 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
24319 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
24320
24321 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
24322 }
24323
24324 // Widen to 512-bits if VLX is not supported.
24325 MVT WideVT = ExtVT;
24326 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
24327 NumElts *= 512 / ExtVT.getSizeInBits();
24328 InVT = MVT::getVectorVT(MVT::i1, NumElts);
24329 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
24330 In, DAG.getIntPtrConstant(0, dl));
24331 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
24332 }
24333
24334 SDValue V;
24335 MVT WideEltVT = WideVT.getVectorElementType();
24336 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
24337 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
24338 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
24339 } else {
24340 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
24341 SDValue Zero = DAG.getConstant(0, dl, WideVT);
24342 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
24343 }
24344
24345 // Truncate if we had to extend i16/i8 above.
24346 if (VT != ExtVT) {
24347 WideVT = MVT::getVectorVT(VTElt, NumElts);
24348 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
24349 }
24350
24351 // Extract back to 128/256-bit if we widened.
24352 if (WideVT != VT)
24353 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
24354 DAG.getIntPtrConstant(0, dl));
24355
24356 return V;
24357}
24358
24359static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24360                               SelectionDAG &DAG) {
24361 SDValue In = Op->getOperand(0);
24362 MVT InVT = In.getSimpleValueType();
24363
24364 if (InVT.getVectorElementType() == MVT::i1)
24365 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24366
24367 assert(Subtarget.hasAVX() && "Expected AVX support");
24368 return LowerAVXExtend(Op, DAG, Subtarget);
24369}
24370
24371// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
24372// For sign extend this needs to handle all vector sizes and SSE4.1 and
24373// non-SSE4.1 targets. For zero extend this should only handle inputs of
24374// MVT::v64i8 when BWI is not supported, but AVX512 is.
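// For example, a pre-SSE4.1 v16i8 -> v4i32 sign_extend_vector_inreg shuffles
// each of the low four bytes into the most significant byte of its 32-bit
// lane and then arithmetic-shifts each lane right by 24.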
24375static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
24376                                        const X86Subtarget &Subtarget,
24377 SelectionDAG &DAG) {
24378 SDValue In = Op->getOperand(0);
24379 MVT VT = Op->getSimpleValueType(0);
24380 MVT InVT = In.getSimpleValueType();
24381
24382 MVT SVT = VT.getVectorElementType();
24383 MVT InSVT = InVT.getVectorElementType();
24385
24386 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
24387 return SDValue();
24388 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
24389 return SDValue();
24390 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
24391 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
24392 !(VT.is512BitVector() && Subtarget.hasAVX512()))
24393 return SDValue();
24394
24395 SDLoc dl(Op);
24396 unsigned Opc = Op.getOpcode();
24397 unsigned NumElts = VT.getVectorNumElements();
24398
24399 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
24400 // For 512-bit vectors, we need 128-bits or 256-bits.
24401 if (InVT.getSizeInBits() > 128) {
24402 // Input needs to be at least the same number of elements as output, and
24403 // at least 128-bits.
24404 int InSize = InSVT.getSizeInBits() * NumElts;
24405 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
24406 InVT = In.getSimpleValueType();
24407 }
24408
24409  // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
24410  // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
24411  // instructions still need to be handled here for 256/512-bit results.
24412 if (Subtarget.hasInt256()) {
24413 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
24414
24415 if (InVT.getVectorNumElements() != NumElts)
24416 return DAG.getNode(Op.getOpcode(), dl, VT, In);
24417
24418 // FIXME: Apparently we create inreg operations that could be regular
24419 // extends.
24420 unsigned ExtOpc =
24421        Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
24422                                             : ISD::ZERO_EXTEND;
24423    return DAG.getNode(ExtOpc, dl, VT, In);
24424 }
24425
24426 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
24427 if (Subtarget.hasAVX()) {
24428 assert(VT.is256BitVector() && "256-bit vector expected");
24429 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24430 int HalfNumElts = HalfVT.getVectorNumElements();
24431
24432 unsigned NumSrcElts = InVT.getVectorNumElements();
24433 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
24434 for (int i = 0; i != HalfNumElts; ++i)
24435 HiMask[i] = HalfNumElts + i;
24436
24437 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
24438 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
24439 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
24440 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
24441 }
24442
24443 // We should only get here for sign extend.
24444 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
24445 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
24446 unsigned InNumElts = InVT.getVectorNumElements();
24447
24448 // If the source elements are already all-signbits, we don't need to extend,
24449 // just splat the elements.
24450 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
24451 if (DAG.ComputeNumSignBits(In, DemandedElts) == InVT.getScalarSizeInBits()) {
24452 unsigned Scale = InNumElts / NumElts;
24453 SmallVector<int, 16> ShuffleMask;
24454 for (unsigned I = 0; I != NumElts; ++I)
24455 ShuffleMask.append(Scale, I);
24456 return DAG.getBitcast(VT,
24457 DAG.getVectorShuffle(InVT, dl, In, In, ShuffleMask));
24458 }
24459
24460 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
24461 SDValue Curr = In;
24462 SDValue SignExt = Curr;
24463
24464 // As SRAI is only available on i16/i32 types, we expand only up to i32
24465 // and handle i64 separately.
24466 if (InVT != MVT::v4i32) {
24467 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
24468
24469 unsigned DestWidth = DestVT.getScalarSizeInBits();
24470 unsigned Scale = DestWidth / InSVT.getSizeInBits();
24471 unsigned DestElts = DestVT.getVectorNumElements();
24472
24473 // Build a shuffle mask that takes each input element and places it in the
24474 // MSBs of the new element size.
24475 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
24476 for (unsigned i = 0; i != DestElts; ++i)
24477 Mask[i * Scale + (Scale - 1)] = i;
24478
24479 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
24480 Curr = DAG.getBitcast(DestVT, Curr);
24481
24482 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
24483 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
24484 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
24485 }
24486
24487 if (VT == MVT::v2i64) {
24488 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
24489 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
24490 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
24491 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
24492 SignExt = DAG.getBitcast(VT, SignExt);
24493 }
24494
24495 return SignExt;
24496}
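// Illustrative sketch (added note, not from the original source): on a
// pre-SSE4.1 target the path above is the classic unpack-then-shift idiom.
// Extending the low four i16 elements of an XMM register to i32 would come out
// roughly as:
//   punpcklwd %xmm0, %xmm0   ; place each i16 in the MSBs of an i32 lane
//   psrad     $16, %xmm0     ; arithmetic shift recreates the sign bits
// There is no 64-bit arithmetic shift before AVX-512, which is why the v2i64
// case instead builds a vector of sign words with a compare against zero
// (SETGT) and interleaves it with the 32-bit results.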
24497
24498static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
24499 SelectionDAG &DAG) {
24500 MVT VT = Op->getSimpleValueType(0);
24501 SDValue In = Op->getOperand(0);
24502 MVT InVT = In.getSimpleValueType();
24503 SDLoc dl(Op);
24504
24505 if (InVT.getVectorElementType() == MVT::i1)
24506 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
24507
24508 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
24510 "Expected same number of elements");
24511 assert((VT.getVectorElementType() == MVT::i16 ||
24512 VT.getVectorElementType() == MVT::i32 ||
24513 VT.getVectorElementType() == MVT::i64) &&
24514 "Unexpected element type");
24515 assert((InVT.getVectorElementType() == MVT::i8 ||
24516 InVT.getVectorElementType() == MVT::i16 ||
24517 InVT.getVectorElementType() == MVT::i32) &&
24518 "Unexpected element type");
24519
24520 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
24521 assert(InVT == MVT::v32i8 && "Unexpected VT!");
24522 return splitVectorIntUnary(Op, DAG, dl);
24523 }
24524
24525 if (Subtarget.hasInt256())
24526 return Op;
24527
24528 // Optimize vectors in AVX mode:
24529 // sign extend v8i16 to v8i32 and
24530 // v4i32 to v4i64.
24531 //
24532 // Divide the input vector into two parts;
24533 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
24534 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
24535 // then concat the vectors back to the original VT.
24536 MVT HalfVT = VT.getHalfNumVectorElementsVT();
24537 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
24538
24539 unsigned NumElems = InVT.getVectorNumElements();
24540 SmallVector<int,8> ShufMask(NumElems, -1);
24541 for (unsigned i = 0; i != NumElems/2; ++i)
24542 ShufMask[i] = i + NumElems/2;
24543
24544 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
24545 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
24546
24547 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
24548}
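// Illustrative sketch (added note, not from the original source): on an
// AVX1-only target the split above turns a v8i16 -> v8i32 sign extension into
// roughly:
//   vpmovsxwd   %xmm0, %xmm1            ; extend elements 0..3
//   vpshufd     $0xEE, %xmm0, %xmm2     ; move elements 4..7 into the low half
//   vpmovsxwd   %xmm2, %xmm2            ; extend elements 4..7
//   vinsertf128 $1, %xmm2, %ymm1, %ymm0 ; concatenate the two 128-bit halves
// The exact shuffle chosen for the high half may differ, but the structure
// (two 128-bit pmovsx ops plus a concat) is what the code above produces.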
24549
24550/// Change a vector store into a pair of half-size vector stores.
24551static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
24552 SDValue StoredVal = Store->getValue();
24553 assert((StoredVal.getValueType().is256BitVector() ||
24554 StoredVal.getValueType().is512BitVector()) &&
24555 "Expecting 256/512-bit op");
24556
24557 // Splitting volatile memory ops is not allowed unless the operation was not
24558 // legal to begin with. Assume the input store is legal (this transform is
24559 // only used for targets with AVX). Note: It is possible that we have an
24560 // illegal type like v2i128, and so we could allow splitting a volatile store
24561 // in that case if that is important.
24562 if (!Store->isSimple())
24563 return SDValue();
24564
24565 SDLoc DL(Store);
24566 SDValue Value0, Value1;
24567 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
24568 unsigned HalfOffset = Value0.getValueType().getStoreSize();
24569 SDValue Ptr0 = Store->getBasePtr();
24570 SDValue Ptr1 =
24571 DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(HalfOffset), DL);
24572 SDValue Ch0 =
24573 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
24574 Store->getOriginalAlign(),
24575 Store->getMemOperand()->getFlags());
24576 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
24577 Store->getPointerInfo().getWithOffset(HalfOffset),
24578 Store->getOriginalAlign(),
24579 Store->getMemOperand()->getFlags());
24580 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
24581}
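// Illustrative sketch (added note, not from the original source): for a simple
// (non-volatile) 256-bit store this produces two independent 128-bit stores at
// byte offsets 0 and 16 from the original base pointer, e.g.
//   vmovups      %xmm0, (%rdi)
//   vextractf128 $1, %ymm0, %xmm1
//   vmovups      %xmm1, 16(%rdi)
// A 512-bit store splits the same way into two 256-bit halves at offsets 0 and 32.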
24582
24583/// Scalarize a vector store, bitcasting to StoreVT to determine the scalar
24584/// type.
24585static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
24586 SelectionDAG &DAG) {
24587 SDValue StoredVal = Store->getValue();
24588 assert(StoreVT.is128BitVector() &&
24589 StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
24590 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
24591
24592 // Splitting volatile memory ops is not allowed unless the operation was not
24593 // legal to begin with. We are assuming the input op is legal (this transform
24594 // is only used for targets with AVX).
24595 if (!Store->isSimple())
24596 return SDValue();
24597
24598 MVT StoreSVT = StoreVT.getScalarType();
24599 unsigned NumElems = StoreVT.getVectorNumElements();
24600 unsigned ScalarSize = StoreSVT.getStoreSize();
24601
24602 SDLoc DL(Store);
24603 SmallVector<SDValue, 4> Stores;
24604 for (unsigned i = 0; i != NumElems; ++i) {
24605 unsigned Offset = i * ScalarSize;
24606 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
24607 TypeSize::getFixed(Offset), DL);
24608 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
24609 DAG.getIntPtrConstant(i, DL));
24610 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
24611 Store->getPointerInfo().getWithOffset(Offset),
24612 Store->getOriginalAlign(),
24613 Store->getMemOperand()->getFlags());
24614 Stores.push_back(Ch);
24615 }
24616 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
24617}
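// Illustrative sketch (added note, not from the original source): scalarizing a
// 128-bit store through StoreVT == MVT::v2i64 yields two EXTRACT_VECTOR_ELTs
// and two 8-byte stores at offsets 0 and 8; through MVT::v4i32 it would be four
// 4-byte stores at offsets 0, 4, 8 and 12. The original alignment and memory
// operand flags are propagated to every scalar store.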
24618
24619static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
24620 SelectionDAG &DAG) {
24621 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
24622 SDLoc dl(St);
24623 SDValue StoredVal = St->getValue();
24624
24625 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
24626 if (StoredVal.getValueType().isVector() &&
24627 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
24628 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
24629 assert(NumElts <= 8 && "Unexpected VT");
24630 assert(!St->isTruncatingStore() && "Expected non-truncating store");
24631 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24632 "Expected AVX512F without AVX512DQI");
24633
24634 // We must pad with zeros to ensure we store zeroes to any unused bits.
24635 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
24636 DAG.getUNDEF(MVT::v16i1), StoredVal,
24637 DAG.getIntPtrConstant(0, dl));
24638 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
24639 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
24640 // Make sure we store zeros in the extra bits.
24641 if (NumElts < 8)
24642 StoredVal = DAG.getZeroExtendInReg(
24643 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
24644
24645 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24646 St->getPointerInfo(), St->getOriginalAlign(),
24647 St->getMemOperand()->getFlags());
24648 }
24649
24650 if (St->isTruncatingStore())
24651 return SDValue();
24652
24653 // If this is a 256-bit store of concatenated ops, we are better off splitting
24654 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
24655 // and each half can execute independently. Some cores would split the op into
24656 // halves anyway, so the concat (vinsertf128) is purely an extra op.
24657 MVT StoreVT = StoredVal.getSimpleValueType();
24658 if (StoreVT.is256BitVector() ||
24659 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
24660 !Subtarget.hasBWI())) {
24661 if (StoredVal.hasOneUse() && isFreeToSplitVector(StoredVal.getNode(), DAG))
24662 return splitVectorStore(St, DAG);
24663 return SDValue();
24664 }
24665
24666 if (StoreVT.is32BitVector())
24667 return SDValue();
24668
24669 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24670 assert(StoreVT.is64BitVector() && "Unexpected VT");
24671 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
24672 TargetLowering::TypeWidenVector &&
24673 "Unexpected type action!");
24674
24675 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
24676 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
24677 DAG.getUNDEF(StoreVT));
24678
24679 if (Subtarget.hasSSE2()) {
24680 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
24681 // and store it.
24682 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
24683 MVT CastVT = MVT::getVectorVT(StVT, 2);
24684 StoredVal = DAG.getBitcast(CastVT, StoredVal);
24685 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
24686 DAG.getIntPtrConstant(0, dl));
24687
24688 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
24689 St->getPointerInfo(), St->getOriginalAlign(),
24690 St->getMemOperand()->getFlags());
24691 }
24692 assert(Subtarget.hasSSE1() && "Expected SSE");
24693 SDVTList Tys = DAG.getVTList(MVT::Other);
24694 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
24695 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
24696 St->getMemOperand());
24697}
24698
24699// Lower vector extended loads using a shuffle. If SSSE3 is not available we
24700// may emit an illegal shuffle but the expansion is still better than scalar
24701 // code. We generate sext/sext_invec for SEXTLOADs when it is available,
24702 // otherwise we'll emit a shuffle and an arithmetic shift.
24703// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
24704// TODO: It is possible to support ZExt by zeroing the undef values during
24705// the shuffle phase or after the shuffle.
24706static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
24707 SelectionDAG &DAG) {
24708 MVT RegVT = Op.getSimpleValueType();
24709 assert(RegVT.isVector() && "We only custom lower vector loads.");
24710 assert(RegVT.isInteger() &&
24711 "We only custom lower integer vector loads.");
24712
24713 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
24714 SDLoc dl(Ld);
24715
24716 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
24717 if (RegVT.getVectorElementType() == MVT::i1) {
24718 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
24719 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
24720 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
24721 "Expected AVX512F without AVX512DQI");
24722
24723 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
24724 Ld->getPointerInfo(), Ld->getOriginalAlign(),
24725 Ld->getMemOperand()->getFlags());
24726
24727 // Replace chain users with the new chain.
24728 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
24729
24730 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
24731 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
24732 DAG.getBitcast(MVT::v16i1, Val),
24733 DAG.getIntPtrConstant(0, dl));
24734 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
24735 }
24736
24737 return SDValue();
24738}
24739
24740/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
24741/// each of which has no other use apart from the AND / OR.
24742static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
24743 Opc = Op.getOpcode();
24744 if (Opc != ISD::OR && Opc != ISD::AND)
24745 return false;
24746 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
24747 Op.getOperand(0).hasOneUse() &&
24748 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
24749 Op.getOperand(1).hasOneUse());
24750}
24751
24752SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
24753 SDValue Chain = Op.getOperand(0);
24754 SDValue Cond = Op.getOperand(1);
24755 SDValue Dest = Op.getOperand(2);
24756 SDLoc dl(Op);
24757
24758 // Bail out when we don't have native compare instructions.
24759 if (Cond.getOpcode() == ISD::SETCC &&
24760 Cond.getOperand(0).getValueType() != MVT::f128 &&
24761 !isSoftF16(Cond.getOperand(0).getValueType(), Subtarget)) {
24762 SDValue LHS = Cond.getOperand(0);
24763 SDValue RHS = Cond.getOperand(1);
24764 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
24765
24766 // Special case for
24767 // setcc([su]{add,sub,mul}o == 0)
24768 // setcc([su]{add,sub,mul}o != 1)
24769 if (ISD::isOverflowIntrOpRes(LHS) &&
24770 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
24771 (isNullConstant(RHS) || isOneConstant(RHS))) {
24772 SDValue Value, Overflow;
24773 X86::CondCode X86Cond;
24774 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
24775
24776 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
24777 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
24778
24779 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24780 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24781 Overflow);
24782 }
24783
24784 if (LHS.getSimpleValueType().isInteger()) {
24785 SDValue CCVal;
24786 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
24787 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24788 EFLAGS);
24789 }
24790
24791 if (CC == ISD::SETOEQ) {
24792 // For FCMP_OEQ, we can emit
24793 // two branches instead of an explicit AND instruction with a
24794 // separate test. However, we only do this if this block doesn't
24795 // have a fall-through edge, because this requires an explicit
24796 // jmp when the condition is false.
24797 if (Op.getNode()->hasOneUse()) {
24798 SDNode *User = *Op.getNode()->use_begin();
24799 // Look for an unconditional branch following this conditional branch.
24800 // We need this because we need to reverse the successors in order
24801 // to implement FCMP_OEQ.
24802 if (User->getOpcode() == ISD::BR) {
24803 SDValue FalseBB = User->getOperand(1);
24804 SDNode *NewBR =
24805 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
24806 assert(NewBR == User);
24807 (void)NewBR;
24808 Dest = FalseBB;
24809
24810 SDValue Cmp =
24811 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24812 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24813 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
24814 CCVal, Cmp);
24815 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24816 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24817 Cmp);
24818 }
24819 }
24820 } else if (CC == ISD::SETUNE) {
24821 // For FCMP_UNE, we can emit
24822 // two branches instead of an explicit OR instruction with a
24823 // separate test.
24824 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24825 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
24826 Chain =
24827 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
24828 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
24829 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24830 Cmp);
24831 } else {
24832 X86::CondCode X86Cond =
24833 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
24834 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
24835 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24836 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24837 Cmp);
24838 }
24839 }
24840
24841 if (ISD::isOverflowIntrOpRes(Cond)) {
24842 SDValue Value, Overflow;
24843 X86::CondCode X86Cond;
24844 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
24845
24846 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
24847 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24848 Overflow);
24849 }
24850
24851 // Look past the truncate if the high bits are known zero.
24852 if (isTruncWithZeroHighBitsInput(Cond, DAG))
24853 Cond = Cond.getOperand(0);
24854
24855 EVT CondVT = Cond.getValueType();
24856
24857 // Add an AND with 1 if we don't already have one.
24858 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
24859 Cond =
24860 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
24861
24862 SDValue LHS = Cond;
24863 SDValue RHS = DAG.getConstant(0, dl, CondVT);
24864
24865 SDValue CCVal;
24866 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
24867 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
24868 EFLAGS);
24869}
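// Illustrative sketch (added note, not from the original source): the FCMP_OEQ
// special case above corresponds to the familiar two-branch sequence. For
// "br (fcmp oeq x, y), %bb.true, %bb.false", with the successors reversed as
// described, the emitted code looks roughly like:
//   ucomiss %xmm1, %xmm0
//   jne     .LBB_false      ; ZF == 0: operands differ
//   jp      .LBB_false      ; PF == 1: unordered (NaN)
//   jmp     .LBB_true
// which avoids materializing the AND of the two flag tests in a register.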
24870
24871// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
24872// Calls to _alloca are needed to probe the stack when allocating more than 4k
24873// bytes in one go. Touching the stack at 4K increments is necessary to ensure
24874// that the guard pages used by the OS virtual memory manager are allocated in
24875// correct sequence.
24876SDValue
24877X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
24878 SelectionDAG &DAG) const {
24879 MachineFunction &MF = DAG.getMachineFunction();
24880 bool SplitStack = MF.shouldSplitStack();
24881 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
24882 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
24883 SplitStack || EmitStackProbeCall;
24884 SDLoc dl(Op);
24885
24886 // Get the inputs.
24887 SDNode *Node = Op.getNode();
24888 SDValue Chain = Op.getOperand(0);
24889 SDValue Size = Op.getOperand(1);
24890 MaybeAlign Alignment(Op.getConstantOperandVal(2));
24891 EVT VT = Node->getValueType(0);
24892
24893 // Chain the dynamic stack allocation so that it doesn't modify the stack
24894 // pointer when other instructions are using the stack.
24895 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
24896
24897 bool Is64Bit = Subtarget.is64Bit();
24898 MVT SPTy = getPointerTy(DAG.getDataLayout());
24899
24900 SDValue Result;
24901 if (!Lower) {
24902 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24903 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
24904 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
24905 " not tell us which reg is the stack pointer!");
24906
24907 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
24908 const Align StackAlign = TFI.getStackAlign();
24909 if (hasInlineStackProbe(MF)) {
24910 MachineRegisterInfo &MRI = MF.getRegInfo();
24911
24912 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24913 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24914 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24915 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
24916 DAG.getRegister(Vreg, SPTy));
24917 } else {
24918 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
24919 Chain = SP.getValue(1);
24920 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
24921 }
24922 if (Alignment && *Alignment > StackAlign)
24923 Result =
24924 DAG.getNode(ISD::AND, dl, VT, Result,
24925 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24926 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
24927 } else if (SplitStack) {
24928 MachineRegisterInfo &MRI = MF.getRegInfo();
24929
24930 if (Is64Bit) {
24931 // The 64-bit implementation of segmented stacks needs to clobber both r10
24932 // and r11. This makes it impossible to use it along with nested parameters.
24933 const Function &F = MF.getFunction();
24934 for (const auto &A : F.args()) {
24935 if (A.hasNestAttr())
24936 report_fatal_error("Cannot use segmented stacks with functions that "
24937 "have nested arguments.");
24938 }
24939 }
24940
24941 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
24942 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
24943 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
24944 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
24945 DAG.getRegister(Vreg, SPTy));
24946 } else {
24947 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
24948 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
24949 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
24950
24951 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
24952 Register SPReg = RegInfo->getStackRegister();
24953 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
24954 Chain = SP.getValue(1);
24955
24956 if (Alignment) {
24957 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
24958 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
24959 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
24960 }
24961
24962 Result = SP;
24963 }
24964
24965 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
24966
24967 SDValue Ops[2] = {Result, Chain};
24968 return DAG.getMergeValues(Ops, dl);
24969}
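// Illustrative sketch (added note, not from the original source): when the
// request is over-aligned, the code above masks the freshly computed stack
// pointer value; e.g. a 32-byte alignment request becomes an AND with
// ~(32 - 1) = ~31ULL after the allocation size has been subtracted.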
24970
24971SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
24972 MachineFunction &MF = DAG.getMachineFunction();
24973 auto PtrVT = getPointerTy(MF.getDataLayout());
24974 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
24975
24976 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
24977 SDLoc DL(Op);
24978
24979 if (!Subtarget.is64Bit() ||
24980 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
24981 // vastart just stores the address of the VarArgsFrameIndex slot into the
24982 // memory location argument.
24983 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
24984 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
24985 MachinePointerInfo(SV));
24986 }
24987
24988 // __va_list_tag:
24989 // gp_offset (0 - 6 * 8)
24990 // fp_offset (48 - 48 + 8 * 16)
24991 // overflow_arg_area (points to parameters passed in memory).
24992 // reg_save_area
24993 SmallVector<SDValue, 8> MemOps;
24994 SDValue FIN = Op.getOperand(1);
24995 // Store gp_offset
24996 SDValue Store = DAG.getStore(
24997 Op.getOperand(0), DL,
24998 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
24999 MachinePointerInfo(SV));
25000 MemOps.push_back(Store);
25001
25002 // Store fp_offset
25003 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::getFixed(4), DL);
25004 Store = DAG.getStore(
25005 Op.getOperand(0), DL,
25006 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
25007 MachinePointerInfo(SV, 4));
25008 MemOps.push_back(Store);
25009
25010 // Store ptr to overflow_arg_area
25011 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
25012 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
25013 Store =
25014 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
25015 MemOps.push_back(Store);
25016
25017 // Store ptr to reg_save_area.
25018 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
25019 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
25020 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
25021 Store = DAG.getStore(
25022 Op.getOperand(0), DL, RSFIN, FIN,
25023 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
25024 MemOps.push_back(Store);
25025 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
25026}
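// Illustrative sketch (added note, not from the original source): the four
// stores above fill in the SysV x86-64 va_list record, which in C terms is
//   struct __va_list_tag {
//     unsigned int gp_offset;      // byte 0
//     unsigned int fp_offset;      // byte 4
//     void *overflow_arg_area;     // byte 8
//     void *reg_save_area;         // byte 16 (byte 12 with 32-bit pointers)
//   };
// matching the +0 / +4 / +8 / +16 pointer offsets used to build MemOps.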
25027
25028SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
25029 assert(Subtarget.is64Bit() &&
25030 "LowerVAARG only handles 64-bit va_arg!");
25031 assert(Op.getNumOperands() == 4);
25032
25033 MachineFunction &MF = DAG.getMachineFunction();
25034 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
25035 // The Win64 ABI uses char* instead of a structure.
25036 return DAG.expandVAArg(Op.getNode());
25037
25038 SDValue Chain = Op.getOperand(0);
25039 SDValue SrcPtr = Op.getOperand(1);
25040 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
25041 unsigned Align = Op.getConstantOperandVal(3);
25042 SDLoc dl(Op);
25043
25044 EVT ArgVT = Op.getNode()->getValueType(0);
25045 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
25046 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
25047 uint8_t ArgMode;
25048
25049 // Decide which area this value should be read from.
25050 // TODO: Implement the AMD64 ABI in its entirety. This simple
25051 // selection mechanism works only for the basic types.
25052 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
25053 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
25054 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
25055 } else {
25056 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
25057 "Unhandled argument type in LowerVAARG");
25058 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
25059 }
25060
25061 if (ArgMode == 2) {
25062 // Make sure using fp_offset makes sense.
25063 assert(!Subtarget.useSoftFloat() &&
25064 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
25065 Subtarget.hasSSE1());
25066 }
25067
25068 // Insert VAARG node into the DAG
25069 // VAARG returns two values: Variable Argument Address, Chain
25070 SDValue InstOps[] = {Chain, SrcPtr,
25071 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
25072 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
25073 DAG.getTargetConstant(Align, dl, MVT::i32)};
25074 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
25075 SDValue VAARG = DAG.getMemIntrinsicNode(
25076 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
25077 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
25078 /*Alignment=*/std::nullopt,
25079 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
25080 Chain = VAARG.getValue(1);
25081
25082 // Load the next argument and return it
25083 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
25084}
25085
25086static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
25087 SelectionDAG &DAG) {
25088 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
25089 // where a va_list is still an i8*.
25090 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
25091 if (Subtarget.isCallingConvWin64(
25092 DAG.getMachineFunction().getFunction().getCallingConv()))
25093 // Probably a Win64 va_copy.
25094 return DAG.expandVACopy(Op.getNode());
25095
25096 SDValue Chain = Op.getOperand(0);
25097 SDValue DstPtr = Op.getOperand(1);
25098 SDValue SrcPtr = Op.getOperand(2);
25099 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
25100 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
25101 SDLoc DL(Op);
25102
25103 return DAG.getMemcpy(
25104 Chain, DL, DstPtr, SrcPtr,
25105 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
25106 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
25107 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
25108}
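// Illustrative sketch (added note, not from the original source): on LP64
// targets the va_copy above is equivalent to memcpy(dst, src, 24), i.e. the
// size of the { i32, i32, i8*, i8* } va_list record, with 8-byte alignment;
// the X32 variant copies 16 bytes with 4-byte alignment instead.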
25109
25110// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
25111static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
25112 switch (Opc) {
25113 case ISD::SHL:
25114 case X86ISD::VSHL:
25115 case X86ISD::VSHLI:
25116 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
25117 case ISD::SRL:
25118 case X86ISD::VSRL:
25119 case X86ISD::VSRLI:
25120 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
25121 case ISD::SRA:
25122 case X86ISD::VSRA:
25123 case X86ISD::VSRAI:
25124 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
25125 }
25126 llvm_unreachable("Unknown target vector shift node");
25127}
25128
25129/// Handle vector element shifts where the shift amount is a constant.
25130/// Takes immediate version of shift as input.
25131static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
25132 SDValue SrcOp, uint64_t ShiftAmt,
25133 SelectionDAG &DAG) {
25134 MVT ElementType = VT.getVectorElementType();
25135
25136 // Bitcast the source vector to the output type, this is mainly necessary for
25137 // vXi8/vXi64 shifts.
25138 if (VT != SrcOp.getSimpleValueType())
25139 SrcOp = DAG.getBitcast(VT, SrcOp);
25140
25141 // Fold this packed shift into its first operand if ShiftAmt is 0.
25142 if (ShiftAmt == 0)
25143 return SrcOp;
25144
25145 // Check for ShiftAmt >= element width
25146 if (ShiftAmt >= ElementType.getSizeInBits()) {
25147 if (Opc == X86ISD::VSRAI)
25148 ShiftAmt = ElementType.getSizeInBits() - 1;
25149 else
25150 return DAG.getConstant(0, dl, VT);
25151 }
25152
25153 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
25154 && "Unknown target vector shift-by-constant node");
25155
25156 // Fold this packed vector shift into a build vector if SrcOp is a
25157 // vector of Constants or UNDEFs.
25158 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
25159 unsigned ShiftOpc;
25160 switch (Opc) {
25161 default: llvm_unreachable("Unknown opcode!");
25162 case X86ISD::VSHLI:
25163 ShiftOpc = ISD::SHL;
25164 break;
25165 case X86ISD::VSRLI:
25166 ShiftOpc = ISD::SRL;
25167 break;
25168 case X86ISD::VSRAI:
25169 ShiftOpc = ISD::SRA;
25170 break;
25171 }
25172
25173 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
25174 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
25175 return C;
25176 }
25177
25178 return DAG.getNode(Opc, dl, VT, SrcOp,
25179 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
25180}
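// Illustrative sketch (added note, not from the original source): the clamping
// above mirrors the hardware semantics. For a v8i16 shift, for example:
//   VSRAI x, 20  ->  VSRAI x, 15    ; arithmetic shifts saturate at width - 1
//   VSRLI x, 20  ->  constant zero  ; logical shifts by >= width produce 0
//   VSHLI x, 0   ->  x              ; a shift by zero folds away entirely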
25181
25182/// Handle vector element shifts by a splat shift amount
25183static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
25184 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
25185 const X86Subtarget &Subtarget,
25186 SelectionDAG &DAG) {
25187 MVT AmtVT = ShAmt.getSimpleValueType();
25188 assert(AmtVT.isVector() && "Vector shift type mismatch");
25189 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
25190 "Illegal vector splat index");
25191
25192 // Move the splat element to the bottom element.
25193 if (ShAmtIdx != 0) {
25194 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
25195 Mask[0] = ShAmtIdx;
25196 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
25197 }
25198
25199 // Peek through any zext node if we can get back to a 128-bit source.
25200 if (AmtVT.getScalarSizeInBits() == 64 &&
25201 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
25202 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
25203 ShAmt.getOperand(0).getValueType().isSimple() &&
25204 ShAmt.getOperand(0).getValueType().is128BitVector()) {
25205 ShAmt = ShAmt.getOperand(0);
25206 AmtVT = ShAmt.getSimpleValueType();
25207 }
25208
25209 // See if we can mask off the upper elements using the existing source node.
25210 // The shift uses the entire lower 64-bits of the amount vector, so no need to
25211 // do this for vXi64 types.
25212 bool IsMasked = false;
25213 if (AmtVT.getScalarSizeInBits() < 64) {
25214 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
25215 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
25216 // If the shift amount has come from a scalar, then zero-extend the scalar
25217 // before moving to the vector.
25218 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
25219 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25220 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
25221 AmtVT = MVT::v4i32;
25222 IsMasked = true;
25223 } else if (ShAmt.getOpcode() == ISD::AND) {
25224 // See if the shift amount is already masked (e.g. for rotation modulo),
25225 // then we can zero-extend it by setting all the other mask elements to
25226 // zero.
25227 SmallVector<SDValue> MaskElts(
25228 AmtVT.getVectorNumElements(),
25229 DAG.getConstant(0, dl, AmtVT.getScalarType()));
25230 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
25231 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
25232 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
25233 {ShAmt.getOperand(1), Mask}))) {
25234 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
25235 IsMasked = true;
25236 }
25237 }
25238 }
25239
25240 // Extract if the shift amount vector is larger than 128-bits.
25241 if (AmtVT.getSizeInBits() > 128) {
25242 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
25243 AmtVT = ShAmt.getSimpleValueType();
25244 }
25245
25246 // Zero-extend bottom element to v2i64 vector type, either by extension or
25247 // shuffle masking.
25248 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
25249 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
25250 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
25251 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
25252 } else if (Subtarget.hasSSE41()) {
25253 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
25254 MVT::v2i64, ShAmt);
25255 } else {
25256 SDValue ByteShift = DAG.getTargetConstant(
25257 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
25258 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
25259 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25260 ByteShift);
25261 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
25262 ByteShift);
25263 }
25264 }
25265
25266 // Change opcode to non-immediate version.
25267 Opc = getTargetVShiftUniformOpcode(Opc, true);
25268
25269 // The return type has to be a 128-bit type with the same element
25270 // type as the input type.
25271 MVT EltVT = VT.getVectorElementType();
25272 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
25273
25274 ShAmt = DAG.getBitcast(ShVT, ShAmt);
25275 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
25276}
25277
25278/// Return Mask with the necessary casting or extending
25279/// for \p Mask according to \p MaskVT when lowering masking intrinsics
25280static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
25281 const X86Subtarget &Subtarget, SelectionDAG &DAG,
25282 const SDLoc &dl) {
25283
25284 if (isAllOnesConstant(Mask))
25285 return DAG.getConstant(1, dl, MaskVT);
25286 if (X86::isZeroNode(Mask))
25287 return DAG.getConstant(0, dl, MaskVT);
25288
25289 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
25290
25291 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
25292 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
25293 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
25294 // In 32-bit mode a bitcast of i64 is illegal, so extend/split the mask.
25295 SDValue Lo, Hi;
25296 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
25297 Lo = DAG.getBitcast(MVT::v32i1, Lo);
25298 Hi = DAG.getBitcast(MVT::v32i1, Hi);
25299 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
25300 } else {
25301 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
25302 Mask.getSimpleValueType().getSizeInBits());
25303 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
25304 // are extracted by EXTRACT_SUBVECTOR.
25305 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
25306 DAG.getBitcast(BitcastVT, Mask),
25307 DAG.getIntPtrConstant(0, dl));
25308 }
25309}
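// Illustrative sketch (added note, not from the original source): for a typical
// AVX-512 masked intrinsic with an i8 mask and MaskVT == v8i1 the path above is
// just a bitcast of the i8 to v8i1; for a narrower MaskVT such as v4i1 it
// bitcasts to v8i1 and then takes the low elements with EXTRACT_SUBVECTOR.
// Only the v64i1 mask on a 32-bit target needs the split into two v32i1 halves.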
25310
25311/// Return (and \p Op, \p Mask) for compare instructions or
25312/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
25313/// necessary casting or extending for \p Mask when lowering masking intrinsics
25314static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
25315 SDValue PreservedSrc,
25316 const X86Subtarget &Subtarget,
25317 SelectionDAG &DAG) {
25318 MVT VT = Op.getSimpleValueType();
25319 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
25320 unsigned OpcodeSelect = ISD::VSELECT;
25321 SDLoc dl(Op);
25322
25323 if (isAllOnesConstant(Mask))
25324 return Op;
25325
25326 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
25327
25328 if (PreservedSrc.isUndef())
25329 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25330 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
25331}
25332
25333/// Creates an SDNode for a predicated scalar operation.
25334/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
25335/// The mask comes in as MVT::i8 and should be transformed
25336/// to MVT::v1i1 while lowering masking intrinsics.
25337/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
25338/// "X86select" instead of "vselect". We just can't create the "vselect" node
25339/// for a scalar instruction.
25340static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
25341 SDValue PreservedSrc,
25342 const X86Subtarget &Subtarget,
25343 SelectionDAG &DAG) {
25344
25345 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
25346 if (MaskConst->getZExtValue() & 0x1)
25347 return Op;
25348
25349 MVT VT = Op.getSimpleValueType();
25350 SDLoc dl(Op);
25351
25352 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
25353 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
25354 DAG.getBitcast(MVT::v8i1, Mask),
25355 DAG.getIntPtrConstant(0, dl));
25356 if (Op.getOpcode() == X86ISD::FSETCCM ||
25357 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
25358 Op.getOpcode() == X86ISD::VFPCLASSS)
25359 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
25360
25361 if (PreservedSrc.isUndef())
25362 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
25363 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
25364}
25365
25366static int getSEHRegistrationNodeSize(const Function *Fn) {
25367 if (!Fn->hasPersonalityFn())
25368 report_fatal_error(
25369 "querying registration node size for function without personality");
25370 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
25371 // WinEHStatePass for the full struct definition.
25372 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
25373 case EHPersonality::MSVC_X86SEH: return 24;
25374 case EHPersonality::MSVC_CXX: return 16;
25375 default: break;
25376 }
25378 "can only recover FP for 32-bit MSVC EH personality functions");
25379}
25380
25381/// When the MSVC runtime transfers control to us, either to an outlined
25382/// function or when returning to a parent frame after catching an exception, we
25383/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
25384/// Here's the math:
25385/// RegNodeBase = EntryEBP - RegNodeSize
25386/// ParentFP = RegNodeBase - ParentFrameOffset
25387/// Subtracting RegNodeSize takes us to the offset of the registration node, and
25388/// subtracting the offset (negative on x86) takes us back to the parent FP.
25389static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
25390 SDValue EntryEBP) {
25391 MachineFunction &MF = DAG.getMachineFunction();
25392 SDLoc dl;
25393
25394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25395 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25396
25397 // It's possible that the parent function no longer has a personality function
25398 // if the exceptional code was optimized away, in which case we just return
25399 // the incoming EBP.
25400 if (!Fn->hasPersonalityFn())
25401 return EntryEBP;
25402
25403 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
25404 // registration, or the .set_setframe offset.
25405 MCSymbol *OffsetSym =
25408 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
25409 SDValue ParentFrameOffset =
25410 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
25411
25412 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
25413 // prologue to RBP in the parent function.
25414 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
25415 if (Subtarget.is64Bit())
25416 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
25417
25418 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
25419 // RegNodeBase = EntryEBP - RegNodeSize
25420 // ParentFP = RegNodeBase - ParentFrameOffset
25421 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
25422 DAG.getConstant(RegNodeSize, dl, PtrVT));
25423 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
25424}
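// Illustrative sketch (added note, not from the original source): a worked
// example of the arithmetic above for 32-bit MSVC C++ EH (RegNodeSize == 16),
// assuming a hypothetical ParentFrameOffset of -32:
//   RegNodeBase = EntryEBP - 16
//   ParentFP    = RegNodeBase - (-32) = EntryEBP + 16
// i.e. the negative frame offset moves the result back up past the registration
// node to the parent's frame pointer.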
25425
25426SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
25427 SelectionDAG &DAG) const {
25428 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
25429 auto isRoundModeCurDirection = [](SDValue Rnd) {
25430 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
25431 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
25432
25433 return false;
25434 };
25435 auto isRoundModeSAE = [](SDValue Rnd) {
25436 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25437 unsigned RC = C->getZExtValue();
25438 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25439 // Clear the NO_EXC bit and check remaining bits.
25440 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25441 // As a convenience we allow no other bits or explicitly
25442 // current direction.
25443 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
25444 }
25445 }
25446
25447 return false;
25448 };
25449 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
25450 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
25451 RC = C->getZExtValue();
25452 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
25453 // Clear the NO_EXC bit and check remaining bits.
25454 RC ^= X86::STATIC_ROUNDING::NO_EXC;
25455 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
25456 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
25457 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
25458 RC == X86::STATIC_ROUNDING::TO_ZERO;
25459 }
25460 }
25461
25462 return false;
25463 };
25464
25465 SDLoc dl(Op);
25466 unsigned IntNo = Op.getConstantOperandVal(0);
25467 MVT VT = Op.getSimpleValueType();
25468 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
25469
25470 // Propagate flags from original node to transformed node(s).
25471 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
25472
25473 if (IntrData) {
25474 switch(IntrData->Type) {
25475 case INTR_TYPE_1OP: {
25476 // We specify 2 possible opcodes for intrinsics with rounding modes.
25477 // First, we check if the intrinsic may have non-default rounding mode,
25478 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25479 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25480 if (IntrWithRoundingModeOpcode != 0) {
25481 SDValue Rnd = Op.getOperand(2);
25482 unsigned RC = 0;
25483 if (isRoundModeSAEToX(Rnd, RC))
25484 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25485 Op.getOperand(1),
25486 DAG.getTargetConstant(RC, dl, MVT::i32));
25487 if (!isRoundModeCurDirection(Rnd))
25488 return SDValue();
25489 }
25490 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25491 Op.getOperand(1));
25492 }
25493 case INTR_TYPE_1OP_SAE: {
25494 SDValue Sae = Op.getOperand(2);
25495
25496 unsigned Opc;
25497 if (isRoundModeCurDirection(Sae))
25498 Opc = IntrData->Opc0;
25499 else if (isRoundModeSAE(Sae))
25500 Opc = IntrData->Opc1;
25501 else
25502 return SDValue();
25503
25504 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
25505 }
25506 case INTR_TYPE_2OP: {
25507 SDValue Src2 = Op.getOperand(2);
25508
25509 // We specify 2 possible opcodes for intrinsics with rounding modes.
25510 // First, we check if the intrinsic may have non-default rounding mode,
25511 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25513 if (IntrWithRoundingModeOpcode != 0) {
25514 SDValue Rnd = Op.getOperand(3);
25515 unsigned RC = 0;
25516 if (isRoundModeSAEToX(Rnd, RC))
25517 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25518 Op.getOperand(1), Src2,
25519 DAG.getTargetConstant(RC, dl, MVT::i32));
25520 if (!isRoundModeCurDirection(Rnd))
25521 return SDValue();
25522 }
25523
25524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25525 Op.getOperand(1), Src2);
25526 }
25527 case INTR_TYPE_2OP_SAE: {
25528 SDValue Sae = Op.getOperand(3);
25529
25530 unsigned Opc;
25531 if (isRoundModeCurDirection(Sae))
25532 Opc = IntrData->Opc0;
25533 else if (isRoundModeSAE(Sae))
25534 Opc = IntrData->Opc1;
25535 else
25536 return SDValue();
25537
25538 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
25539 Op.getOperand(2));
25540 }
25541 case INTR_TYPE_3OP:
25542 case INTR_TYPE_3OP_IMM8: {
25543 SDValue Src1 = Op.getOperand(1);
25544 SDValue Src2 = Op.getOperand(2);
25545 SDValue Src3 = Op.getOperand(3);
25546
25547 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
25548 Src3.getValueType() != MVT::i8) {
25549 Src3 = DAG.getTargetConstant(Src3->getAsZExtVal() & 0xff, dl, MVT::i8);
25550 }
25551
25552 // We specify 2 possible opcodes for intrinsics with rounding modes.
25553 // First, we check if the intrinsic may have non-default rounding mode,
25554 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25555 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25556 if (IntrWithRoundingModeOpcode != 0) {
25557 SDValue Rnd = Op.getOperand(4);
25558 unsigned RC = 0;
25559 if (isRoundModeSAEToX(Rnd, RC))
25560 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25561 Src1, Src2, Src3,
25562 DAG.getTargetConstant(RC, dl, MVT::i32));
25563 if (!isRoundModeCurDirection(Rnd))
25564 return SDValue();
25565 }
25566
25567 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25568 {Src1, Src2, Src3});
25569 }
25570 case INTR_TYPE_4OP_IMM8: {
25571 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
25572 SDValue Src4 = Op.getOperand(4);
25573 if (Src4.getValueType() != MVT::i8) {
25574 Src4 = DAG.getTargetConstant(Src4->getAsZExtVal() & 0xff, dl, MVT::i8);
25575 }
25576
25577 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25578 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
25579 Src4);
25580 }
25581 case INTR_TYPE_1OP_MASK: {
25582 SDValue Src = Op.getOperand(1);
25583 SDValue PassThru = Op.getOperand(2);
25584 SDValue Mask = Op.getOperand(3);
25585 // We add rounding mode to the Node when
25586 // - RC Opcode is specified and
25587 // - RC is not "current direction".
25588 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25589 if (IntrWithRoundingModeOpcode != 0) {
25590 SDValue Rnd = Op.getOperand(4);
25591 unsigned RC = 0;
25592 if (isRoundModeSAEToX(Rnd, RC))
25593 return getVectorMaskingNode(
25594 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
25595 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
25596 Mask, PassThru, Subtarget, DAG);
25597 if (!isRoundModeCurDirection(Rnd))
25598 return SDValue();
25599 }
25600 return getVectorMaskingNode(
25601 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
25602 Subtarget, DAG);
25603 }
25604 case INTR_TYPE_1OP_MASK_SAE: {
25605 SDValue Src = Op.getOperand(1);
25606 SDValue PassThru = Op.getOperand(2);
25607 SDValue Mask = Op.getOperand(3);
25608 SDValue Rnd = Op.getOperand(4);
25609
25610 unsigned Opc;
25611 if (isRoundModeCurDirection(Rnd))
25612 Opc = IntrData->Opc0;
25613 else if (isRoundModeSAE(Rnd))
25614 Opc = IntrData->Opc1;
25615 else
25616 return SDValue();
25617
25618 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
25619 Subtarget, DAG);
25620 }
25621 case INTR_TYPE_SCALAR_MASK: {
25622 SDValue Src1 = Op.getOperand(1);
25623 SDValue Src2 = Op.getOperand(2);
25624 SDValue passThru = Op.getOperand(3);
25625 SDValue Mask = Op.getOperand(4);
25626 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
25627 // There are 2 kinds of intrinsics in this group:
25628 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
25629 // (2) With rounding mode and sae - 7 operands.
25630 bool HasRounding = IntrWithRoundingModeOpcode != 0;
25631 if (Op.getNumOperands() == (5U + HasRounding)) {
25632 if (HasRounding) {
25633 SDValue Rnd = Op.getOperand(5);
25634 unsigned RC = 0;
25635 if (isRoundModeSAEToX(Rnd, RC))
25636 return getScalarMaskingNode(
25637 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
25638 DAG.getTargetConstant(RC, dl, MVT::i32)),
25639 Mask, passThru, Subtarget, DAG);
25640 if (!isRoundModeCurDirection(Rnd))
25641 return SDValue();
25642 }
25643 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
25644 Src2),
25645 Mask, passThru, Subtarget, DAG);
25646 }
25647
25648 assert(Op.getNumOperands() == (6U + HasRounding) &&
25649 "Unexpected intrinsic form");
25650 SDValue RoundingMode = Op.getOperand(5);
25651 unsigned Opc = IntrData->Opc0;
25652 if (HasRounding) {
25653 SDValue Sae = Op.getOperand(6);
25654 if (isRoundModeSAE(Sae))
25655 Opc = IntrWithRoundingModeOpcode;
25656 else if (!isRoundModeCurDirection(Sae))
25657 return SDValue();
25658 }
25659 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
25660 Src2, RoundingMode),
25661 Mask, passThru, Subtarget, DAG);
25662 }
25663 case INTR_TYPE_SCALAR_MASK_RND: {
25664 SDValue Src1 = Op.getOperand(1);
25665 SDValue Src2 = Op.getOperand(2);
25666 SDValue passThru = Op.getOperand(3);
25667 SDValue Mask = Op.getOperand(4);
25668 SDValue Rnd = Op.getOperand(5);
25669
25670 SDValue NewOp;
25671 unsigned RC = 0;
25672 if (isRoundModeCurDirection(Rnd))
25673 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25674 else if (isRoundModeSAEToX(Rnd, RC))
25675 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25676 DAG.getTargetConstant(RC, dl, MVT::i32));
25677 else
25678 return SDValue();
25679
25680 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
25681 }
25682 case INTR_TYPE_SCALAR_MASK_SAE: {
25683 SDValue Src1 = Op.getOperand(1);
25684 SDValue Src2 = Op.getOperand(2);
25685 SDValue passThru = Op.getOperand(3);
25686 SDValue Mask = Op.getOperand(4);
25687 SDValue Sae = Op.getOperand(5);
25688 unsigned Opc;
25689 if (isRoundModeCurDirection(Sae))
25690 Opc = IntrData->Opc0;
25691 else if (isRoundModeSAE(Sae))
25692 Opc = IntrData->Opc1;
25693 else
25694 return SDValue();
25695
25696 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25697 Mask, passThru, Subtarget, DAG);
25698 }
25699 case INTR_TYPE_2OP_MASK: {
25700 SDValue Src1 = Op.getOperand(1);
25701 SDValue Src2 = Op.getOperand(2);
25702 SDValue PassThru = Op.getOperand(3);
25703 SDValue Mask = Op.getOperand(4);
25704 SDValue NewOp;
25705 if (IntrData->Opc1 != 0) {
25706 SDValue Rnd = Op.getOperand(5);
25707 unsigned RC = 0;
25708 if (isRoundModeSAEToX(Rnd, RC))
25709 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
25710 DAG.getTargetConstant(RC, dl, MVT::i32));
25711 else if (!isRoundModeCurDirection(Rnd))
25712 return SDValue();
25713 }
25714 if (!NewOp)
25715 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
25716 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25717 }
25718 case INTR_TYPE_2OP_MASK_SAE: {
25719 SDValue Src1 = Op.getOperand(1);
25720 SDValue Src2 = Op.getOperand(2);
25721 SDValue PassThru = Op.getOperand(3);
25722 SDValue Mask = Op.getOperand(4);
25723
25724 unsigned Opc = IntrData->Opc0;
25725 if (IntrData->Opc1 != 0) {
25726 SDValue Sae = Op.getOperand(5);
25727 if (isRoundModeSAE(Sae))
25728 Opc = IntrData->Opc1;
25729 else if (!isRoundModeCurDirection(Sae))
25730 return SDValue();
25731 }
25732
25733 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
25734 Mask, PassThru, Subtarget, DAG);
25735 }
25736 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
25737 SDValue Src1 = Op.getOperand(1);
25738 SDValue Src2 = Op.getOperand(2);
25739 SDValue Src3 = Op.getOperand(3);
25740 SDValue PassThru = Op.getOperand(4);
25741 SDValue Mask = Op.getOperand(5);
25742 SDValue Sae = Op.getOperand(6);
25743 unsigned Opc;
25744 if (isRoundModeCurDirection(Sae))
25745 Opc = IntrData->Opc0;
25746 else if (isRoundModeSAE(Sae))
25747 Opc = IntrData->Opc1;
25748 else
25749 return SDValue();
25750
25751 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25752 Mask, PassThru, Subtarget, DAG);
25753 }
25754 case INTR_TYPE_3OP_MASK_SAE: {
25755 SDValue Src1 = Op.getOperand(1);
25756 SDValue Src2 = Op.getOperand(2);
25757 SDValue Src3 = Op.getOperand(3);
25758 SDValue PassThru = Op.getOperand(4);
25759 SDValue Mask = Op.getOperand(5);
25760
25761 unsigned Opc = IntrData->Opc0;
25762 if (IntrData->Opc1 != 0) {
25763 SDValue Sae = Op.getOperand(6);
25764 if (isRoundModeSAE(Sae))
25765 Opc = IntrData->Opc1;
25766 else if (!isRoundModeCurDirection(Sae))
25767 return SDValue();
25768 }
25769 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
25770 Mask, PassThru, Subtarget, DAG);
25771 }
25772 case BLENDV: {
25773 SDValue Src1 = Op.getOperand(1);
25774 SDValue Src2 = Op.getOperand(2);
25775 SDValue Src3 = Op.getOperand(3);
25776
25777 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
25778 Src3 = DAG.getBitcast(MaskVT, Src3);
25779
25780 // Reverse the operands to match VSELECT order.
25781 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
25782 }
25783 case VPERM_2OP : {
25784 SDValue Src1 = Op.getOperand(1);
25785 SDValue Src2 = Op.getOperand(2);
25786
25787 // Swap Src1 and Src2 in the node creation
25788 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
25789 }
25790 case CFMA_OP_MASKZ:
25791 case CFMA_OP_MASK: {
25792 SDValue Src1 = Op.getOperand(1);
25793 SDValue Src2 = Op.getOperand(2);
25794 SDValue Src3 = Op.getOperand(3);
25795 SDValue Mask = Op.getOperand(4);
25796 MVT VT = Op.getSimpleValueType();
25797
25798 SDValue PassThru = Src3;
25799 if (IntrData->Type == CFMA_OP_MASKZ)
25800 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25801
25802 // We add rounding mode to the Node when
25803 // - RC Opcode is specified and
25804 // - RC is not "current direction".
25805 SDValue NewOp;
25806 if (IntrData->Opc1 != 0) {
25807 SDValue Rnd = Op.getOperand(5);
25808 unsigned RC = 0;
25809 if (isRoundModeSAEToX(Rnd, RC))
25810 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
25811 DAG.getTargetConstant(RC, dl, MVT::i32));
25812 else if (!isRoundModeCurDirection(Rnd))
25813 return SDValue();
25814 }
25815 if (!NewOp)
25816 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
25817 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
25818 }
25819 case IFMA_OP:
25820 // NOTE: We need to swizzle the operands to pass the multiply operands
25821 // first.
25822 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
25823 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
25824 case FPCLASSS: {
25825 SDValue Src1 = Op.getOperand(1);
25826 SDValue Imm = Op.getOperand(2);
25827 SDValue Mask = Op.getOperand(3);
25828 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
25829 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
25830 Subtarget, DAG);
25831 // Need to fill with zeros to ensure the bitcast will produce zeroes
25832 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25833 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25834 DAG.getConstant(0, dl, MVT::v8i1),
25835 FPclassMask, DAG.getIntPtrConstant(0, dl));
25836 return DAG.getBitcast(MVT::i8, Ins);
25837 }
25838
25839 case CMP_MASK_CC: {
25840 MVT MaskVT = Op.getSimpleValueType();
25841 SDValue CC = Op.getOperand(3);
25842 SDValue Mask = Op.getOperand(4);
25843 // We specify 2 possible opcodes for intrinsics with rounding modes.
25844 // First, we check if the intrinsic may have non-default rounding mode,
25845 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
25846 if (IntrData->Opc1 != 0) {
25847 SDValue Sae = Op.getOperand(5);
25848 if (isRoundModeSAE(Sae))
25849 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
25850 Op.getOperand(2), CC, Mask, Sae);
25851 if (!isRoundModeCurDirection(Sae))
25852 return SDValue();
25853 }
25854 // Default rounding mode.
25855 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
25856 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
25857 }
25858 case CMP_MASK_SCALAR_CC: {
25859 SDValue Src1 = Op.getOperand(1);
25860 SDValue Src2 = Op.getOperand(2);
25861 SDValue CC = Op.getOperand(3);
25862 SDValue Mask = Op.getOperand(4);
25863
25864 SDValue Cmp;
25865 if (IntrData->Opc1 != 0) {
25866 SDValue Sae = Op.getOperand(5);
25867 if (isRoundModeSAE(Sae))
25868 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
25869 else if (!isRoundModeCurDirection(Sae))
25870 return SDValue();
25871 }
25872 // Default rounding mode.
25873 if (!Cmp.getNode())
25874 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
25875
25876 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
25877 Subtarget, DAG);
25878 // Need to fill with zeros to ensure the bitcast will produce zeroes
25879 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25880 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
25881 DAG.getConstant(0, dl, MVT::v8i1),
25882 CmpMask, DAG.getIntPtrConstant(0, dl));
25883 return DAG.getBitcast(MVT::i8, Ins);
25884 }
25885 case COMI: { // Comparison intrinsics
25886 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
25887 SDValue LHS = Op.getOperand(1);
25888 SDValue RHS = Op.getOperand(2);
25889 // Some conditions require the operands to be swapped.
25890 if (CC == ISD::SETLT || CC == ISD::SETLE)
25891 std::swap(LHS, RHS);
25892
25893 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
25894 SDValue SetCC;
25895 switch (CC) {
25896 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
25897 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
25898 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
25899 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
25900 break;
25901 }
25902 case ISD::SETNE: { // (ZF = 1 or PF = 1)
25903 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
25904 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
25905 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
25906 break;
25907 }
25908 case ISD::SETGT: // (CF = 0 and ZF = 0)
25909 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
25910 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
25911 break;
25912 }
25913 case ISD::SETGE: // CF = 0
25914 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
25915 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
25916 break;
25917 default:
25918 llvm_unreachable("Unexpected illegal condition!");
25919 }
25920 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
25921 }
25922 case COMI_RM: { // Comparison intrinsics with Sae
25923 SDValue LHS = Op.getOperand(1);
25924 SDValue RHS = Op.getOperand(2);
25925 unsigned CondVal = Op.getConstantOperandVal(3);
25926 SDValue Sae = Op.getOperand(4);
25927
25928 SDValue FCmp;
25929 if (isRoundModeCurDirection(Sae))
25930 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
25931 DAG.getTargetConstant(CondVal, dl, MVT::i8));
25932 else if (isRoundModeSAE(Sae))
25933 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
25934 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
25935 else
25936 return SDValue();
25937 // Need to fill with zeros to ensure the bitcast will produce zeroes
25938 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
25939 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
25940 DAG.getConstant(0, dl, MVT::v16i1),
25941 FCmp, DAG.getIntPtrConstant(0, dl));
25942 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
25943 DAG.getBitcast(MVT::i16, Ins));
25944 }
25945 case VSHIFT: {
25946 SDValue SrcOp = Op.getOperand(1);
25947 SDValue ShAmt = Op.getOperand(2);
25948 assert(ShAmt.getValueType() == MVT::i32 &&
25949 "Unexpected VSHIFT amount type");
25950
25951 // Catch shift-by-constant.
25952 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
25953 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
25954 Op.getSimpleValueType(), SrcOp,
25955 CShAmt->getZExtValue(), DAG);
25956
25957 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
25958 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
25959 SrcOp, ShAmt, 0, Subtarget, DAG);
25960 }
25961 case COMPRESS_EXPAND_IN_REG: {
25962 SDValue Mask = Op.getOperand(3);
25963 SDValue DataToCompress = Op.getOperand(1);
25964 SDValue PassThru = Op.getOperand(2);
25965 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
25966 return Op.getOperand(1);
25967
25968 // Avoid false dependency.
25969 if (PassThru.isUndef())
25970 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
25971
25972 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
25973 Mask);
25974 }
25975 case FIXUPIMM:
25976 case FIXUPIMM_MASKZ: {
25977 SDValue Src1 = Op.getOperand(1);
25978 SDValue Src2 = Op.getOperand(2);
25979 SDValue Src3 = Op.getOperand(3);
25980 SDValue Imm = Op.getOperand(4);
25981 SDValue Mask = Op.getOperand(5);
25982 SDValue Passthru = (IntrData->Type == FIXUPIMM)
25983 ? Src1
25984 : getZeroVector(VT, Subtarget, DAG, dl);
25985
25986 unsigned Opc = IntrData->Opc0;
25987 if (IntrData->Opc1 != 0) {
25988 SDValue Sae = Op.getOperand(6);
25989 if (isRoundModeSAE(Sae))
25990 Opc = IntrData->Opc1;
25991 else if (!isRoundModeCurDirection(Sae))
25992 return SDValue();
25993 }
25994
25995 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
25996
25997 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
25998 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
25999
26000 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
26001 }
26002 case ROUNDP: {
26003 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
26004 // Clear the upper bits of the rounding immediate so that the legacy
26005 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26006 uint64_t Round = Op.getConstantOperandVal(2);
26007 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26008 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26009 Op.getOperand(1), RoundingMode);
26010 }
26011 case ROUNDS: {
26012 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
26013 // Clear the upper bits of the rounding immediate so that the legacy
26014 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
26015 uint64_t Round = Op.getConstantOperandVal(3);
26016 SDValue RoundingMode = DAG.getTargetConstant(Round & 0xf, dl, MVT::i32);
26017 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26018 Op.getOperand(1), Op.getOperand(2), RoundingMode);
26019 }
26020 case BEXTRI: {
26021 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
26022
26023 uint64_t Imm = Op.getConstantOperandVal(2);
26024 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
26025 Op.getValueType());
26026 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26027 Op.getOperand(1), Control);
26028 }
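// Editor's illustration (not part of the upstream source; bextri() is a
// hypothetical helper): the BEXTRI control word keeps the start bit in bits
// [7:0] and the field length in bits [15:8], which is why only the low 16
// bits of the immediate are preserved above. A hedged sketch of the extract:
//   uint64_t bextri(uint64_t src, uint16_t ctrl) {
//     unsigned start = ctrl & 0xff, len = (ctrl >> 8) & 0xff;
//     if (len == 0 || start >= 64)
//       return 0;
//     uint64_t v = src >> start;
//     return len >= 64 ? v : v & ((1ULL << len) - 1);
//   }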
26029 // ADC/SBB
26030 case ADX: {
26031 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
26032 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
26033
26034 SDValue Res;
26035 // If the carry in is zero, then we should just use ADD/SUB instead of
26036 // ADC/SBB.
26037 if (isNullConstant(Op.getOperand(1))) {
26038 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
26039 Op.getOperand(3));
26040 } else {
26041 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
26042 DAG.getConstant(-1, dl, MVT::i8));
26043 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
26044 Op.getOperand(3), GenCF.getValue(1));
26045 }
26046 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
26047 SDValue Results[] = { SetCC, Res };
26048 return DAG.getMergeValues(Results, dl);
26049 }
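// Editor's illustration (not part of the upstream source; materializeCF() is
// a hypothetical helper): when the incoming carry is not a constant zero, the
// lowering above regenerates CF by adding 0xFF (-1 in i8) to it, so CF is set
// exactly when the carry-in byte is nonzero:
//   bool materializeCF(uint8_t carry_in) {
//     unsigned sum = unsigned(carry_in) + 0xffu; // the X86ISD::ADD with -1
//     return sum > 0xffu;                        // carry out iff carry_in != 0
//   }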
26050 case CVTPD2PS_MASK:
26051 case CVTPD2DQ_MASK:
26052 case CVTQQ2PS_MASK:
26053 case TRUNCATE_TO_REG: {
26054 SDValue Src = Op.getOperand(1);
26055 SDValue PassThru = Op.getOperand(2);
26056 SDValue Mask = Op.getOperand(3);
26057
26058 if (isAllOnesConstant(Mask))
26059 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26060
26061 MVT SrcVT = Src.getSimpleValueType();
26062 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26063 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26064 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
26065 {Src, PassThru, Mask});
26066 }
26067 case CVTPS2PH_MASK: {
26068 SDValue Src = Op.getOperand(1);
26069 SDValue Rnd = Op.getOperand(2);
26070 SDValue PassThru = Op.getOperand(3);
26071 SDValue Mask = Op.getOperand(4);
26072
26073 unsigned RC = 0;
26074 unsigned Opc = IntrData->Opc0;
26075 bool SAE = Src.getValueType().is512BitVector() &&
26076 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
26077 if (SAE) {
26078 Opc = X86ISD::CVTPS2PH_SAE;
26079 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
26080 }
26081
26082 if (isAllOnesConstant(Mask))
26083 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
26084
26085 if (SAE)
26086 Opc = X86ISD::MCVTPS2PH_SAE;
26087 else
26088 Opc = IntrData->Opc1;
26089 MVT SrcVT = Src.getSimpleValueType();
26090 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
26091 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26092 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
26093 }
26094 case CVTNEPS2BF16_MASK: {
26095 SDValue Src = Op.getOperand(1);
26096 SDValue PassThru = Op.getOperand(2);
26097 SDValue Mask = Op.getOperand(3);
26098
26099 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
26100 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
26101
26102 // Break false dependency.
26103 if (PassThru.isUndef())
26104 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
26105
26106 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
26107 Mask);
26108 }
26109 default:
26110 break;
26111 }
26112 }
26113
26114 switch (IntNo) {
26115 default: return SDValue(); // Don't custom lower most intrinsics.
26116
26117 // ptest and testp intrinsics. The intrinsics these come from are designed to
26118 // return an integer value, not just an instruction, so lower them to the ptest
26119 // or testp pattern and a setcc for the result.
26120 case Intrinsic::x86_avx512_ktestc_b:
26121 case Intrinsic::x86_avx512_ktestc_w:
26122 case Intrinsic::x86_avx512_ktestc_d:
26123 case Intrinsic::x86_avx512_ktestc_q:
26124 case Intrinsic::x86_avx512_ktestz_b:
26125 case Intrinsic::x86_avx512_ktestz_w:
26126 case Intrinsic::x86_avx512_ktestz_d:
26127 case Intrinsic::x86_avx512_ktestz_q:
26128 case Intrinsic::x86_sse41_ptestz:
26129 case Intrinsic::x86_sse41_ptestc:
26130 case Intrinsic::x86_sse41_ptestnzc:
26131 case Intrinsic::x86_avx_ptestz_256:
26132 case Intrinsic::x86_avx_ptestc_256:
26133 case Intrinsic::x86_avx_ptestnzc_256:
26134 case Intrinsic::x86_avx_vtestz_ps:
26135 case Intrinsic::x86_avx_vtestc_ps:
26136 case Intrinsic::x86_avx_vtestnzc_ps:
26137 case Intrinsic::x86_avx_vtestz_pd:
26138 case Intrinsic::x86_avx_vtestc_pd:
26139 case Intrinsic::x86_avx_vtestnzc_pd:
26140 case Intrinsic::x86_avx_vtestz_ps_256:
26141 case Intrinsic::x86_avx_vtestc_ps_256:
26142 case Intrinsic::x86_avx_vtestnzc_ps_256:
26143 case Intrinsic::x86_avx_vtestz_pd_256:
26144 case Intrinsic::x86_avx_vtestc_pd_256:
26145 case Intrinsic::x86_avx_vtestnzc_pd_256: {
26146 unsigned TestOpc = X86ISD::PTEST;
26147 X86::CondCode X86CC;
26148 switch (IntNo) {
26149 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
26150 case Intrinsic::x86_avx512_ktestc_b:
26151 case Intrinsic::x86_avx512_ktestc_w:
26152 case Intrinsic::x86_avx512_ktestc_d:
26153 case Intrinsic::x86_avx512_ktestc_q:
26154 // CF = 1
26155 TestOpc = X86ISD::KTEST;
26156 X86CC = X86::COND_B;
26157 break;
26158 case Intrinsic::x86_avx512_ktestz_b:
26159 case Intrinsic::x86_avx512_ktestz_w:
26160 case Intrinsic::x86_avx512_ktestz_d:
26161 case Intrinsic::x86_avx512_ktestz_q:
26162 TestOpc = X86ISD::KTEST;
26163 X86CC = X86::COND_E;
26164 break;
26165 case Intrinsic::x86_avx_vtestz_ps:
26166 case Intrinsic::x86_avx_vtestz_pd:
26167 case Intrinsic::x86_avx_vtestz_ps_256:
26168 case Intrinsic::x86_avx_vtestz_pd_256:
26169 TestOpc = X86ISD::TESTP;
26170 [[fallthrough]];
26171 case Intrinsic::x86_sse41_ptestz:
26172 case Intrinsic::x86_avx_ptestz_256:
26173 // ZF = 1
26174 X86CC = X86::COND_E;
26175 break;
26176 case Intrinsic::x86_avx_vtestc_ps:
26177 case Intrinsic::x86_avx_vtestc_pd:
26178 case Intrinsic::x86_avx_vtestc_ps_256:
26179 case Intrinsic::x86_avx_vtestc_pd_256:
26180 TestOpc = X86ISD::TESTP;
26181 [[fallthrough]];
26182 case Intrinsic::x86_sse41_ptestc:
26183 case Intrinsic::x86_avx_ptestc_256:
26184 // CF = 1
26185 X86CC = X86::COND_B;
26186 break;
26187 case Intrinsic::x86_avx_vtestnzc_ps:
26188 case Intrinsic::x86_avx_vtestnzc_pd:
26189 case Intrinsic::x86_avx_vtestnzc_ps_256:
26190 case Intrinsic::x86_avx_vtestnzc_pd_256:
26191 TestOpc = X86ISD::TESTP;
26192 [[fallthrough]];
26193 case Intrinsic::x86_sse41_ptestnzc:
26194 case Intrinsic::x86_avx_ptestnzc_256:
26195 // ZF and CF = 0
26196 X86CC = X86::COND_A;
26197 break;
26198 }
26199
26200 SDValue LHS = Op.getOperand(1);
26201 SDValue RHS = Op.getOperand(2);
26202 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
26203 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
26204 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26205 }
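// Editor's illustration (not part of the upstream source; helper names are
// hypothetical): PTEST/TESTP/KTEST only set flags, so every intrinsic above
// becomes one test node plus a setcc on the chosen condition. Roughly, per
// the _mm_testz/testc/testnzc convention, with 64-bit lanes standing in for
// the vector:
//   bool testz(uint64_t a, uint64_t b)   { return (a & b) == 0; }   // ZF
//   bool testc(uint64_t a, uint64_t b)   { return (~a & b) == 0; }  // CF
//   bool testnzc(uint64_t a, uint64_t b) { return !testz(a, b) && !testc(a, b); }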
26206
26207 case Intrinsic::x86_sse42_pcmpistria128:
26208 case Intrinsic::x86_sse42_pcmpestria128:
26209 case Intrinsic::x86_sse42_pcmpistric128:
26210 case Intrinsic::x86_sse42_pcmpestric128:
26211 case Intrinsic::x86_sse42_pcmpistrio128:
26212 case Intrinsic::x86_sse42_pcmpestrio128:
26213 case Intrinsic::x86_sse42_pcmpistris128:
26214 case Intrinsic::x86_sse42_pcmpestris128:
26215 case Intrinsic::x86_sse42_pcmpistriz128:
26216 case Intrinsic::x86_sse42_pcmpestriz128: {
26217 unsigned Opcode;
26218 X86::CondCode X86CC;
26219 switch (IntNo) {
26220 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26221 case Intrinsic::x86_sse42_pcmpistria128:
26222 Opcode = X86ISD::PCMPISTR;
26223 X86CC = X86::COND_A;
26224 break;
26225 case Intrinsic::x86_sse42_pcmpestria128:
26226 Opcode = X86ISD::PCMPESTR;
26227 X86CC = X86::COND_A;
26228 break;
26229 case Intrinsic::x86_sse42_pcmpistric128:
26230 Opcode = X86ISD::PCMPISTR;
26231 X86CC = X86::COND_B;
26232 break;
26233 case Intrinsic::x86_sse42_pcmpestric128:
26234 Opcode = X86ISD::PCMPESTR;
26235 X86CC = X86::COND_B;
26236 break;
26237 case Intrinsic::x86_sse42_pcmpistrio128:
26238 Opcode = X86ISD::PCMPISTR;
26239 X86CC = X86::COND_O;
26240 break;
26241 case Intrinsic::x86_sse42_pcmpestrio128:
26242 Opcode = X86ISD::PCMPESTR;
26243 X86CC = X86::COND_O;
26244 break;
26245 case Intrinsic::x86_sse42_pcmpistris128:
26246 Opcode = X86ISD::PCMPISTR;
26247 X86CC = X86::COND_S;
26248 break;
26249 case Intrinsic::x86_sse42_pcmpestris128:
26250 Opcode = X86ISD::PCMPESTR;
26251 X86CC = X86::COND_S;
26252 break;
26253 case Intrinsic::x86_sse42_pcmpistriz128:
26254 Opcode = X86ISD::PCMPISTR;
26255 X86CC = X86::COND_E;
26256 break;
26257 case Intrinsic::x86_sse42_pcmpestriz128:
26258 Opcode = X86ISD::PCMPESTR;
26259 X86CC = X86::COND_E;
26260 break;
26261 }
26262 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26263 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26264 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
26265 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
26266 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
26267 }
26268
26269 case Intrinsic::x86_sse42_pcmpistri128:
26270 case Intrinsic::x86_sse42_pcmpestri128: {
26271 unsigned Opcode;
26272 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
26273 Opcode = X86ISD::PCMPISTR;
26274 else
26275 Opcode = X86ISD::PCMPESTR;
26276
26277 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26278 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26279 return DAG.getNode(Opcode, dl, VTs, NewOps);
26280 }
26281
26282 case Intrinsic::x86_sse42_pcmpistrm128:
26283 case Intrinsic::x86_sse42_pcmpestrm128: {
26284 unsigned Opcode;
26285 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
26286 Opcode = X86ISD::PCMPISTR;
26287 else
26288 Opcode = X86ISD::PCMPESTR;
26289
26290 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
26291 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
26292 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
26293 }
26294
26295 case Intrinsic::eh_sjlj_lsda: {
26296 MachineFunction &MF = DAG.getMachineFunction();
26297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26298 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26299 auto &Context = MF.getMMI().getContext();
26300 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
26301 Twine(MF.getFunctionNumber()));
26302 return DAG.getNode(getGlobalWrapperKind(nullptr, /*OpFlags=*/0), dl, VT,
26303 DAG.getMCSymbol(S, PtrVT));
26304 }
26305
26306 case Intrinsic::x86_seh_lsda: {
26307 // Compute the symbol for the LSDA. We know it'll get emitted later.
26309 SDValue Op1 = Op.getOperand(1);
26310 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
26313
26314 // Generate a simple absolute symbol reference. This intrinsic is only
26315 // supported on 32-bit Windows, which isn't PIC.
26316 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
26317 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
26318 }
26319
26320 case Intrinsic::eh_recoverfp: {
26321 SDValue FnOp = Op.getOperand(1);
26322 SDValue IncomingFPOp = Op.getOperand(2);
26323 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
26324 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
26325 if (!Fn)
26327 "llvm.eh.recoverfp must take a function as the first argument");
26328 return recoverFramePointer(DAG, Fn, IncomingFPOp);
26329 }
26330
26331 case Intrinsic::localaddress: {
26332 // Returns one of the stack, base, or frame pointer registers, depending on
26333 // which is used to reference local variables.
26334 MachineFunction &MF = DAG.getMachineFunction();
26335 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26336 unsigned Reg;
26337 if (RegInfo->hasBasePointer(MF))
26338 Reg = RegInfo->getBaseRegister();
26339 else { // Handles the SP or FP case.
26340 bool CantUseFP = RegInfo->hasStackRealignment(MF);
26341 if (CantUseFP)
26342 Reg = RegInfo->getPtrSizedStackRegister(MF);
26343 else
26344 Reg = RegInfo->getPtrSizedFrameRegister(MF);
26345 }
26346 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
26347 }
26348 case Intrinsic::x86_avx512_vp2intersect_q_512:
26349 case Intrinsic::x86_avx512_vp2intersect_q_256:
26350 case Intrinsic::x86_avx512_vp2intersect_q_128:
26351 case Intrinsic::x86_avx512_vp2intersect_d_512:
26352 case Intrinsic::x86_avx512_vp2intersect_d_256:
26353 case Intrinsic::x86_avx512_vp2intersect_d_128: {
26354 MVT MaskVT = Op.getSimpleValueType();
26355
26356 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
26357 SDLoc DL(Op);
26358
26359 SDValue Operation =
26360 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
26361 Op->getOperand(1), Op->getOperand(2));
26362
26363 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
26364 MaskVT, Operation);
26365 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
26366 MaskVT, Operation);
26367 return DAG.getMergeValues({Result0, Result1}, DL);
26368 }
26369 case Intrinsic::x86_mmx_pslli_w:
26370 case Intrinsic::x86_mmx_pslli_d:
26371 case Intrinsic::x86_mmx_pslli_q:
26372 case Intrinsic::x86_mmx_psrli_w:
26373 case Intrinsic::x86_mmx_psrli_d:
26374 case Intrinsic::x86_mmx_psrli_q:
26375 case Intrinsic::x86_mmx_psrai_w:
26376 case Intrinsic::x86_mmx_psrai_d: {
26377 SDLoc DL(Op);
26378 SDValue ShAmt = Op.getOperand(2);
26379 // If the argument is a constant, convert it to a target constant.
26380 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
26381 // Clamp out of bounds shift amounts since they will otherwise be masked
26382 // to 8-bits which may make it no longer out of bounds.
26383 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
26384 if (ShiftAmount == 0)
26385 return Op.getOperand(1);
26386
26387 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26388 Op.getOperand(0), Op.getOperand(1),
26389 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
26390 }
26391
26392 unsigned NewIntrinsic;
26393 switch (IntNo) {
26394 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
26395 case Intrinsic::x86_mmx_pslli_w:
26396 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
26397 break;
26398 case Intrinsic::x86_mmx_pslli_d:
26399 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
26400 break;
26401 case Intrinsic::x86_mmx_pslli_q:
26402 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
26403 break;
26404 case Intrinsic::x86_mmx_psrli_w:
26405 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
26406 break;
26407 case Intrinsic::x86_mmx_psrli_d:
26408 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
26409 break;
26410 case Intrinsic::x86_mmx_psrli_q:
26411 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
26412 break;
26413 case Intrinsic::x86_mmx_psrai_w:
26414 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
26415 break;
26416 case Intrinsic::x86_mmx_psrai_d:
26417 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
26418 break;
26419 }
26420
26421 // The vector shift intrinsics with scalars use 32-bit shift amounts, but
26422 // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
26423 // MMX register.
26424 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
26425 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
26426 DAG.getTargetConstant(NewIntrinsic, DL,
26427 getPointerTy(DAG.getDataLayout())),
26428 Op.getOperand(1), ShAmt);
26429 }
26430 case Intrinsic::thread_pointer: {
26431 if (Subtarget.isTargetELF()) {
26432 SDLoc dl(Op);
26433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
26434 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
26435 Value *Ptr = Constant::getNullValue(PointerType::get(
26436 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
26437 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
26438 DAG.getConstant(0, dl, PtrVT), MachinePointerInfo(Ptr));
26439 }
26441 "Target OS doesn't support __builtin_thread_pointer() yet.");
26442 }
26443 }
26444}
26445
26446 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26447 SDValue Src, SDValue Mask, SDValue Base,
26448 SDValue Index, SDValue ScaleOp, SDValue Chain,
26449 const X86Subtarget &Subtarget) {
26450 SDLoc dl(Op);
26451 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26452 // Scale must be constant.
26453 if (!C)
26454 return SDValue();
26455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26456 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26457 TLI.getPointerTy(DAG.getDataLayout()));
26458 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
26459 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26460 // If source is undef or we know it won't be used, use a zero vector
26461 // to break register dependency.
26462 // TODO: use undef instead and let BreakFalseDeps deal with it?
26463 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26464 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26465
26466 // Cast mask to an integer type.
26467 Mask = DAG.getBitcast(MaskVT, Mask);
26468
26469 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26470
26471 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26472 SDValue Res =
26473 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26474 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26475 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26476}
26477
26478 static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
26479 SDValue Src, SDValue Mask, SDValue Base,
26480 SDValue Index, SDValue ScaleOp, SDValue Chain,
26481 const X86Subtarget &Subtarget) {
26482 MVT VT = Op.getSimpleValueType();
26483 SDLoc dl(Op);
26484 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26485 // Scale must be constant.
26486 if (!C)
26487 return SDValue();
26488 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26489 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26490 TLI.getPointerTy(DAG.getDataLayout()));
26491 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26492 VT.getVectorNumElements());
26493 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26494
26495 // We support two versions of the gather intrinsics. One with scalar mask and
26496 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26497 if (Mask.getValueType() != MaskVT)
26498 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26499
26500 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
26501 // If source is undef or we know it won't be used, use a zero vector
26502 // to break register dependency.
26503 // TODO: use undef instead and let BreakFalseDeps deal with it?
26504 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
26505 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
26506
26507 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26508
26509 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
26510 SDValue Res =
26511 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
26512 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26513 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
26514}
26515
26516static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26517 SDValue Src, SDValue Mask, SDValue Base,
26518 SDValue Index, SDValue ScaleOp, SDValue Chain,
26519 const X86Subtarget &Subtarget) {
26520 SDLoc dl(Op);
26521 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26522 // Scale must be constant.
26523 if (!C)
26524 return SDValue();
26525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26526 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26527 TLI.getPointerTy(DAG.getDataLayout()));
26528 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
26529 Src.getSimpleValueType().getVectorNumElements());
26530 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
26531
26532 // We support two versions of the scatter intrinsics. One with scalar mask and
26533 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
26534 if (Mask.getValueType() != MaskVT)
26535 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26536
26537 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26538
26539 SDVTList VTs = DAG.getVTList(MVT::Other);
26540 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
26541 SDValue Res =
26542 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
26543 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
26544 return Res;
26545}
26546
26547static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
26548 SDValue Mask, SDValue Base, SDValue Index,
26549 SDValue ScaleOp, SDValue Chain,
26550 const X86Subtarget &Subtarget) {
26551 SDLoc dl(Op);
26552 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
26553 // Scale must be constant.
26554 if (!C)
26555 return SDValue();
26556 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26557 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
26558 TLI.getPointerTy(DAG.getDataLayout()));
26559 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
26560 SDValue Segment = DAG.getRegister(0, MVT::i32);
26561 MVT MaskVT =
26562 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
26563 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26564 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
26565 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
26566 return SDValue(Res, 0);
26567}
26568
26569/// Handles the lowering of builtin intrinsics with chain that return their
26570/// value into registers EDX:EAX.
26571 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
26572/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
26573/// TargetOpcode.
26574/// Returns a Glue value which can be used to add extra copy-from-reg if the
26575 /// expanded intrinsic implicitly defines extra registers (i.e. not just
26576/// EDX:EAX).
26577 static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
26578 SelectionDAG &DAG,
26579 unsigned TargetOpcode,
26580 unsigned SrcReg,
26581 const X86Subtarget &Subtarget,
26582 SmallVectorImpl<SDValue> &Results) {
26583 SDValue Chain = N->getOperand(0);
26584 SDValue Glue;
26585
26586 if (SrcReg) {
26587 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
26588 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
26589 Glue = Chain.getValue(1);
26590 }
26591
26592 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
26593 SDValue N1Ops[] = {Chain, Glue};
26594 SDNode *N1 = DAG.getMachineNode(
26595 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
26596 Chain = SDValue(N1, 0);
26597
26598 // Reads the content of XCR and returns it in registers EDX:EAX.
26599 SDValue LO, HI;
26600 if (Subtarget.is64Bit()) {
26601 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
26602 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
26603 LO.getValue(2));
26604 } else {
26605 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
26606 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
26607 LO.getValue(2));
26608 }
26609 Chain = HI.getValue(1);
26610 Glue = HI.getValue(2);
26611
26612 if (Subtarget.is64Bit()) {
26613 // Merge the two 32-bit values into a 64-bit one.
26614 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
26615 DAG.getConstant(32, DL, MVT::i8));
26616 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
26617 Results.push_back(Chain);
26618 return Glue;
26619 }
26620
26621 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
26622 SDValue Ops[] = { LO, HI };
26623 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
26624 Results.push_back(Pair);
26625 Results.push_back(Chain);
26626 return Glue;
26627}
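// Editor's illustration (not part of the upstream source; combineEdxEax() is
// a hypothetical helper): the 64-bit path above rebuilds the logical 64-bit
// result from the two 32-bit halves delivered in EDX:EAX with a shift and an
// or, i.e. the arithmetic being modeled is:
//   uint64_t combineEdxEax(uint32_t eax, uint32_t edx) {
//     return (uint64_t(edx) << 32) | eax;
//   }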
26628
26629/// Handles the lowering of builtin intrinsics that read the time stamp counter
26630/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
26631/// READCYCLECOUNTER nodes.
26632static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
26633 SelectionDAG &DAG,
26634 const X86Subtarget &Subtarget,
26635 SmallVectorImpl<SDValue> &Results) {
26636 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
26637 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
26638 // and the EAX register is loaded with the low-order 32 bits.
26639 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
26640 /* NoRegister */0, Subtarget,
26641 Results);
26642 if (Opcode != X86::RDTSCP)
26643 return;
26644
26645 SDValue Chain = Results[1];
26646 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
26647 // the ECX register. Add 'ecx' explicitly to the chain.
26648 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
26649 Results[1] = ecx;
26650 Results.push_back(ecx.getValue(1));
26651}
26652
26653 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
26654 SelectionDAG &DAG) {
26655 SmallVector<SDValue, 3> Results;
26656 SDLoc DL(Op);
26657 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
26658 Results);
26659 return DAG.getMergeValues(Results, DL);
26660}
26661
26662 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
26663 MachineFunction &MF = DAG.getMachineFunction();
26664 SDValue Chain = Op.getOperand(0);
26665 SDValue RegNode = Op.getOperand(2);
26666 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26667 if (!EHInfo)
26668 report_fatal_error("EH registrations only live in functions using WinEH");
26669
26670 // Cast the operand to an alloca, and remember the frame index.
26671 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
26672 if (!FINode)
26673 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
26674 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
26675
26676 // Return the chain operand without making any DAG nodes.
26677 return Chain;
26678}
26679
26680 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
26681 MachineFunction &MF = DAG.getMachineFunction();
26682 SDValue Chain = Op.getOperand(0);
26683 SDValue EHGuard = Op.getOperand(2);
26684 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
26685 if (!EHInfo)
26686 report_fatal_error("EHGuard only live in functions using WinEH");
26687
26688 // Cast the operand to an alloca, and remember the frame index.
26689 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
26690 if (!FINode)
26691 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
26692 EHInfo->EHGuardFrameIndex = FINode->getIndex();
26693
26694 // Return the chain operand without making any DAG nodes.
26695 return Chain;
26696}
26697
26698/// Emit Truncating Store with signed or unsigned saturation.
26699static SDValue
26700EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val,
26701 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
26702 SelectionDAG &DAG) {
26703 SDVTList VTs = DAG.getVTList(MVT::Other);
26704 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
26705 SDValue Ops[] = { Chain, Val, Ptr, Undef };
26706 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
26707 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26708}
26709
26710/// Emit Masked Truncating Store with signed or unsigned saturation.
26711static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain,
26712 const SDLoc &DL,
26713 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
26714 MachineMemOperand *MMO, SelectionDAG &DAG) {
26715 SDVTList VTs = DAG.getVTList(MVT::Other);
26716 SDValue Ops[] = { Chain, Val, Ptr, Mask };
26717 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
26718 return DAG.getMemIntrinsicNode(Opc, DL, VTs, Ops, MemVT, MMO);
26719}
26720
26721 bool X86::isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget,
26722 const MachineFunction &MF) {
26723 if (!Subtarget.is64Bit())
26724 return false;
26725 // 64-bit targets support extended Swift async frame setup,
26726 // except for targets that use the windows 64 prologue.
26727 return !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
26728}
26729
26730 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
26731 SelectionDAG &DAG) {
26732 unsigned IntNo = Op.getConstantOperandVal(1);
26733 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
26734 if (!IntrData) {
26735 switch (IntNo) {
26736
26737 case Intrinsic::swift_async_context_addr: {
26738 SDLoc dl(Op);
26739 auto &MF = DAG.getMachineFunction();
26740 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
26741 if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) {
26742 MF.getFrameInfo().setFrameAddressIsTaken(true);
26743 X86FI->setHasSwiftAsyncContext(true);
26744 SDValue Chain = Op->getOperand(0);
26745 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
26746 SDValue Result =
26747 SDValue(DAG.getMachineNode(X86::SUB64ri32, dl, MVT::i64, CopyRBP,
26748 DAG.getTargetConstant(8, dl, MVT::i32)),
26749 0);
26750 // Return { result, chain }.
26751 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26752 CopyRBP.getValue(1));
26753 } else {
26754 // No special extended frame, create or reuse an existing stack slot.
26755 int PtrSize = Subtarget.is64Bit() ? 8 : 4;
26756 if (!X86FI->getSwiftAsyncContextFrameIdx())
26757 X86FI->setSwiftAsyncContextFrameIdx(
26758 MF.getFrameInfo().CreateStackObject(PtrSize, Align(PtrSize),
26759 false));
26760 SDValue Result =
26761 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(),
26762 PtrSize == 8 ? MVT::i64 : MVT::i32);
26763 // Return { result, chain }.
26764 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
26765 Op->getOperand(0));
26766 }
26767 }
26768
26769 case llvm::Intrinsic::x86_seh_ehregnode:
26770 return MarkEHRegistrationNode(Op, DAG);
26771 case llvm::Intrinsic::x86_seh_ehguard:
26772 return MarkEHGuard(Op, DAG);
26773 case llvm::Intrinsic::x86_rdpkru: {
26774 SDLoc dl(Op);
26775 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26776 // Create a RDPKRU node and pass 0 to the ECX parameter.
26777 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
26778 DAG.getConstant(0, dl, MVT::i32));
26779 }
26780 case llvm::Intrinsic::x86_wrpkru: {
26781 SDLoc dl(Op);
26782 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
26783 // to the EDX and ECX parameters.
26784 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
26785 Op.getOperand(0), Op.getOperand(2),
26786 DAG.getConstant(0, dl, MVT::i32),
26787 DAG.getConstant(0, dl, MVT::i32));
26788 }
26789 case llvm::Intrinsic::asan_check_memaccess: {
26790 // Mark this as adjustsStack because it will be lowered to a call.
26791 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
26792 // Don't do anything here, we will expand these intrinsics out later.
26793 return Op;
26794 }
26795 case llvm::Intrinsic::x86_flags_read_u32:
26796 case llvm::Intrinsic::x86_flags_read_u64:
26797 case llvm::Intrinsic::x86_flags_write_u32:
26798 case llvm::Intrinsic::x86_flags_write_u64: {
26799 // We need a frame pointer because this will get lowered to a PUSH/POP
26800 // sequence.
26801 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
26802 MFI.setHasCopyImplyingStackAdjustment(true);
26803 // Don't do anything here, we will expand these intrinsics out later
26804 // during FinalizeISel in EmitInstrWithCustomInserter.
26805 return Op;
26806 }
26807 case Intrinsic::x86_lwpins32:
26808 case Intrinsic::x86_lwpins64:
26809 case Intrinsic::x86_umwait:
26810 case Intrinsic::x86_tpause: {
26811 SDLoc dl(Op);
26812 SDValue Chain = Op->getOperand(0);
26813 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26814 unsigned Opcode;
26815
26816 switch (IntNo) {
26817 default: llvm_unreachable("Impossible intrinsic");
26818 case Intrinsic::x86_umwait:
26819 Opcode = X86ISD::UMWAIT;
26820 break;
26821 case Intrinsic::x86_tpause:
26822 Opcode = X86ISD::TPAUSE;
26823 break;
26824 case Intrinsic::x86_lwpins32:
26825 case Intrinsic::x86_lwpins64:
26826 Opcode = X86ISD::LWPINS;
26827 break;
26828 }
26829
26830 SDValue Operation =
26831 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
26832 Op->getOperand(3), Op->getOperand(4));
26833 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26834 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26835 Operation.getValue(1));
26836 }
26837 case Intrinsic::x86_enqcmd:
26838 case Intrinsic::x86_enqcmds: {
26839 SDLoc dl(Op);
26840 SDValue Chain = Op.getOperand(0);
26841 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26842 unsigned Opcode;
26843 switch (IntNo) {
26844 default: llvm_unreachable("Impossible intrinsic!");
26845 case Intrinsic::x86_enqcmd:
26846 Opcode = X86ISD::ENQCMD;
26847 break;
26848 case Intrinsic::x86_enqcmds:
26849 Opcode = X86ISD::ENQCMDS;
26850 break;
26851 }
26852 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
26853 Op.getOperand(3));
26854 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
26855 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26856 Operation.getValue(1));
26857 }
26858 case Intrinsic::x86_aesenc128kl:
26859 case Intrinsic::x86_aesdec128kl:
26860 case Intrinsic::x86_aesenc256kl:
26861 case Intrinsic::x86_aesdec256kl: {
26862 SDLoc DL(Op);
26863 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
26864 SDValue Chain = Op.getOperand(0);
26865 unsigned Opcode;
26866
26867 switch (IntNo) {
26868 default: llvm_unreachable("Impossible intrinsic");
26869 case Intrinsic::x86_aesenc128kl:
26870 Opcode = X86ISD::AESENC128KL;
26871 break;
26872 case Intrinsic::x86_aesdec128kl:
26873 Opcode = X86ISD::AESDEC128KL;
26874 break;
26875 case Intrinsic::x86_aesenc256kl:
26876 Opcode = X86ISD::AESENC256KL;
26877 break;
26878 case Intrinsic::x86_aesdec256kl:
26879 Opcode = X86ISD::AESDEC256KL;
26880 break;
26881 }
26882
26883 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26884 MachineMemOperand *MMO = MemIntr->getMemOperand();
26885 EVT MemVT = MemIntr->getMemoryVT();
26886 SDValue Operation = DAG.getMemIntrinsicNode(
26887 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
26888 MMO);
26889 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
26890
26891 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26892 {ZF, Operation.getValue(0), Operation.getValue(2)});
26893 }
26894 case Intrinsic::x86_aesencwide128kl:
26895 case Intrinsic::x86_aesdecwide128kl:
26896 case Intrinsic::x86_aesencwide256kl:
26897 case Intrinsic::x86_aesdecwide256kl: {
26898 SDLoc DL(Op);
26899 SDVTList VTs = DAG.getVTList(
26900 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
26901 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
26902 SDValue Chain = Op.getOperand(0);
26903 unsigned Opcode;
26904
26905 switch (IntNo) {
26906 default: llvm_unreachable("Impossible intrinsic");
26907 case Intrinsic::x86_aesencwide128kl:
26908 Opcode = X86ISD::AESENCWIDE128KL;
26909 break;
26910 case Intrinsic::x86_aesdecwide128kl:
26911 Opcode = X86ISD::AESDECWIDE128KL;
26912 break;
26913 case Intrinsic::x86_aesencwide256kl:
26914 Opcode = X86ISD::AESENCWIDE256KL;
26915 break;
26916 case Intrinsic::x86_aesdecwide256kl:
26917 Opcode = X86ISD::AESDECWIDE256KL;
26918 break;
26919 }
26920
26921 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
26922 MachineMemOperand *MMO = MemIntr->getMemOperand();
26923 EVT MemVT = MemIntr->getMemoryVT();
26924 SDValue Operation = DAG.getMemIntrinsicNode(
26925 Opcode, DL, VTs,
26926 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
26927 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
26928 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
26929 MemVT, MMO);
26930 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
26931
26932 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
26933 {ZF, Operation.getValue(1), Operation.getValue(2),
26934 Operation.getValue(3), Operation.getValue(4),
26935 Operation.getValue(5), Operation.getValue(6),
26936 Operation.getValue(7), Operation.getValue(8),
26937 Operation.getValue(9)});
26938 }
26939 case Intrinsic::x86_testui: {
26940 SDLoc dl(Op);
26941 SDValue Chain = Op.getOperand(0);
26942 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
26943 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
26944 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
26945 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
26946 Operation.getValue(1));
26947 }
26948 case Intrinsic::x86_atomic_bts_rm:
26949 case Intrinsic::x86_atomic_btc_rm:
26950 case Intrinsic::x86_atomic_btr_rm: {
26951 SDLoc DL(Op);
26952 MVT VT = Op.getSimpleValueType();
26953 SDValue Chain = Op.getOperand(0);
26954 SDValue Op1 = Op.getOperand(2);
26955 SDValue Op2 = Op.getOperand(3);
26956 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
26957 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
26958 : X86ISD::LBTR_RM;
26959 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26960 SDValue Res =
26961 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26962 {Chain, Op1, Op2}, VT, MMO);
26963 Chain = Res.getValue(1);
26964 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26965 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26966 }
26967 case Intrinsic::x86_atomic_bts:
26968 case Intrinsic::x86_atomic_btc:
26969 case Intrinsic::x86_atomic_btr: {
26970 SDLoc DL(Op);
26971 MVT VT = Op.getSimpleValueType();
26972 SDValue Chain = Op.getOperand(0);
26973 SDValue Op1 = Op.getOperand(2);
26974 SDValue Op2 = Op.getOperand(3);
26975 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
26976 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
26977 : X86ISD::LBTR;
26978 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
26979 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
26980 SDValue Res =
26981 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
26982 {Chain, Op1, Op2, Size}, VT, MMO);
26983 Chain = Res.getValue(1);
26984 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
26985 unsigned Imm = Op2->getAsZExtVal();
26986 if (Imm)
26987 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
26988 DAG.getShiftAmountConstant(Imm, VT, DL));
26989 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
26990 }
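// Editor's illustration (not part of the upstream source; reconstruct() is a
// hypothetical helper): LBTS/LBTC/LBTR report the previous value of the
// tested bit in CF, so the code above zero-extends the setcc and shifts it
// back to the original bit position when the immediate index is nonzero:
//   uint32_t reconstruct(bool cf, unsigned bitIdx) {
//     return uint32_t(cf) << bitIdx; // old bit value, back in place
//   }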
26991 case Intrinsic::x86_cmpccxadd32:
26992 case Intrinsic::x86_cmpccxadd64: {
26993 SDLoc DL(Op);
26994 SDValue Chain = Op.getOperand(0);
26995 SDValue Addr = Op.getOperand(2);
26996 SDValue Src1 = Op.getOperand(3);
26997 SDValue Src2 = Op.getOperand(4);
26998 SDValue CC = Op.getOperand(5);
26999 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27000 SDValue Operation = DAG.getMemIntrinsicNode(
27001 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
27002 MVT::i32, MMO);
27003 return Operation;
27004 }
27005 case Intrinsic::x86_aadd32:
27006 case Intrinsic::x86_aadd64:
27007 case Intrinsic::x86_aand32:
27008 case Intrinsic::x86_aand64:
27009 case Intrinsic::x86_aor32:
27010 case Intrinsic::x86_aor64:
27011 case Intrinsic::x86_axor32:
27012 case Intrinsic::x86_axor64: {
27013 SDLoc DL(Op);
27014 SDValue Chain = Op.getOperand(0);
27015 SDValue Op1 = Op.getOperand(2);
27016 SDValue Op2 = Op.getOperand(3);
27017 MVT VT = Op2.getSimpleValueType();
27018 unsigned Opc = 0;
27019 switch (IntNo) {
27020 default:
27021 llvm_unreachable("Unknown Intrinsic");
27022 case Intrinsic::x86_aadd32:
27023 case Intrinsic::x86_aadd64:
27024 Opc = X86ISD::AADD;
27025 break;
27026 case Intrinsic::x86_aand32:
27027 case Intrinsic::x86_aand64:
27028 Opc = X86ISD::AAND;
27029 break;
27030 case Intrinsic::x86_aor32:
27031 case Intrinsic::x86_aor64:
27032 Opc = X86ISD::AOR;
27033 break;
27034 case Intrinsic::x86_axor32:
27035 case Intrinsic::x86_axor64:
27036 Opc = X86ISD::AXOR;
27037 break;
27038 }
27039 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
27040 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
27041 {Chain, Op1, Op2}, VT, MMO);
27042 }
27043 case Intrinsic::x86_atomic_add_cc:
27044 case Intrinsic::x86_atomic_sub_cc:
27045 case Intrinsic::x86_atomic_or_cc:
27046 case Intrinsic::x86_atomic_and_cc:
27047 case Intrinsic::x86_atomic_xor_cc: {
27048 SDLoc DL(Op);
27049 SDValue Chain = Op.getOperand(0);
27050 SDValue Op1 = Op.getOperand(2);
27051 SDValue Op2 = Op.getOperand(3);
27052 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
27053 MVT VT = Op2.getSimpleValueType();
27054 unsigned Opc = 0;
27055 switch (IntNo) {
27056 default:
27057 llvm_unreachable("Unknown Intrinsic");
27058 case Intrinsic::x86_atomic_add_cc:
27059 Opc = X86ISD::LADD;
27060 break;
27061 case Intrinsic::x86_atomic_sub_cc:
27062 Opc = X86ISD::LSUB;
27063 break;
27064 case Intrinsic::x86_atomic_or_cc:
27065 Opc = X86ISD::LOR;
27066 break;
27067 case Intrinsic::x86_atomic_and_cc:
27068 Opc = X86ISD::LAND;
27069 break;
27070 case Intrinsic::x86_atomic_xor_cc:
27071 Opc = X86ISD::LXOR;
27072 break;
27073 }
27074 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
27075 SDValue LockArith =
27076 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
27077 {Chain, Op1, Op2}, VT, MMO);
27078 Chain = LockArith.getValue(1);
27079 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
27080 }
27081 }
27082 return SDValue();
27083 }
27084
27085 SDLoc dl(Op);
27086 switch(IntrData->Type) {
27087 default: llvm_unreachable("Unknown Intrinsic Type");
27088 case RDSEED:
27089 case RDRAND: {
27090 // Emit the node with the right value type.
27091 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
27092 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27093
27094 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
27095 // Otherwise return the value from Rand, which is always 0, casted to i32.
27096 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
27097 DAG.getConstant(1, dl, Op->getValueType(1)),
27098 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
27099 SDValue(Result.getNode(), 1)};
27100 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
27101
27102 // Return { result, isValid, chain }.
27103 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
27104 SDValue(Result.getNode(), 2));
27105 }
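// Editor's illustration (not part of the upstream source; rdrandIsValid() is
// a hypothetical helper): RDRAND/RDSEED signal success in CF and, as the
// comment above notes, leave a zero result on failure, so the CMOV can yield
// the "isValid" flag by selecting between the constant 1 and the result:
//   int rdrandIsValid(bool cf, uint64_t result) {
//     return cf ? 1 : int(result); // result is 0 whenever CF is clear
//   }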
27106 case GATHER_AVX2: {
27107 SDValue Chain = Op.getOperand(0);
27108 SDValue Src = Op.getOperand(2);
27109 SDValue Base = Op.getOperand(3);
27110 SDValue Index = Op.getOperand(4);
27111 SDValue Mask = Op.getOperand(5);
27112 SDValue Scale = Op.getOperand(6);
27113 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27114 Scale, Chain, Subtarget);
27115 }
27116 case GATHER: {
27117 //gather(v1, mask, index, base, scale);
27118 SDValue Chain = Op.getOperand(0);
27119 SDValue Src = Op.getOperand(2);
27120 SDValue Base = Op.getOperand(3);
27121 SDValue Index = Op.getOperand(4);
27122 SDValue Mask = Op.getOperand(5);
27123 SDValue Scale = Op.getOperand(6);
27124 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
27125 Chain, Subtarget);
27126 }
27127 case SCATTER: {
27128 //scatter(base, mask, index, v1, scale);
27129 SDValue Chain = Op.getOperand(0);
27130 SDValue Base = Op.getOperand(2);
27131 SDValue Mask = Op.getOperand(3);
27132 SDValue Index = Op.getOperand(4);
27133 SDValue Src = Op.getOperand(5);
27134 SDValue Scale = Op.getOperand(6);
27135 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
27136 Scale, Chain, Subtarget);
27137 }
27138 case PREFETCH: {
27139 const APInt &HintVal = Op.getConstantOperandAPInt(6);
27140 assert((HintVal == 2 || HintVal == 3) &&
27141 "Wrong prefetch hint in intrinsic: should be 2 or 3");
27142 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
27143 SDValue Chain = Op.getOperand(0);
27144 SDValue Mask = Op.getOperand(2);
27145 SDValue Index = Op.getOperand(3);
27146 SDValue Base = Op.getOperand(4);
27147 SDValue Scale = Op.getOperand(5);
27148 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
27149 Subtarget);
27150 }
27151 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
27152 case RDTSC: {
27153 SmallVector<SDValue, 2> Results;
27154 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
27155 Results);
27156 return DAG.getMergeValues(Results, dl);
27157 }
27158 // Read Performance Monitoring Counters.
27159 case RDPMC:
27160 // Read Processor Register.
27161 case RDPRU:
27162 // GetExtended Control Register.
27163 case XGETBV: {
27164 SmallVector<SDValue, 2> Results;
27165
27166 // RDPMC uses ECX to select the index of the performance counter to read.
27167 // RDPRU uses ECX to select the processor register to read.
27168 // XGETBV uses ECX to select the index of the XCR register to return.
27169 // The result is stored into registers EDX:EAX.
27170 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
27171 Subtarget, Results);
27172 return DAG.getMergeValues(Results, dl);
27173 }
27174 // XTEST intrinsics.
27175 case XTEST: {
27176 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
27177 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
27178
27179 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
27180 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
27181 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
27182 Ret, SDValue(InTrans.getNode(), 1));
27183 }
27184 case TRUNCATE_TO_MEM_VI8:
27185 case TRUNCATE_TO_MEM_VI16:
27186 case TRUNCATE_TO_MEM_VI32: {
27187 SDValue Mask = Op.getOperand(4);
27188 SDValue DataToTruncate = Op.getOperand(3);
27189 SDValue Addr = Op.getOperand(2);
27190 SDValue Chain = Op.getOperand(0);
27191
27192 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
27193 assert(MemIntr && "Expected MemIntrinsicSDNode!");
27194
27195 EVT MemVT = MemIntr->getMemoryVT();
27196
27197 uint16_t TruncationOp = IntrData->Opc0;
27198 switch (TruncationOp) {
27199 case X86ISD::VTRUNC: {
27200 if (isAllOnesConstant(Mask)) // return just a truncate store
27201 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
27202 MemIntr->getMemOperand());
27203
27204 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27205 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27206 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
27207
27208 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
27209 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
27210 true /* truncating */);
27211 }
27212 case X86ISD::VTRUNCUS:
27213 case X86ISD::VTRUNCS: {
27214 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
27215 if (isAllOnesConstant(Mask))
27216 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
27217 MemIntr->getMemOperand(), DAG);
27218
27219 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
27220 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27221
27222 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
27223 VMask, MemVT, MemIntr->getMemOperand(), DAG);
27224 }
27225 default:
27226 llvm_unreachable("Unsupported truncstore intrinsic");
27227 }
27228 }
27229 }
27230}
27231
27232SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
27233 SelectionDAG &DAG) const {
27234 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
27235 MFI.setReturnAddressIsTaken(true);
27236
27237 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
27238 return SDValue();
27239
27240 unsigned Depth = Op.getConstantOperandVal(0);
27241 SDLoc dl(Op);
27242 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27243
27244 if (Depth > 0) {
27245 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
27246 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27247 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
27248 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27249 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
27250 MachinePointerInfo());
27251 }
27252
27253 // Just load the return address.
27254 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
27255 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
27256 MachinePointerInfo());
27257}
27258
27259SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
27260 SelectionDAG &DAG) const {
27261 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
27262 return getReturnAddressFrameIndex(DAG);
27263}
27264
27265SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
27266 MachineFunction &MF = DAG.getMachineFunction();
27267 MachineFrameInfo &MFI = MF.getFrameInfo();
27268 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27269 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27270 EVT VT = Op.getValueType();
27271
27272 MFI.setFrameAddressIsTaken(true);
27273
27274 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
27275 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
27276 // is not possible to crawl up the stack without looking at the unwind codes
27277 // simultaneously.
27278 int FrameAddrIndex = FuncInfo->getFAIndex();
27279 if (!FrameAddrIndex) {
27280 // Set up a frame object for the return address.
27281 unsigned SlotSize = RegInfo->getSlotSize();
27282 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
27283 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
27284 FuncInfo->setFAIndex(FrameAddrIndex);
27285 }
27286 return DAG.getFrameIndex(FrameAddrIndex, VT);
27287 }
27288
27289 unsigned FrameReg =
27290 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
27291 SDLoc dl(Op); // FIXME probably not meaningful
27292 unsigned Depth = Op.getConstantOperandVal(0);
27293 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
27294 (FrameReg == X86::EBP && VT == MVT::i32)) &&
27295 "Invalid Frame Register!");
27296 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
27297 while (Depth--)
27298 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
27299 MachinePointerInfo());
27300 return FrameAddr;
27301}
27302
27303// FIXME? Maybe this could be a TableGen attribute on some registers and
27304// this table could be generated automatically from RegInfo.
27305 Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
27306 const MachineFunction &MF) const {
27307 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27308
27310 .Case("esp", X86::ESP)
27311 .Case("rsp", X86::RSP)
27312 .Case("ebp", X86::EBP)
27313 .Case("rbp", X86::RBP)
27314 .Case("r14", X86::R14)
27315 .Case("r15", X86::R15)
27316 .Default(0);
27317
27318 if (Reg == X86::EBP || Reg == X86::RBP) {
27319 if (!TFI.hasFP(MF))
27320 report_fatal_error("register " + StringRef(RegName) +
27321 " is allocatable: function has no frame pointer");
27322#ifndef NDEBUG
27323 else {
27324 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27325 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
27326 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
27327 "Invalid Frame Register!");
27328 }
27329#endif
27330 }
27331
27332 if (Reg)
27333 return Reg;
27334
27335 report_fatal_error("Invalid register name global variable");
27336}
27337
27338SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
27339 SelectionDAG &DAG) const {
27340 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27341 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
27342}
27343
27344 Register X86TargetLowering::getExceptionPointerRegister(
27345 const Constant *PersonalityFn) const {
27346 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
27347 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27348
27349 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
27350}
27351
27352 Register X86TargetLowering::getExceptionSelectorRegister(
27353 const Constant *PersonalityFn) const {
27354 // Funclet personalities don't use selectors (the runtime does the selection).
27355 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
27356 return X86::NoRegister;
27357 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
27358}
27359
27360 bool X86TargetLowering::needsFixedCatchObjects() const {
27361 return Subtarget.isTargetWin64();
27362}
27363
27364SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
27365 SDValue Chain = Op.getOperand(0);
27366 SDValue Offset = Op.getOperand(1);
27367 SDValue Handler = Op.getOperand(2);
27368 SDLoc dl (Op);
27369
27370 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27371 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27372 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
27373 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
27374 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
27375 "Invalid Frame Register!");
27376 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
27377 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
27378
27379 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
27380 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
27381 dl));
27382 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
27383 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
27384 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
27385
27386 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
27387 DAG.getRegister(StoreAddrReg, PtrVT));
27388}
27389
27390SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
27391 SelectionDAG &DAG) const {
27392 SDLoc DL(Op);
27393 // If the subtarget is not 64bit, we may need the global base reg
27394 // after isel expand pseudo, i.e., after CGBR pass ran.
27395 // Therefore, ask for the GlobalBaseReg now, so that the pass
27396 // inserts the code for us in case we need it.
27397 // Otherwise, we will end up in a situation where we will
27398 // reference a virtual register that is not defined!
27399 if (!Subtarget.is64Bit()) {
27400 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27401 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
27402 }
27403 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
27404 DAG.getVTList(MVT::i32, MVT::Other),
27405 Op.getOperand(0), Op.getOperand(1));
27406}
27407
27408SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
27409 SelectionDAG &DAG) const {
27410 SDLoc DL(Op);
27411 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
27412 Op.getOperand(0), Op.getOperand(1));
27413}
27414
27415SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
27416 SelectionDAG &DAG) const {
27417 SDLoc DL(Op);
27418 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
27419 Op.getOperand(0));
27420}
27421
27422 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
27423 return Op.getOperand(0);
27424}
27425
27426SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
27427 SelectionDAG &DAG) const {
27428 SDValue Root = Op.getOperand(0);
27429 SDValue Trmp = Op.getOperand(1); // trampoline
27430 SDValue FPtr = Op.getOperand(2); // nested function
27431 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
27432 SDLoc dl (Op);
27433
27434 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27435 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27436
27437 if (Subtarget.is64Bit()) {
27438 SDValue OutChains[6];
27439
27440 // Large code-model.
27441 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
27442 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
27443
27444 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
27445 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
27446
27447 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
27448
27449 // Load the pointer to the nested function into R11.
27450 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
27451 SDValue Addr = Trmp;
27452 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27453 Addr, MachinePointerInfo(TrmpAddr));
27454
27455 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27456 DAG.getConstant(2, dl, MVT::i64));
27457 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
27458 MachinePointerInfo(TrmpAddr, 2), Align(2));
27459
27460 // Load the 'nest' parameter value into R10.
27461 // R10 is specified in X86CallingConv.td
27462 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
27463 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27464 DAG.getConstant(10, dl, MVT::i64));
27465 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27466 Addr, MachinePointerInfo(TrmpAddr, 10));
27467
27468 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27469 DAG.getConstant(12, dl, MVT::i64));
27470 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
27471 MachinePointerInfo(TrmpAddr, 12), Align(2));
27472
27473 // Jump to the nested function.
27474 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
27475 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27476 DAG.getConstant(20, dl, MVT::i64));
27477 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
27478 Addr, MachinePointerInfo(TrmpAddr, 20));
27479
27480 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
27481 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
27482 DAG.getConstant(22, dl, MVT::i64));
27483 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
27484 Addr, MachinePointerInfo(TrmpAddr, 22));
27485
27486 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27487 } else {
27488 const Function *Func =
27489 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
27490 CallingConv::ID CC = Func->getCallingConv();
27491 unsigned NestReg;
27492
27493 switch (CC) {
27494 default:
27495 llvm_unreachable("Unsupported calling convention");
27496 case CallingConv::C:
27497    case CallingConv::X86_StdCall: {
27498 // Pass 'nest' parameter in ECX.
27499 // Must be kept in sync with X86CallingConv.td
27500 NestReg = X86::ECX;
27501
27502 // Check that ECX wasn't needed by an 'inreg' parameter.
27503 FunctionType *FTy = Func->getFunctionType();
27504 const AttributeList &Attrs = Func->getAttributes();
27505
27506 if (!Attrs.isEmpty() && !Func->isVarArg()) {
27507 unsigned InRegCount = 0;
27508 unsigned Idx = 0;
27509
27510 for (FunctionType::param_iterator I = FTy->param_begin(),
27511 E = FTy->param_end(); I != E; ++I, ++Idx)
27512 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
27513 const DataLayout &DL = DAG.getDataLayout();
27514 // FIXME: should only count parameters that are lowered to integers.
27515 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
27516 }
27517
27518 if (InRegCount > 2) {
27519 report_fatal_error("Nest register in use - reduce number of inreg"
27520 " parameters!");
27521 }
27522 }
27523 break;
27524 }
27525    case CallingConv::X86_FastCall:
27526    case CallingConv::X86_ThisCall:
27527 case CallingConv::Fast:
27528 case CallingConv::Tail:
27529    case CallingConv::SwiftTail:
27530 // Pass 'nest' parameter in EAX.
27531 // Must be kept in sync with X86CallingConv.td
27532 NestReg = X86::EAX;
27533 break;
27534 }
27535
27536 SDValue OutChains[4];
27537 SDValue Addr, Disp;
27538
27539 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27540 DAG.getConstant(10, dl, MVT::i32));
27541 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
27542
27543 // This is storing the opcode for MOV32ri.
27544 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
27545 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
27546 OutChains[0] =
27547 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
27548 Trmp, MachinePointerInfo(TrmpAddr));
27549
27550 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27551 DAG.getConstant(1, dl, MVT::i32));
27552 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
27553 MachinePointerInfo(TrmpAddr, 1), Align(1));
27554
27555 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
27556 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27557 DAG.getConstant(5, dl, MVT::i32));
27558 OutChains[2] =
27559 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
27560 MachinePointerInfo(TrmpAddr, 5), Align(1));
27561
27562 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
27563 DAG.getConstant(6, dl, MVT::i32));
27564 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
27565 MachinePointerInfo(TrmpAddr, 6), Align(1));
27566
27567 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
27568 }
27569}
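// Illustrative sketch of the opcode bytes assembled by the stores above; this
// is not used by the lowering, it only spells out the arithmetic, assuming the
// usual R10/R11 encodings (N86R10 == 2, N86R11 == 3).
// Resulting 64-bit (large code model) trampoline layout:
//   [0]  49 BB <FPtr:8 bytes>   movabsq $FPtr, %r11
//   [10] 49 BA <Nest:8 bytes>   movabsq $Nest, %r10
//   [20] 49 FF E3               jmpq   *%r11
static_assert((0xB8 | 3) == 0xBB, "movabsq ..., %r11 opcode byte");
static_assert((0xB8 | 2) == 0xBA, "movabsq ..., %r10 opcode byte");
static_assert((0x40 | 0x08 | 0x01) == 0x49, "REX.W + REX.B prefix");
static_assert((3 | (4 << 3) | (3 << 6)) == 0xE3, "ModRM byte for jmpq *%r11");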
27570
27571SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
27572 SelectionDAG &DAG) const {
27573 /*
27574 The rounding mode is in bits 11:10 of FPSR, and has the following
27575 settings:
27576 00 Round to nearest
27577 01 Round to -inf
27578 10 Round to +inf
27579 11 Round to 0
27580
27581 GET_ROUNDING, on the other hand, expects the following:
27582 -1 Undefined
27583 0 Round to 0
27584 1 Round to nearest
27585 2 Round to +inf
27586 3 Round to -inf
27587
27588 To perform the conversion, we use a packed lookup table of the four 2-bit
27589    values that we can index by FPSR[11:10]:
27590 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
27591
27592 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
27593 */
27594
27595  MachineFunction &MF = DAG.getMachineFunction();
27596 MVT VT = Op.getSimpleValueType();
27597 SDLoc DL(Op);
27598
27599 // Save FP Control Word to stack slot
27600 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
27601 SDValue StackSlot =
27602 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
27603
27604  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
27605
27606 SDValue Chain = Op.getOperand(0);
27607 SDValue Ops[] = {Chain, StackSlot};
27608  Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
27609                                  DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
27610                                  Align(2), MachineMemOperand::MOStore);
27611
27612 // Load FP Control Word from stack slot
27613 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
27614 Chain = CWD.getValue(1);
27615
27616 // Mask and turn the control bits into a shift for the lookup table.
27617 SDValue Shift =
27618 DAG.getNode(ISD::SRL, DL, MVT::i16,
27619 DAG.getNode(ISD::AND, DL, MVT::i16,
27620 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
27621 DAG.getConstant(9, DL, MVT::i8));
27622 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
27623
27624 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
27625 SDValue RetVal =
27626 DAG.getNode(ISD::AND, DL, MVT::i32,
27627 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
27628 DAG.getConstant(3, DL, MVT::i32));
27629
27630 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
27631
27632 return DAG.getMergeValues({RetVal, Chain}, DL);
27633}
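// Illustrative sketch of the lookup-table conversion implemented above (a
// hypothetical helper, not part of the lowering): evaluating the packed table
// for each x87 RC field value reproduces the GET_ROUNDING encoding listed in
// the comment.
constexpr unsigned getRoundingSketch(unsigned FPControlWord) {
  return (0x2d >> ((FPControlWord & 0xc00) >> 9)) & 3;
}
static_assert(getRoundingSketch(0x000) == 1, "RC=00 -> round to nearest");
static_assert(getRoundingSketch(0x400) == 3, "RC=01 -> round to -inf");
static_assert(getRoundingSketch(0x800) == 2, "RC=10 -> round to +inf");
static_assert(getRoundingSketch(0xc00) == 0, "RC=11 -> round to zero");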
27634
27635SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
27636 SelectionDAG &DAG) const {
27637  MachineFunction &MF = DAG.getMachineFunction();
27638 SDLoc DL(Op);
27639 SDValue Chain = Op.getNode()->getOperand(0);
27640
27641  // The FP control word may be set only from data in memory, so we need to
27642  // allocate stack space to save/load the FP control word.
27643 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
27644 SDValue StackSlot =
27645 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
27646  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
27647 MachineMemOperand *MMO =
27648      MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
27649
27650 // Store FP control word into memory.
27651 SDValue Ops[] = {Chain, StackSlot};
27652 Chain = DAG.getMemIntrinsicNode(
27653 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
27654
27655 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
27656 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
27657 Chain = CWD.getValue(1);
27658 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
27659 DAG.getConstant(0xf3ff, DL, MVT::i16));
27660
27661 // Calculate new rounding mode.
27662 SDValue NewRM = Op.getNode()->getOperand(1);
27663 SDValue RMBits;
27664 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
27665 uint64_t RM = CVal->getZExtValue();
27666 int FieldVal;
27667 switch (static_cast<RoundingMode>(RM)) {
27668 // clang-format off
27669 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
27670 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
27671 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
27672 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
27673 default:
27674 llvm_unreachable("rounding mode is not supported by X86 hardware");
27675 // clang-format on
27676 }
27677 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
27678 } else {
27679 // Need to convert argument into bits of control word:
27680 // 0 Round to 0 -> 11
27681 // 1 Round to nearest -> 00
27682 // 2 Round to +inf -> 10
27683 // 3 Round to -inf -> 01
27684 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
27685 // To make the conversion, put all these values into a value 0xc9 and shift
27686 // it left depending on the rounding mode:
27687 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
27688 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
27689 // ...
27690 // (0xc9 << (2 * NewRM + 4)) & 0xc00
27691 SDValue ShiftValue =
27692 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
27693 DAG.getNode(ISD::ADD, DL, MVT::i32,
27694 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
27695 DAG.getConstant(1, DL, MVT::i8)),
27696 DAG.getConstant(4, DL, MVT::i32)));
27697 SDValue Shifted =
27698 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
27699 ShiftValue);
27700 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
27701 DAG.getConstant(0xc00, DL, MVT::i16));
27702 }
27703
27704 // Update rounding mode bits and store the new FP Control Word into stack.
27705 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
27706 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
27707
27708 // Load FP control word from the slot.
27709 SDValue OpsLD[] = {Chain, StackSlot};
27710 MachineMemOperand *MMOL =
27711      MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
27712 Chain = DAG.getMemIntrinsicNode(
27713 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
27714
27715 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
27716 // same way but in bits 14:13.
27717 if (Subtarget.hasSSE1()) {
27718 // Store MXCSR into memory.
27719 Chain = DAG.getNode(
27720 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27721 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27722 StackSlot);
27723
27724 // Load MXCSR from stack slot and clear RM field (bits 14:13).
27725 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
27726 Chain = CWD.getValue(1);
27727 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
27728 DAG.getConstant(0xffff9fff, DL, MVT::i32));
27729
27730 // Shift X87 RM bits from 11:10 to 14:13.
27731 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
27732 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
27733 DAG.getConstant(3, DL, MVT::i8));
27734
27735 // Update rounding mode bits and store the new FP Control Word into stack.
27736 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
27737 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
27738
27739 // Load MXCSR from the slot.
27740 Chain = DAG.getNode(
27741 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27742 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27743 StackSlot);
27744 }
27745
27746 return Chain;
27747}
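// Illustrative sketch of the non-constant path above (a hypothetical helper,
// not part of the lowering): shifting the packed constant 0xc9 left by
// 2 * NewRM + 4 and masking with 0xc00 yields the x87 RC bits for each
// GET_ROUNDING value, as claimed in the comment.
constexpr unsigned setRoundingBitsSketch(unsigned NewRM) {
  return (0xc9u << (2 * NewRM + 4)) & 0xc00u;
}
static_assert(setRoundingBitsSketch(0) == 0xc00, "round to 0       -> RC=11");
static_assert(setRoundingBitsSketch(1) == 0x000, "round to nearest -> RC=00");
static_assert(setRoundingBitsSketch(2) == 0x800, "round to +inf    -> RC=10");
static_assert(setRoundingBitsSketch(3) == 0x400, "round to -inf    -> RC=01");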
27748
27749const unsigned X87StateSize = 28;
27750const unsigned FPStateSize = 32;
27751[[maybe_unused]] const unsigned FPStateSizeInBits = FPStateSize * 8;
27752
27753SDValue X86TargetLowering::LowerGET_FPENV_MEM(SDValue Op,
27754 SelectionDAG &DAG) const {
27755  MachineFunction &MF = DAG.getMachineFunction();
27756 SDLoc DL(Op);
27757 SDValue Chain = Op->getOperand(0);
27758 SDValue Ptr = Op->getOperand(1);
27759 auto *Node = cast<FPStateAccessSDNode>(Op);
27760 EVT MemVT = Node->getMemoryVT();
27761  assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27762 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27763
27764  // Get the x87 state, if present.
27765 if (Subtarget.hasX87()) {
27766 Chain =
27767 DAG.getMemIntrinsicNode(X86ISD::FNSTENVm, DL, DAG.getVTList(MVT::Other),
27768 {Chain, Ptr}, MemVT, MMO);
27769
27770 // FNSTENV changes the exception mask, so load back the stored environment.
27771 MachineMemOperand::Flags NewFlags =
27772        MachineMemOperand::MOLoad |
27773 (MMO->getFlags() & ~MachineMemOperand::MOStore);
27774 MMO = MF.getMachineMemOperand(MMO, NewFlags);
27775 Chain =
27776 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27777 {Chain, Ptr}, MemVT, MMO);
27778 }
27779
27780 // If target supports SSE, get MXCSR as well.
27781 if (Subtarget.hasSSE1()) {
27782 // Get pointer to the MXCSR location in memory.
27783    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27784 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27785 DAG.getConstant(X87StateSize, DL, PtrVT));
27786 // Store MXCSR into memory.
27787 Chain = DAG.getNode(
27788 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27789 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
27790 MXCSRAddr);
27791 }
27792
27793 return Chain;
27794}
27795
27796static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, const SDLoc &DL,
27797 EVT MemVT, MachineMemOperand *MMO,
27798 SelectionDAG &DAG,
27799 const X86Subtarget &Subtarget) {
27800  // Set the x87 state, if present.
27801 if (Subtarget.hasX87())
27802 Chain =
27803 DAG.getMemIntrinsicNode(X86ISD::FLDENVm, DL, DAG.getVTList(MVT::Other),
27804 {Chain, Ptr}, MemVT, MMO);
27805 // If target supports SSE, set MXCSR as well.
27806 if (Subtarget.hasSSE1()) {
27807 // Get pointer to the MXCSR location in memory.
27808    MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
27809 SDValue MXCSRAddr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
27810 DAG.getConstant(X87StateSize, DL, PtrVT));
27811 // Load MXCSR from memory.
27812 Chain = DAG.getNode(
27813 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
27814 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
27815 MXCSRAddr);
27816 }
27817 return Chain;
27818}
27819
27820SDValue X86TargetLowering::LowerSET_FPENV_MEM(SDValue Op,
27821 SelectionDAG &DAG) const {
27822 SDLoc DL(Op);
27823 SDValue Chain = Op->getOperand(0);
27824 SDValue Ptr = Op->getOperand(1);
27825 auto *Node = cast<FPStateAccessSDNode>(Op);
27826 EVT MemVT = Node->getMemoryVT();
27827  assert(MemVT.getSizeInBits() == FPStateSizeInBits);
27828 MachineMemOperand *MMO = cast<FPStateAccessSDNode>(Op)->getMemOperand();
27829 return createSetFPEnvNodes(Ptr, Chain, DL, MemVT, MMO, DAG, Subtarget);
27830}
27831
27832SDValue X86TargetLowering::LowerRESET_FPENV(SDValue Op,
27833 SelectionDAG &DAG) const {
27834  MachineFunction &MF = DAG.getMachineFunction();
27835 SDLoc DL(Op);
27836 SDValue Chain = Op.getNode()->getOperand(0);
27837
27838 IntegerType *ItemTy = Type::getInt32Ty(*DAG.getContext());
27839 ArrayType *FPEnvTy = ArrayType::get(ItemTy, 8);
27840  SmallVector<Constant *, 8> FPEnvVals;
27841
27842  // x87 FPU Control Word: mask all floating-point exceptions, set rounding to
27843  // nearest. FPU precision is set to 53 bits on Windows and 64 bits otherwise
27844  // for compatibility with glibc.
27845 unsigned X87CW = Subtarget.isTargetWindowsMSVC() ? 0x27F : 0x37F;
27846 FPEnvVals.push_back(ConstantInt::get(ItemTy, X87CW));
27847 Constant *Zero = ConstantInt::get(ItemTy, 0);
27848 for (unsigned I = 0; I < 6; ++I)
27849 FPEnvVals.push_back(Zero);
27850
27851  // MXCSR: mask all floating-point exceptions, set rounding to nearest, clear
27852  // all exceptions, and set DAZ and FTZ to 0.
27853 FPEnvVals.push_back(ConstantInt::get(ItemTy, 0x1F80));
27854 Constant *FPEnvBits = ConstantArray::get(FPEnvTy, FPEnvVals);
27855  auto PtrVT = getPointerTy(DAG.getDataLayout());
27856 SDValue Env = DAG.getConstantPool(FPEnvBits, PtrVT);
27857 MachinePointerInfo MPI =
27858      MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
27859  MachineMemOperand *MMO = MF.getMachineMemOperand(
27860      MPI, MachineMemOperand::MOLoad, X87StateSize, Align(4));
27861
27862 return createSetFPEnvNodes(Env, Chain, DL, MVT::i32, MMO, DAG, Subtarget);
27863}
27864
27865/// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
27866//
27867// i8/i16 vectors are implemented using the dword LZCNT vector instruction
27868// ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth(x)) ). In case zext32(x) is
27869// illegal, split the vector, perform the operation on its Lo and Hi parts,
27870// and concatenate the results.
27871static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
27872 const X86Subtarget &Subtarget) {
27873 assert(Op.getOpcode() == ISD::CTLZ);
27874 SDLoc dl(Op);
27875 MVT VT = Op.getSimpleValueType();
27876 MVT EltVT = VT.getVectorElementType();
27877 unsigned NumElems = VT.getVectorNumElements();
27878
27879 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
27880 "Unsupported element type");
27881
27882  // Split the vector; its Lo and Hi parts will be handled in the next iteration.
27883 if (NumElems > 16 ||
27884 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
27885 return splitVectorIntUnary(Op, DAG, dl);
27886
27887 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
27888 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
27889 "Unsupported value type for operation");
27890
27891 // Use native supported vector instruction vplzcntd.
27892 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
27893 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
27894 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
27895 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
27896
27897 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
27898}
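// Illustrative scalar model of the identity used above (hypothetical helpers,
// not part of the lowering): for an i8/i16 element,
// ctlz(x) == ctlz32(zext32(x)) - (32 - bitwidth(x)).
constexpr unsigned ctlz32Sketch(unsigned X) {
  unsigned N = 32;
  while (X) {
    X >>= 1;
    --N;
  }
  return N;
}
static_assert(ctlz32Sketch(0x01u) - 24 == 7, "i8: ctlz(0x01) == 7");
static_assert(ctlz32Sketch(0x80u) - 24 == 0, "i8: ctlz(0x80) == 0");
static_assert(ctlz32Sketch(0x0100u) - 16 == 7, "i16: ctlz(0x0100) == 7");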
27899
27900// Lower CTLZ using a PSHUFB lookup table implementation.
27901static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
27902 const X86Subtarget &Subtarget,
27903 SelectionDAG &DAG) {
27904 MVT VT = Op.getSimpleValueType();
27905 int NumElts = VT.getVectorNumElements();
27906 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
27907 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
27908
27909 // Per-nibble leading zero PSHUFB lookup table.
27910 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
27911 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
27912 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
27913 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
27914
27915  SmallVector<SDValue, 64> LUTVec;
27916 for (int i = 0; i < NumBytes; ++i)
27917 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
27918 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
27919
27920 // Begin by bitcasting the input to byte vector, then split those bytes
27921 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
27922 // If the hi input nibble is zero then we add both results together, otherwise
27923 // we just take the hi result (by masking the lo result to zero before the
27924 // add).
27925 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
27926 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
27927
27928 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
27929 SDValue Lo = Op0;
27930 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
27931 SDValue HiZ;
27932 if (CurrVT.is512BitVector()) {
27933 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27934 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
27935 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27936 } else {
27937 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
27938 }
27939
27940 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
27941 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
27942 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
27943 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
27944
27945 // Merge result back from vXi8 back to VT, working on the lo/hi halves
27946 // of the current vector width in the same way we did for the nibbles.
27947 // If the upper half of the input element is zero then add the halves'
27948 // leading zero counts together, otherwise just use the upper half's.
27949 // Double the width of the result until we are at target width.
27950 while (CurrVT != VT) {
27951 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
27952 int CurrNumElts = CurrVT.getVectorNumElements();
27953 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
27954 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
27955 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
27956
27957 // Check if the upper half of the input element is zero.
27958 if (CurrVT.is512BitVector()) {
27959 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
27960 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
27961 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27962 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
27963 } else {
27964 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
27965 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
27966 }
27967 HiZ = DAG.getBitcast(NextVT, HiZ);
27968
27969 // Move the upper/lower halves to the lower bits as we'll be extending to
27970 // NextVT. Mask the lower result to zero if HiZ is true and add the results
27971 // together.
27972 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
27973 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
27974 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
27975 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
27976 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
27977 CurrVT = NextVT;
27978 }
27979
27980 return Res;
27981}
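// Illustrative scalar model of the per-nibble LUT trick above (hypothetical
// helpers, not part of the lowering): when the high nibble is zero, add 4 to
// the low nibble's count, otherwise take the high nibble's count, mirroring
// the HiZ mask-and-add applied to the two PSHUFB results.
constexpr int CtlzNibbleLUTSketch[16] = {4, 3, 2, 2, 1, 1, 1, 1,
                                         0, 0, 0, 0, 0, 0, 0, 0};
constexpr int ctlz8Sketch(unsigned char V) {
  return (V >> 4) == 0 ? 4 + CtlzNibbleLUTSketch[V & 0xf]
                       : CtlzNibbleLUTSketch[V >> 4];
}
static_assert(ctlz8Sketch(0x01) == 7, "00000001 has 7 leading zeros");
static_assert(ctlz8Sketch(0x1f) == 3, "00011111 has 3 leading zeros");
static_assert(ctlz8Sketch(0x80) == 0, "10000000 has 0 leading zeros");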
27982
27983static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
27984 const X86Subtarget &Subtarget,
27985 SelectionDAG &DAG) {
27986 MVT VT = Op.getSimpleValueType();
27987
27988 if (Subtarget.hasCDI() &&
27989 // vXi8 vectors need to be promoted to 512-bits for vXi32.
27990 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
27991 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
27992
27993 // Decompose 256-bit ops into smaller 128-bit ops.
27994 if (VT.is256BitVector() && !Subtarget.hasInt256())
27995 return splitVectorIntUnary(Op, DAG, DL);
27996
27997 // Decompose 512-bit ops into smaller 256-bit ops.
27998 if (VT.is512BitVector() && !Subtarget.hasBWI())
27999 return splitVectorIntUnary(Op, DAG, DL);
28000
28001 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
28002 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
28003}
28004
28005static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
28006 SelectionDAG &DAG) {
28007 MVT VT = Op.getSimpleValueType();
28008 MVT OpVT = VT;
28009 unsigned NumBits = VT.getSizeInBits();
28010 SDLoc dl(Op);
28011 unsigned Opc = Op.getOpcode();
28012
28013 if (VT.isVector())
28014 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
28015
28016 Op = Op.getOperand(0);
28017 if (VT == MVT::i8) {
28018    // Zero extend to i32 since there is no i8 bsr.
28019 OpVT = MVT::i32;
28020 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
28021 }
28022
28023 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
28024 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
28025 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
28026
28027 if (Opc == ISD::CTLZ) {
28028 // If src is zero (i.e. bsr sets ZF), returns NumBits.
28029 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
28030 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28031 Op.getValue(1)};
28032 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
28033 }
28034
28035 // Finally xor with NumBits-1.
28036 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
28037 DAG.getConstant(NumBits - 1, dl, OpVT));
28038
28039 if (VT == MVT::i8)
28040 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
28041 return Op;
28042}
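// Illustrative check of the identity used above (a hypothetical helper, not
// part of the lowering): for nonzero X, ctlz(X) == (NumBits - 1) ^ bsr(X),
// where bsr returns the index of the highest set bit, so the final XOR with
// NumBits - 1 turns the BSR result into a leading-zero count.
constexpr unsigned bsr32Sketch(unsigned X) {
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}
static_assert((31u ^ bsr32Sketch(0x00008000u)) == 16u, "ctlz(0x8000) == 16");
static_assert((31u ^ bsr32Sketch(1u)) == 31u, "ctlz(1) == 31");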
28043
28044static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
28045 SelectionDAG &DAG) {
28046 MVT VT = Op.getSimpleValueType();
28047 unsigned NumBits = VT.getScalarSizeInBits();
28048 SDValue N0 = Op.getOperand(0);
28049 SDLoc dl(Op);
28050
28051 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
28052 "Only scalar CTTZ requires custom lowering");
28053
28054 // Issue a bsf (scan bits forward) which also sets EFLAGS.
28055 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
28056 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
28057
28058 // If src is known never zero we can skip the CMOV.
28059 if (DAG.isKnownNeverZero(N0))
28060 return Op;
28061
28062 // If src is zero (i.e. bsf sets ZF), returns NumBits.
28063 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
28064 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
28065 Op.getValue(1)};
28066 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
28067}
28068
28069static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
28070 const X86Subtarget &Subtarget) {
28071 MVT VT = Op.getSimpleValueType();
28072 SDLoc DL(Op);
28073
28074 if (VT == MVT::i16 || VT == MVT::i32)
28075 return lowerAddSubToHorizontalOp(Op, DL, DAG, Subtarget);
28076
28077 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28078 return splitVectorIntBinary(Op, DAG, DL);
28079
28080 assert(Op.getSimpleValueType().is256BitVector() &&
28081 Op.getSimpleValueType().isInteger() &&
28082 "Only handle AVX 256-bit vector integer operation");
28083 return splitVectorIntBinary(Op, DAG, DL);
28084}
28085
28086static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
28087 const X86Subtarget &Subtarget) {
28088 MVT VT = Op.getSimpleValueType();
28089 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
28090 unsigned Opcode = Op.getOpcode();
28091 SDLoc DL(Op);
28092
28093 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
28094 (VT.is256BitVector() && !Subtarget.hasInt256())) {
28095 assert(Op.getSimpleValueType().isInteger() &&
28096 "Only handle AVX vector integer operation");
28097 return splitVectorIntBinary(Op, DAG, DL);
28098 }
28099
28100 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
28101 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28102 EVT SetCCResultType =
28103 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28104
28105 unsigned BitWidth = VT.getScalarSizeInBits();
28106 if (Opcode == ISD::USUBSAT) {
28107 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
28108 // Handle a special-case with a bit-hack instead of cmp+select:
28109 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
28110 // If the target can use VPTERNLOG, DAGToDAG will match this as
28111 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
28112 // "broadcast" constant load.
28113      ConstantSDNode *C = isConstOrConstSplat(Y, true);
28114 if (C && C->getAPIntValue().isSignMask()) {
28115 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
28116 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
28117 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
28118 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
28119 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
28120 }
28121 }
28122 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
28123 // usubsat X, Y --> (X >u Y) ? X - Y : 0
28124 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
28125 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
28126 // TODO: Move this to DAGCombiner?
28127 if (SetCCResultType == VT &&
28128 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
28129 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
28130 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
28131 }
28132 }
28133
28134 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
28135 (!VT.isVector() || VT == MVT::v2i64)) {
28136    APInt MinVal = APInt::getSignedMinValue(BitWidth);
28137    APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
28138 SDValue Zero = DAG.getConstant(0, DL, VT);
28139 SDValue Result =
28140 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
28141 DAG.getVTList(VT, SetCCResultType), X, Y);
28142 SDValue SumDiff = Result.getValue(0);
28143 SDValue Overflow = Result.getValue(1);
28144 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
28145 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
28146 SDValue SumNeg =
28147 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
28148 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
28149 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
28150 }
28151
28152 // Use default expansion.
28153 return SDValue();
28154}
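// Illustrative check of the sign-mask special case above (hypothetical
// helpers, not part of the lowering; assumes 32-bit elements):
// usubsat(X, 0x80000000) == (X ^ 0x80000000) & (X s>> 31).
constexpr unsigned sraSignSketch(unsigned X) {
  return (X & 0x80000000u) ? 0xffffffffu : 0u; // models the X s>> 31 splat
}
constexpr unsigned usubsatSignMaskSketch(unsigned X) {
  return (X ^ 0x80000000u) & sraSignSketch(X);
}
static_assert(usubsatSignMaskSketch(0x80000005u) == 5u,
              "X >= SMIN: result is X - SMIN");
static_assert(usubsatSignMaskSketch(0x7fffffffu) == 0u,
              "X < SMIN: result saturates to 0");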
28155
28156static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
28157 SelectionDAG &DAG) {
28158 MVT VT = Op.getSimpleValueType();
28159 SDLoc DL(Op);
28160
28161 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
28162 // Since X86 does not have CMOV for 8-bit integer, we don't convert
28163 // 8-bit integer abs to NEG and CMOV.
28164 SDValue N0 = Op.getOperand(0);
28165 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
28166 DAG.getConstant(0, DL, VT), N0);
28167 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
28168 SDValue(Neg.getNode(), 1)};
28169 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
28170 }
28171
28172 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
28173 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
28174 SDValue Src = Op.getOperand(0);
28175 SDValue Neg = DAG.getNegative(Src, DL, VT);
28176 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Neg, Src);
28177 }
28178
28179 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
28180 assert(VT.isInteger() &&
28181 "Only handle AVX 256-bit vector integer operation");
28182 return splitVectorIntUnary(Op, DAG, DL);
28183 }
28184
28185 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28186 return splitVectorIntUnary(Op, DAG, DL);
28187
28188 // Default to expand.
28189 return SDValue();
28190}
28191
28192static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
28193 SelectionDAG &DAG) {
28194 MVT VT = Op.getSimpleValueType();
28195 SDLoc DL(Op);
28196
28197 // For AVX1 cases, split to use legal ops.
28198 if (VT.is256BitVector() && !Subtarget.hasInt256())
28199 return splitVectorIntBinary(Op, DAG, DL);
28200
28201 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28202 return splitVectorIntBinary(Op, DAG, DL);
28203
28204 // Default to expand.
28205 return SDValue();
28206}
28207
28208static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
28209 SelectionDAG &DAG) {
28210 MVT VT = Op.getSimpleValueType();
28211 SDLoc DL(Op);
28212
28213 // For AVX1 cases, split to use legal ops.
28214 if (VT.is256BitVector() && !Subtarget.hasInt256())
28215 return splitVectorIntBinary(Op, DAG, DL);
28216
28217 if (VT == MVT::v32i16 || VT == MVT::v64i8)
28218 return splitVectorIntBinary(Op, DAG, DL);
28219
28220 // Default to expand.
28221 return SDValue();
28222}
28223
28224static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
28225 SelectionDAG &DAG) {
28226 assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) &&
28227 "Expected FMAXIMUM or FMINIMUM opcode");
28228 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28229 EVT VT = Op.getValueType();
28230 SDValue X = Op.getOperand(0);
28231 SDValue Y = Op.getOperand(1);
28232 SDLoc DL(Op);
28233 uint64_t SizeInBits = VT.getScalarSizeInBits();
28234 APInt PreferredZero = APInt::getZero(SizeInBits);
28235 APInt OppositeZero = PreferredZero;
28236 EVT IVT = VT.changeTypeToInteger();
28237 X86ISD::NodeType MinMaxOp;
28238 if (Op.getOpcode() == ISD::FMAXIMUM) {
28239 MinMaxOp = X86ISD::FMAX;
28240 OppositeZero.setSignBit();
28241 } else {
28242 PreferredZero.setSignBit();
28243 MinMaxOp = X86ISD::FMIN;
28244 }
28245 EVT SetCCType =
28246 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28247
28248 // The tables below show the expected result of Max in cases of NaN and
28249 // signed zeros.
28250 //
28251 // Y Y
28252 // Num xNaN +0 -0
28253 // --------------- ---------------
28254 // Num | Max | Y | +0 | +0 | +0 |
28255 // X --------------- X ---------------
28256 // xNaN | X | X/Y | -0 | +0 | -0 |
28257 // --------------- ---------------
28258 //
28259 // It is achieved by means of FMAX/FMIN with preliminary checks and operand
28260 // reordering.
28261 //
28262 // We check if any of operands is NaN and return NaN. Then we check if any of
28263 // operands is zero or negative zero (for fmaximum and fminimum respectively)
28264 // to ensure the correct zero is returned.
28265 auto MatchesZero = [](SDValue Op, APInt Zero) {
28266    Op = peekThroughBitcasts(Op);
28267 if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
28268 return CstOp->getValueAPF().bitcastToAPInt() == Zero;
28269 if (auto *CstOp = dyn_cast<ConstantSDNode>(Op))
28270 return CstOp->getAPIntValue() == Zero;
28271 if (Op->getOpcode() == ISD::BUILD_VECTOR ||
28272 Op->getOpcode() == ISD::SPLAT_VECTOR) {
28273 for (const SDValue &OpVal : Op->op_values()) {
28274 if (OpVal.isUndef())
28275 continue;
28276 auto *CstOp = dyn_cast<ConstantFPSDNode>(OpVal);
28277 if (!CstOp)
28278 return false;
28279 if (!CstOp->getValueAPF().isZero())
28280 continue;
28281 if (CstOp->getValueAPF().bitcastToAPInt() != Zero)
28282 return false;
28283 }
28284 return true;
28285 }
28286 return false;
28287 };
28288
28289 bool IsXNeverNaN = DAG.isKnownNeverNaN(X);
28290 bool IsYNeverNaN = DAG.isKnownNeverNaN(Y);
28291 bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath ||
28292 Op->getFlags().hasNoSignedZeros() ||
28293 DAG.isKnownNeverZeroFloat(X) ||
28294                          DAG.isKnownNeverZeroFloat(Y);
28295 SDValue NewX, NewY;
28296 if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
28297 MatchesZero(X, OppositeZero)) {
28298 // Operands are already in right order or order does not matter.
28299 NewX = X;
28300 NewY = Y;
28301 } else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
28302 NewX = Y;
28303 NewY = X;
28304 } else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
28305 (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
28306 if (IsXNeverNaN)
28307 std::swap(X, Y);
28308    // VFPCLASSS consumes a vector type, so provide a minimal one corresponding
28309    // to an xmm register.
28310 MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits);
28311    SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X);
28312 // Bits of classes:
28313 // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7]
28314 // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN
28315 SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101,
28316 DL, MVT::i32);
28317 SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm);
28318 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
28319 DAG.getConstant(0, DL, MVT::v8i1), IsNanZero,
28320 DAG.getIntPtrConstant(0, DL));
28321 SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins);
28322 NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
28323 NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
28324 return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28325 } else {
28326 SDValue IsXSigned;
28327 if (Subtarget.is64Bit() || VT != MVT::f64) {
28328 SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
28329 SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
28330 IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
28331 } else {
28332 assert(VT == MVT::f64);
28333 SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
28334 DAG.getConstantFP(0, DL, MVT::v2f64), X,
28335 DAG.getIntPtrConstant(0, DL));
28336 SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
28337 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
28338 DAG.getIntPtrConstant(1, DL));
28339 Hi = DAG.getBitcast(MVT::i32, Hi);
28340 SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
28341 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
28342 *DAG.getContext(), MVT::i32);
28343 IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
28344 }
28345 if (MinMaxOp == X86ISD::FMAX) {
28346 NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28347 NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28348 } else {
28349 NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
28350 NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
28351 }
28352 }
28353
28354 bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
28355 Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN);
28356
28357  // If we did not reorder the operands for signed-zero handling, we still need
28358  // to handle NaN, and the second operand is known not to be NaN, then put it in
28359  // the first operand so we do not need to post-process NaN after the max/min.
28360 if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY))
28361 std::swap(NewX, NewY);
28362
28363 SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
28364
28365 if (IgnoreNaN || DAG.isKnownNeverNaN(NewX))
28366 return MinMax;
28367
28368 SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO);
28369 return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);
28370}
28371
28372static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
28373 SelectionDAG &DAG) {
28374 MVT VT = Op.getSimpleValueType();
28375 SDLoc dl(Op);
28376
28377 // For AVX1 cases, split to use legal ops.
28378 if (VT.is256BitVector() && !Subtarget.hasInt256())
28379 return splitVectorIntBinary(Op, DAG, dl);
28380
28381 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
28382 return splitVectorIntBinary(Op, DAG, dl);
28383
28384 bool IsSigned = Op.getOpcode() == ISD::ABDS;
28385 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28386
28387 // TODO: Move to TargetLowering expandABD() once we have ABD promotion.
28388 if (VT.isScalarInteger()) {
28389 unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u);
28390 MVT WideVT = MVT::getIntegerVT(WideBits);
28391 if (TLI.isTypeLegal(WideVT)) {
28392 // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs))))
28393 // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs))))
28394 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28395 SDValue LHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(0));
28396 SDValue RHS = DAG.getNode(ExtOpc, dl, WideVT, Op.getOperand(1));
28397 SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS);
28398 SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff);
28399 return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff);
28400 }
28401 }
28402
28403 // TODO: Move to TargetLowering expandABD().
28404 if (!Subtarget.hasSSE41() &&
28405 ((IsSigned && VT == MVT::v16i8) || VT == MVT::v4i32)) {
28406 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
28407 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
28408    ISD::CondCode CC = IsSigned ? ISD::SETGT : ISD::SETUGT;
28409 SDValue Cmp = DAG.getSetCC(dl, VT, LHS, RHS, CC);
28410 SDValue Diff0 = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
28411 SDValue Diff1 = DAG.getNode(ISD::SUB, dl, VT, RHS, LHS);
28412 return getBitSelect(dl, VT, Diff0, Diff1, Cmp, DAG);
28413 }
28414
28415 // Default to expand.
28416 return SDValue();
28417}
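// Illustrative check of the widening expansion above for i8 (a hypothetical
// helper, not part of the lowering): abds(a, b) == trunc(abs(sext(a) - sext(b))).
constexpr unsigned char absdSignedI8Sketch(signed char A, signed char B) {
  int Diff = static_cast<int>(A) - static_cast<int>(B); // sext to i32 and sub
  return static_cast<unsigned char>(Diff < 0 ? -Diff : Diff);
}
static_assert(absdSignedI8Sketch(-128, 127) == 255, "full-range difference");
static_assert(absdSignedI8Sketch(10, -3) == 13, "abds(10, -3) == 13");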
28418
28419static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
28420 SelectionDAG &DAG) {
28421 SDLoc dl(Op);
28422 MVT VT = Op.getSimpleValueType();
28423
28424 // Decompose 256-bit ops into 128-bit ops.
28425 if (VT.is256BitVector() && !Subtarget.hasInt256())
28426 return splitVectorIntBinary(Op, DAG, dl);
28427
28428 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28429 return splitVectorIntBinary(Op, DAG, dl);
28430
28431 SDValue A = Op.getOperand(0);
28432 SDValue B = Op.getOperand(1);
28433
28434 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
28435 // vector pairs, multiply and truncate.
28436 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
28437 unsigned NumElts = VT.getVectorNumElements();
28438
28439 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28440 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28441 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
28442 return DAG.getNode(
28443 ISD::TRUNCATE, dl, VT,
28444 DAG.getNode(ISD::MUL, dl, ExVT,
28445 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
28446 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
28447 }
28448
28449 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28450
28451    // Extract the lo/hi parts and any-extend them to i16.
28452 // We're going to mask off the low byte of each result element of the
28453 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
28454 // element.
28455 SDValue Undef = DAG.getUNDEF(VT);
28456 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
28457 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
28458
28459 SDValue BLo, BHi;
28460 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28461 // If the RHS is a constant, manually unpackl/unpackh.
28462 SmallVector<SDValue, 16> LoOps, HiOps;
28463 for (unsigned i = 0; i != NumElts; i += 16) {
28464 for (unsigned j = 0; j != 8; ++j) {
28465 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
28466 MVT::i16));
28467 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
28468 MVT::i16));
28469 }
28470 }
28471
28472 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28473 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28474 } else {
28475 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
28476 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
28477 }
28478
28479    // Multiply, mask the lower 8 bits of the lo/hi results, and pack.
28480 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
28481 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
28482 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28483 }
28484
28485 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
28486 if (VT == MVT::v4i32) {
28487 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
28488 "Should not custom lower when pmulld is available!");
28489
28490 // Extract the odd parts.
28491 static const int UnpackMask[] = { 1, -1, 3, -1 };
28492 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
28493 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
28494
28495 // Multiply the even parts.
28496 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28497 DAG.getBitcast(MVT::v2i64, A),
28498 DAG.getBitcast(MVT::v2i64, B));
28499 // Now multiply odd parts.
28500 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
28501 DAG.getBitcast(MVT::v2i64, Aodds),
28502 DAG.getBitcast(MVT::v2i64, Bodds));
28503
28504 Evens = DAG.getBitcast(VT, Evens);
28505 Odds = DAG.getBitcast(VT, Odds);
28506
28507 // Merge the two vectors back together with a shuffle. This expands into 2
28508 // shuffles.
28509 static const int ShufMask[] = { 0, 4, 2, 6 };
28510 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
28511 }
28512
28513 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
28514 "Only know how to lower V2I64/V4I64/V8I64 multiply");
28515 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
28516
28517 // Ahi = psrlqi(a, 32);
28518 // Bhi = psrlqi(b, 32);
28519 //
28520 // AloBlo = pmuludq(a, b);
28521 // AloBhi = pmuludq(a, Bhi);
28522 // AhiBlo = pmuludq(Ahi, b);
28523 //
28524 // Hi = psllqi(AloBhi + AhiBlo, 32);
28525 // return AloBlo + Hi;
28526 KnownBits AKnown = DAG.computeKnownBits(A);
28527 KnownBits BKnown = DAG.computeKnownBits(B);
28528
28529 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
28530 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
28531 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
28532
28533 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
28534 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
28535 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
28536
28537 SDValue Zero = DAG.getConstant(0, dl, VT);
28538
28539 // Only multiply lo/hi halves that aren't known to be zero.
28540 SDValue AloBlo = Zero;
28541 if (!ALoIsZero && !BLoIsZero)
28542 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
28543
28544 SDValue AloBhi = Zero;
28545 if (!ALoIsZero && !BHiIsZero) {
28546 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
28547 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
28548 }
28549
28550 SDValue AhiBlo = Zero;
28551 if (!AHiIsZero && !BLoIsZero) {
28552 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
28553 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
28554 }
28555
28556 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
28557 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
28558
28559 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
28560}
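// Illustrative check of the 32x32 decomposition above (a hypothetical helper,
// not part of the lowering): with lo/hi the 32-bit halves of each operand and
// every partial product a PMULUDQ-style 32x32->64 multiply,
//   a*b (mod 2^64) == lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32).
constexpr unsigned long long mul64ViaPMULUDQSketch(unsigned long long A,
                                                   unsigned long long B) {
  unsigned long long ALo = A & 0xffffffffu, AHi = A >> 32;
  unsigned long long BLo = B & 0xffffffffu, BHi = B >> 32;
  return ALo * BLo + ((ALo * BHi + AHi * BLo) << 32);
}
static_assert(mul64ViaPMULUDQSketch(0x123456789abcdef0ULL,
                                    0x0fedcba987654321ULL) ==
                  0x123456789abcdef0ULL * 0x0fedcba987654321ULL,
              "matches a plain 64-bit multiply");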
28561
28562static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
28563 MVT VT, bool IsSigned,
28564 const X86Subtarget &Subtarget,
28565 SelectionDAG &DAG,
28566 SDValue *Low = nullptr) {
28567 unsigned NumElts = VT.getVectorNumElements();
28568
28569  // For vXi8 we will unpack the low and high half of each 128-bit lane to widen
28570  // to a vXi16 type. Do the multiplies, shift the results and pack the half
28571  // lane results back together.
28572
28573  // We'll take different approaches for signed and unsigned.
28574  // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes and
28575  // use pmullw to calculate the full 16-bit product.
28576  // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
28577  // shift them left into the upper byte of each word. This allows us to use
28578  // pmulhw to calculate the full 16-bit product. This trick means we don't
28579  // need to sign extend the bytes to use pmullw.
28580
28581 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
28582 SDValue Zero = DAG.getConstant(0, dl, VT);
28583
28584 SDValue ALo, AHi;
28585 if (IsSigned) {
28586 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
28587 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
28588 } else {
28589 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
28590 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
28591 }
28592
28593 SDValue BLo, BHi;
28594 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
28595 // If the RHS is a constant, manually unpackl/unpackh and extend.
28596 SmallVector<SDValue, 16> LoOps, HiOps;
28597 for (unsigned i = 0; i != NumElts; i += 16) {
28598 for (unsigned j = 0; j != 8; ++j) {
28599 SDValue LoOp = B.getOperand(i + j);
28600 SDValue HiOp = B.getOperand(i + j + 8);
28601
28602 if (IsSigned) {
28603 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
28604 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
28605 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
28606 DAG.getConstant(8, dl, MVT::i16));
28607 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
28608 DAG.getConstant(8, dl, MVT::i16));
28609 } else {
28610 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
28611 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
28612 }
28613
28614 LoOps.push_back(LoOp);
28615 HiOps.push_back(HiOp);
28616 }
28617 }
28618
28619 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
28620 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
28621 } else if (IsSigned) {
28622 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
28623 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
28624 } else {
28625 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
28626 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
28627 }
28628
28629  // Multiply, lshr the upper 8 bits into the lower 8 bits of the lo/hi results
28630  // and pack back to vXi8.
28631 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
28632 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
28633 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
28634
28635 if (Low)
28636 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
28637
28638 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
28639}
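// Illustrative check of the signed trick above (a hypothetical helper, not
// part of the lowering): with both bytes placed in the upper byte of a 16-bit
// word, the high half of the signed 16x16 multiply (what pmulhw returns) is
// exactly the full 16-bit product of the two bytes, so no separate sign
// extension of the inputs is required.
constexpr int pmulhwByteSketch(int A, int B) {
  int WideA = static_cast<short>(A * 256); // byte in the upper half of a word
  int WideB = static_cast<short>(B * 256);
  // (A*256)*(B*256) == A*B*65536, so the high 16 bits equal A*B exactly.
  return (WideA * WideB) / 65536;
}
static_assert(pmulhwByteSketch(-128, 127) == -128 * 127, "full i8 range");
static_assert(pmulhwByteSketch(-5, -7) == 35, "negative times negative");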
28640
28641static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
28642 SelectionDAG &DAG) {
28643 SDLoc dl(Op);
28644 MVT VT = Op.getSimpleValueType();
28645 bool IsSigned = Op->getOpcode() == ISD::MULHS;
28646 unsigned NumElts = VT.getVectorNumElements();
28647 SDValue A = Op.getOperand(0);
28648 SDValue B = Op.getOperand(1);
28649
28650 // Decompose 256-bit ops into 128-bit ops.
28651 if (VT.is256BitVector() && !Subtarget.hasInt256())
28652 return splitVectorIntBinary(Op, DAG, dl);
28653
28654 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
28655 return splitVectorIntBinary(Op, DAG, dl);
28656
28657 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
28658 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
28659 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
28660 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
28661
28662    // PMULxD operations multiply each even value (starting at 0) of LHS with
28663    // the related value of RHS and produce a widened result.
28664    // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28665    // => <2 x i64> <ae|cg>
28666    //
28667    // In other words, to have all the results, we need to perform two PMULxD:
28668    // 1. one with the even values.
28669    // 2. one with the odd values.
28670    // To achieve #2, we need to place the odd values at an even position.
28671 //
28672 // Place the odd value at an even position (basically, shift all values 1
28673 // step to the left):
28674 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
28675 9, -1, 11, -1, 13, -1, 15, -1};
28676 // <a|b|c|d> => <b|undef|d|undef>
28677 SDValue Odd0 =
28678 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
28679 // <e|f|g|h> => <f|undef|h|undef>
28680 SDValue Odd1 =
28681 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
28682
28683 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
28684 // ints.
28685 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
28686 unsigned Opcode =
28687 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
28688 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
28689 // => <2 x i64> <ae|cg>
28690 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28691 DAG.getBitcast(MulVT, A),
28692 DAG.getBitcast(MulVT, B)));
28693 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
28694 // => <2 x i64> <bf|dh>
28695 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
28696 DAG.getBitcast(MulVT, Odd0),
28697 DAG.getBitcast(MulVT, Odd1)));
28698
28699 // Shuffle it back into the right order.
28700 SmallVector<int, 16> ShufMask(NumElts);
28701 for (int i = 0; i != (int)NumElts; ++i)
28702 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
28703
28704 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
28705
28706    // If we have a signed multiply but no PMULDQ, fix up the result of an
28707    // unsigned multiply.
28708 if (IsSigned && !Subtarget.hasSSE41()) {
28709 SDValue Zero = DAG.getConstant(0, dl, VT);
28710 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
28711 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
28712 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
28713 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
28714
28715 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
28716 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
28717 }
28718
28719 return Res;
28720 }
28721
28722 // Only i8 vectors should need custom lowering after this.
28723 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
28724 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
28725 "Unsupported vector type");
28726
28727 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
28728 // logical shift down the upper half and pack back to i8.
28729
28730 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
28731 // and then ashr/lshr the upper bits down to the lower bits before multiply.
28732
28733 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28734 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28735 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28736 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28737 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28738 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28739 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28740 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28741 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28742 }
28743
28744 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
28745}
28746
28747// Custom lowering for SMULO/UMULO.
28748static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
28749 SelectionDAG &DAG) {
28750 MVT VT = Op.getSimpleValueType();
28751
28752 // Scalars defer to LowerXALUO.
28753 if (!VT.isVector())
28754 return LowerXALUO(Op, DAG);
28755
28756 SDLoc dl(Op);
28757 bool IsSigned = Op->getOpcode() == ISD::SMULO;
28758 SDValue A = Op.getOperand(0);
28759 SDValue B = Op.getOperand(1);
28760 EVT OvfVT = Op->getValueType(1);
28761
28762 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
28763 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
28764 // Extract the LHS Lo/Hi vectors
28765 SDValue LHSLo, LHSHi;
28766 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
28767
28768 // Extract the RHS Lo/Hi vectors
28769 SDValue RHSLo, RHSHi;
28770 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
28771
28772 EVT LoOvfVT, HiOvfVT;
28773 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
28774 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
28775 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
28776
28777 // Issue the split operations.
28778 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
28779 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
28780
28781 // Join the separate data results and the overflow results.
28782 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
28783 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
28784 Hi.getValue(1));
28785
28786 return DAG.getMergeValues({Res, Ovf}, dl);
28787 }
28788
28789 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28790 EVT SetccVT =
28791 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
28792
28793 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
28794 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
28795 unsigned NumElts = VT.getVectorNumElements();
28796 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
28797 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
28798 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
28799 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
28800 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
28801
28802 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
28803
28804 SDValue Ovf;
28805 if (IsSigned) {
28806 SDValue High, LowSign;
28807 if (OvfVT.getVectorElementType() == MVT::i1 &&
28808 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28809        // Rather than truncating, try to do the compare on vXi16 or vXi32.
28810 // Shift the high down filling with sign bits.
28811 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
28812 // Fill all 16 bits with the sign bit from the low.
28813 LowSign =
28814 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
28815 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
28816 15, DAG);
28817 SetccVT = OvfVT;
28818 if (!Subtarget.hasBWI()) {
28819 // We can't do a vXi16 compare so sign extend to v16i32.
28820 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
28821 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
28822 }
28823 } else {
28824 // Otherwise do the compare at vXi8.
28825 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28826 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28827 LowSign =
28828 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28829 }
28830
28831 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28832 } else {
28833 SDValue High =
28834 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
28835 if (OvfVT.getVectorElementType() == MVT::i1 &&
28836 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
28837        // Rather than truncating, try to do the compare on vXi16 or vXi32.
28838 SetccVT = OvfVT;
28839 if (!Subtarget.hasBWI()) {
28840 // We can't do a vXi16 compare so sign extend to v16i32.
28841 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
28842 }
28843 } else {
28844 // Otherwise do the compare at vXi8.
28845 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
28846 }
28847
28848 Ovf =
28849 DAG.getSetCC(dl, SetccVT, High,
28850 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
28851 }
28852
28853 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28854
28855 return DAG.getMergeValues({Low, Ovf}, dl);
28856 }
28857
28858 SDValue Low;
28859 SDValue High =
28860 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
28861
28862 SDValue Ovf;
28863 if (IsSigned) {
28864 // SMULO overflows if the high bits don't match the sign of the low.
28865 SDValue LowSign =
28866 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
28867 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
28868 } else {
28869 // UMULO overflows if the high bits are non-zero.
28870 Ovf =
28871 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
28872 }
28873
28874 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
28875
28876 return DAG.getMergeValues({Low, Ovf}, dl);
28877}
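// Illustrative check of the vXi8 overflow tests above (a hypothetical helper,
// not part of the lowering): a signed i8 multiply overflows exactly when the
// high byte of the 16-bit product differs from the sign extension of the low
// byte (the unsigned case overflows when the high byte is nonzero).
constexpr bool smulI8OverflowsSketch(int A, int B) {
  unsigned Prod = static_cast<unsigned short>(A * B); // 16-bit product bits
  unsigned High = Prod >> 8;                          // high byte
  unsigned LowSign = (Prod & 0x80) ? 0xffu : 0u;      // sign-extended low byte
  return High != LowSign;
}
static_assert(!smulI8OverflowsSketch(-8, 16), "-128 still fits in i8");
static_assert(smulI8OverflowsSketch(-8, 17), "-136 does not fit in i8");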
28878
28879SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
28880 assert(Subtarget.isTargetWin64() && "Unexpected target");
28881 EVT VT = Op.getValueType();
28882 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28883 "Unexpected return type for lowering");
28884
28885 if (isa<ConstantSDNode>(Op->getOperand(1))) {
28886    SmallVector<SDValue> Result;
28887 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
28888 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
28889 }
28890
28891 RTLIB::Libcall LC;
28892 bool isSigned;
28893 switch (Op->getOpcode()) {
28894 // clang-format off
28895 default: llvm_unreachable("Unexpected request for libcall!");
28896 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
28897 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
28898 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
28899 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
28900 // clang-format on
28901 }
28902
28903 SDLoc dl(Op);
28904 SDValue InChain = DAG.getEntryNode();
28905
28906  TargetLowering::ArgListTy Args;
28907  TargetLowering::ArgListEntry Entry;
28908  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
28909 EVT ArgVT = Op->getOperand(i).getValueType();
28910 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28911 "Unexpected argument type for lowering");
28912 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
28913 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
28914 MachinePointerInfo MPI =
28915        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
28916    Entry.Node = StackPtr;
28917 InChain =
28918 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
28919 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
28920    Entry.Ty = PointerType::get(ArgTy, 0);
28921 Entry.IsSExt = false;
28922 Entry.IsZExt = false;
28923 Args.push_back(Entry);
28924 }
28925
28926  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
28927                                         getPointerTy(DAG.getDataLayout()));
28928
28929  TargetLowering::CallLoweringInfo CLI(DAG);
28930  CLI.setDebugLoc(dl)
28931 .setChain(InChain)
28932 .setLibCallee(
28933          getLibcallCallingConv(LC),
28934          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
28935 std::move(Args))
28936 .setInRegister()
28937 .setSExtResult(isSigned)
28938 .setZExtResult(!isSigned);
28939
28940 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
28941 return DAG.getBitcast(VT, CallInfo.first);
28942}
28943
28944SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
28945 SelectionDAG &DAG,
28946 SDValue &Chain) const {
28947 assert(Subtarget.isTargetWin64() && "Unexpected target");
28948 EVT VT = Op.getValueType();
28949 bool IsStrict = Op->isStrictFPOpcode();
28950
28951 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28952 EVT ArgVT = Arg.getValueType();
28953
28954 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
28955 "Unexpected return type for lowering");
28956
28957 RTLIB::Libcall LC;
28958 if (Op->getOpcode() == ISD::FP_TO_SINT ||
28959 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
28960 LC = RTLIB::getFPTOSINT(ArgVT, VT);
28961 else
28962 LC = RTLIB::getFPTOUINT(ArgVT, VT);
28963 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28964
28965 SDLoc dl(Op);
28966 MakeLibCallOptions CallOptions;
28967 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
28968
28969  SDValue Result;
28970  // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
28971 // expected VT (i128).
28972 std::tie(Result, Chain) =
28973 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
28974 Result = DAG.getBitcast(VT, Result);
28975 return Result;
28976}
28977
28978SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
28979 SelectionDAG &DAG) const {
28980 assert(Subtarget.isTargetWin64() && "Unexpected target");
28981 EVT VT = Op.getValueType();
28982 bool IsStrict = Op->isStrictFPOpcode();
28983
28984 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
28985 EVT ArgVT = Arg.getValueType();
28986
28987 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
28988 "Unexpected argument type for lowering");
28989
28990 RTLIB::Libcall LC;
28991 if (Op->getOpcode() == ISD::SINT_TO_FP ||
28992 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
28993 LC = RTLIB::getSINTTOFP(ArgVT, VT);
28994 else
28995 LC = RTLIB::getUINTTOFP(ArgVT, VT);
28996 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
28997
28998 SDLoc dl(Op);
28999 MakeLibCallOptions CallOptions;
29000 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
29001
29002 // Pass the i128 argument as an indirect argument on the stack.
29003 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
29004 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
29005 MachinePointerInfo MPI =
29006      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
29007  Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
29008
29009  SDValue Result;
29010  std::tie(Result, Chain) =
29011 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
29012 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
29013}
29014
29015// Return true if the required (according to Opcode) shift-imm form is natively
29016// supported by the Subtarget
29017static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget,
29018 unsigned Opcode) {
29019 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29020 "Unexpected shift opcode");
29021
29022 if (!VT.isSimple())
29023 return false;
29024
29025 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29026 return false;
29027
29028 if (VT.getScalarSizeInBits() < 16)
29029 return false;
29030
29031 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
29032 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
29033 return true;
29034
29035 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
29036 (VT.is256BitVector() && Subtarget.hasInt256());
29037
29038 bool AShift = LShift && (Subtarget.hasAVX512() ||
29039 (VT != MVT::v2i64 && VT != MVT::v4i64));
29040 return (Opcode == ISD::SRA) ? AShift : LShift;
29041}
29042
29043// The shift amount is a variable, but it is the same for all vector lanes.
29044// These instructions are defined together with shift-immediate.
29045static
29046 bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget,
29047                                       unsigned Opcode) {
29048 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
29049}
29050
29051// Return true if the required (according to Opcode) variable-shift form is
29052// natively supported by the Subtarget
29053static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget,
29054 unsigned Opcode) {
29055 assert((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
29056 "Unexpected shift opcode");
29057
29058 if (!VT.isSimple())
29059 return false;
29060
29061 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
29062 return false;
29063
29064 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
29065 return false;
29066
29067 // vXi16 supported only on AVX-512, BWI
29068 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
29069 return false;
29070
29071 if (Subtarget.hasAVX512() &&
29072 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
29073 return true;
29074
29075 bool LShift = VT.is128BitVector() || VT.is256BitVector();
29076 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
29077 return (Opcode == ISD::SRA) ? AShift : LShift;
29078}
29079
29080 static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
29081                                            const X86Subtarget &Subtarget) {
29082 MVT VT = Op.getSimpleValueType();
29083 SDLoc dl(Op);
29084 SDValue R = Op.getOperand(0);
29085 SDValue Amt = Op.getOperand(1);
29086 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
29087 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29088
29089 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
29090 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
29091 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
29092 SDValue Ex = DAG.getBitcast(ExVT, R);
29093
29094 // ashr(R, 63) === cmp_slt(R, 0)
29095 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
29096 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
29097 "Unsupported PCMPGT op");
29098 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
29099 }
29100
29101 if (ShiftAmt >= 32) {
29102 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
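      // The shuffles below then pick, for each i64 lane, the shifted high dword
      // as the new low dword and the splatted sign as the new high dword.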
29103 SDValue Upper =
29104 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
29105      SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29106                                                 ShiftAmt - 32, DAG);
29107 if (VT == MVT::v2i64)
29108 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
29109 if (VT == MVT::v4i64)
29110 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29111 {9, 1, 11, 3, 13, 5, 15, 7});
29112 } else {
29113 // SRA upper i32, SRL whole i64 and select lower i32.
29114      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
29115                                                 ShiftAmt, DAG);
29116 SDValue Lower =
29117 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
29118 Lower = DAG.getBitcast(ExVT, Lower);
29119 if (VT == MVT::v2i64)
29120 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
29121 if (VT == MVT::v4i64)
29122 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
29123 {8, 1, 10, 3, 12, 5, 14, 7});
29124 }
29125 return DAG.getBitcast(VT, Ex);
29126 };
29127
29128 // Optimize shl/srl/sra with constant shift amount.
29129 APInt APIntShiftAmt;
29130 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
29131 return SDValue();
29132
29133 // If the shift amount is out of range, return undef.
29134 if (APIntShiftAmt.uge(EltSizeInBits))
29135 return DAG.getUNDEF(VT);
29136
29137 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
29138
29139 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
29140 // Hardware support for vector shifts is sparse which makes us scalarize the
29141 // vector operations in many cases. Also, on sandybridge ADD is faster than
29142 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
29143 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29144 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29145 // must be 0). (add undef, undef) however can be any value. To make this
29146 // safe, we must freeze R to ensure that register allocation uses the same
29147 // register for an undefined value. This ensures that the result will
29148 // still be even and preserves the original semantics.
29149 R = DAG.getFreeze(R);
29150 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29151 }
29152
29153 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
29154 }
29155
29156 // i64 SRA needs to be performed as partial shifts.
29157 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
29158 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
29159 Op.getOpcode() == ISD::SRA)
29160 return ArithmeticShiftRight64(ShiftAmt);
29161
29162 // If we're logical shifting an all-signbits value then we can just perform as
29163 // a mask.
29164 if ((Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL) &&
29165 DAG.ComputeNumSignBits(R) == EltSizeInBits) {
29166 SDValue Mask = DAG.getAllOnesConstant(dl, VT);
29167 Mask = DAG.getNode(Op.getOpcode(), dl, VT, Mask, Amt);
29168 return DAG.getNode(ISD::AND, dl, VT, R, Mask);
29169 }
29170
29171 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29172 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
29173 unsigned NumElts = VT.getVectorNumElements();
29174 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29175
29176 // Simple i8 add case
29177 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
29178 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
29179 // must be 0). (add undef, undef) however can be any value. To make this
29180 // safe, we must freeze R to ensure that register allocation uses the same
29181 // register for an undefined value. This ensures that the result will
29182 // still be even and preserves the original semantics.
29183 R = DAG.getFreeze(R);
29184 return DAG.getNode(ISD::ADD, dl, VT, R, R);
29185 }
29186
29187 // ashr(R, 7) === cmp_slt(R, 0)
29188 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
29189 SDValue Zeros = DAG.getConstant(0, dl, VT);
29190 if (VT.is512BitVector()) {
29191 assert(VT == MVT::v64i8 && "Unexpected element type!");
29192 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
29193 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
29194 }
29195 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
29196 }
29197
29198 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
29199 if (VT == MVT::v16i8 && Subtarget.hasXOP())
29200 return SDValue();
29201
29202 if (Op.getOpcode() == ISD::SHL) {
29203 // Make a large shift.
29204 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
29205 ShiftAmt, DAG);
29206 SHL = DAG.getBitcast(VT, SHL);
29207 // Zero out the rightmost bits.
29208 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
29209 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
29210 }
29211 if (Op.getOpcode() == ISD::SRL) {
29212 // Make a large shift.
29213 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
29214 ShiftAmt, DAG);
29215 SRL = DAG.getBitcast(VT, SRL);
29216 // Zero out the leftmost bits.
29217 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
29218 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
29219 }
29220 if (Op.getOpcode() == ISD::SRA) {
29221 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
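      // Mask = 128 >> Amt marks where the shifted-down sign bit lands; the XOR
      // flips it and the SUB propagates it upwards. E.g. for R = 0x80, Amt = 1:
      // lshr = 0x40, xor 0x40 = 0x00, sub 0x40 = 0xC0 == ashr(-128, 1) == -64.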
29222 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29223
29224 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
29225 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
29226 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
29227 return Res;
29228 }
29229 llvm_unreachable("Unknown shift opcode.");
29230 }
29231
29232 return SDValue();
29233}
29234
29235 static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
29236                                           const X86Subtarget &Subtarget) {
29237 MVT VT = Op.getSimpleValueType();
29238 SDLoc dl(Op);
29239 SDValue R = Op.getOperand(0);
29240 SDValue Amt = Op.getOperand(1);
29241 unsigned Opcode = Op.getOpcode();
29242 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
29243
29244 int BaseShAmtIdx = -1;
29245 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
29246 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
29247 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
29248 Subtarget, DAG);
29249
29250 // vXi8 shifts - shift as v8i16 + mask result.
29251 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
29252 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
29253 VT == MVT::v64i8) &&
29254 !Subtarget.hasXOP()) {
29255 unsigned NumElts = VT.getVectorNumElements();
29256 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29257 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
29258 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
29259 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
29260
29261 // Create the mask using vXi16 shifts. For shift-rights we need to move
29262 // the upper byte down before splatting the vXi8 mask.
29263 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
29264 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
29265 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
29266 if (Opcode != ISD::SHL)
29267 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
29268 8, DAG);
29269 BitMask = DAG.getBitcast(VT, BitMask);
29270 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
29271 SmallVector<int, 64>(NumElts, 0));
29272
29273 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
29274 DAG.getBitcast(ExtVT, R), BaseShAmt,
29275 BaseShAmtIdx, Subtarget, DAG);
29276 Res = DAG.getBitcast(VT, Res);
29277 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
29278
29279 if (Opcode == ISD::SRA) {
29280 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
29281 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
29282 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
29283 SignMask =
29284 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
29285 BaseShAmtIdx, Subtarget, DAG);
29286 SignMask = DAG.getBitcast(VT, SignMask);
29287 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
29288 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
29289 }
29290 return Res;
29291 }
29292 }
29293 }
29294
29295 return SDValue();
29296}
29297
29298// Convert a shift/rotate left amount to a multiplication scale factor.
29299 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29300                                        const X86Subtarget &Subtarget,
29301 SelectionDAG &DAG) {
29302 MVT VT = Amt.getSimpleValueType();
29303 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
29304 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
29305 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
29306 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
29307 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
29308 (Subtarget.hasBWI() && VT == MVT::v64i8)))
29309 return SDValue();
29310
29311 MVT SVT = VT.getVectorElementType();
29312 unsigned SVTBits = SVT.getSizeInBits();
29313 unsigned NumElems = VT.getVectorNumElements();
29314
29315 APInt UndefElts;
29316 SmallVector<APInt> EltBits;
29317 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
29318 APInt One(SVTBits, 1);
29319 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
29320 for (unsigned I = 0; I != NumElems; ++I) {
29321 if (UndefElts[I] || EltBits[I].uge(SVTBits))
29322 continue;
29323 uint64_t ShAmt = EltBits[I].getZExtValue();
29324 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
29325 }
29326 return DAG.getBuildVector(VT, dl, Elts);
29327 }
29328
29329 // If the target doesn't support variable shifts, use either FP conversion
29330 // or integer multiplication to avoid shifting each element individually.
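  // For v4i32 below, (Amt << 23) + 0x3f800000 builds the IEEE-754 bit pattern
  // of the float 2^Amt (biased exponent 127 + Amt, zero mantissa), so for
  // in-range amounts FP_TO_SINT recovers the integer scale 1 << Amt;
  // e.g. Amt = 3 yields 8.0f and thus a scale of 8.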
29331 if (VT == MVT::v4i32) {
29332 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
29333 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
29334 DAG.getConstant(0x3f800000U, dl, VT));
29335 Amt = DAG.getBitcast(MVT::v4f32, Amt);
29336 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
29337 }
29338
29339 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
29340 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
29341 SDValue Z = DAG.getConstant(0, dl, VT);
29342 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
29343 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
29344 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
29345 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
29346 if (Subtarget.hasSSE41())
29347 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29348 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
29349 }
29350
29351 return SDValue();
29352}
29353
29354static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
29355 SelectionDAG &DAG) {
29356 MVT VT = Op.getSimpleValueType();
29357 SDLoc dl(Op);
29358 SDValue R = Op.getOperand(0);
29359 SDValue Amt = Op.getOperand(1);
29360 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29361 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29362
29363 unsigned Opc = Op.getOpcode();
29364 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
29365 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
29366
29367 assert(VT.isVector() && "Custom lowering only for vector shifts!");
29368 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
29369
29370 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
29371 return V;
29372
29373 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
29374 return V;
29375
29376 if (supportedVectorVarShift(VT, Subtarget, Opc))
29377 return Op;
29378
29379 // i64 vector arithmetic shift can be emulated with the transform:
29380 // M = lshr(SIGN_MASK, Amt)
29381 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
29382 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
29383 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
29384 Opc == ISD::SRA) {
29385 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
29386 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
29387 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
29388 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
29389 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
29390 return R;
29391 }
29392
29393 // XOP has 128-bit variable logical/arithmetic shifts.
29394 // +ve/-ve Amt = shift left/right.
29395 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
29396 VT == MVT::v8i16 || VT == MVT::v16i8)) {
29397 if (Opc == ISD::SRL || Opc == ISD::SRA)
29398 Amt = DAG.getNegative(Amt, dl, VT);
29399 if (Opc == ISD::SHL || Opc == ISD::SRL)
29400 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
29401 if (Opc == ISD::SRA)
29402 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
29403 }
29404
29405  // v2i64 vector logical shifts can efficiently avoid scalarization - do the
29406 // shifts per-lane and then shuffle the partial results back together.
29407 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
29408 // Splat the shift amounts so the scalar shifts above will catch it.
29409 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
29410 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
29411 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
29412 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
29413 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
29414 }
29415
29416 // If possible, lower this shift as a sequence of two shifts by
29417 // constant plus a BLENDing shuffle instead of scalarizing it.
29418 // Example:
29419 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
29420 //
29421 // Could be rewritten as:
29422 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
29423 //
29424 // The advantage is that the two shifts from the example would be
29425 // lowered as X86ISD::VSRLI nodes in parallel before blending.
29426 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
29427 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29428 SDValue Amt1, Amt2;
29429 unsigned NumElts = VT.getVectorNumElements();
29430 SmallVector<int, 8> ShuffleMask;
29431 for (unsigned i = 0; i != NumElts; ++i) {
29432 SDValue A = Amt->getOperand(i);
29433 if (A.isUndef()) {
29434 ShuffleMask.push_back(SM_SentinelUndef);
29435 continue;
29436 }
29437 if (!Amt1 || Amt1 == A) {
29438 ShuffleMask.push_back(i);
29439 Amt1 = A;
29440 continue;
29441 }
29442 if (!Amt2 || Amt2 == A) {
29443 ShuffleMask.push_back(i + NumElts);
29444 Amt2 = A;
29445 continue;
29446 }
29447 break;
29448 }
29449
29450 // Only perform this blend if we can perform it without loading a mask.
29451 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
29452 (VT != MVT::v16i16 ||
29453 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
29454 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
29455 canWidenShuffleElements(ShuffleMask))) {
29456 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
29457 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
29458 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
29459 Cst2->getAPIntValue().ult(EltSizeInBits)) {
29460 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29461 Cst1->getZExtValue(), DAG);
29462 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
29463 Cst2->getZExtValue(), DAG);
29464 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
29465 }
29466 }
29467 }
29468
29469 // If possible, lower this packed shift into a vector multiply instead of
29470 // expanding it into a sequence of scalar shifts.
29471 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
29472 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
29473 Subtarget.canExtendTo512BW())))
29474 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
29475 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
29476
29477 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
29478 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
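  // e.g. srl(x, 3) on vXi16 == mulhu(x, 1 << 13), as (x * 2^13) >> 16 == x >> 3.
  // Amt == 0 lanes (where the scale 2^16 doesn't fit) are fixed up by the
  // select below.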
29479 if (Opc == ISD::SRL && ConstantAmt &&
29480 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
29481 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29482 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29483 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29484 SDValue Zero = DAG.getConstant(0, dl, VT);
29485 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
29486 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
29487 return DAG.getSelect(dl, VT, ZAmt, R, Res);
29488 }
29489 }
29490
29491 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
29492 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
29493 // TODO: Special case handling for shift by 0/1, really we can afford either
29494 // of these cases in pre-SSE41/XOP/AVX512 but not both.
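  // e.g. sra(x, 3) on vXi16 == mulhs(x, 1 << 13), as (x * 2^13) >>s 16 == x >>s 3.
  // The Amt == 0 and Amt == 1 lanes are selected separately below because their
  // scales (2^16 and 2^15) don't fit in / go negative as a signed i16 element.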
29495 if (Opc == ISD::SRA && ConstantAmt &&
29496 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
29497 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
29498 !Subtarget.hasAVX512()) ||
29499 DAG.isKnownNeverZero(Amt))) {
29500 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
29501 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
29502 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
29503 SDValue Amt0 =
29504 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
29505 SDValue Amt1 =
29506 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
29507 SDValue Sra1 =
29508 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
29509 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
29510 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
29511 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
29512 }
29513 }
29514
29515 // v4i32 Non Uniform Shifts.
29516 // If the shift amount is constant we can shift each lane using the SSE2
29517 // immediate shifts, else we need to zero-extend each lane to the lower i64
29518 // and shift using the SSE2 variable shifts.
29519 // The separate results can then be blended together.
29520 if (VT == MVT::v4i32) {
29521 SDValue Amt0, Amt1, Amt2, Amt3;
29522 if (ConstantAmt) {
29523 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
29524 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
29525 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
29526 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
29527 } else {
29528 // The SSE2 shifts use the lower i64 as the same shift amount for
29529 // all lanes and the upper i64 is ignored. On AVX we're better off
29530 // just zero-extending, but for SSE just duplicating the top 16-bits is
29531 // cheaper and has the same effect for out of range values.
29532 if (Subtarget.hasAVX()) {
29533 SDValue Z = DAG.getConstant(0, dl, VT);
29534 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
29535 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
29536 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
29537 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
29538 } else {
29539 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
29540 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
29541 {4, 5, 6, 7, -1, -1, -1, -1});
29542 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
29543 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
29544 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
29545 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
29546 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
29547 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
29548 }
29549 }
29550
29551 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
29552 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
29553 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
29554 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
29555 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
29556
29557 // Merge the shifted lane results optimally with/without PBLENDW.
29558 // TODO - ideally shuffle combining would handle this.
29559 if (Subtarget.hasSSE41()) {
29560 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
29561 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
29562 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
29563 }
29564 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
29565 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
29566 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
29567 }
29568
29569 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
29570 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
29571 // make the existing SSE solution better.
29572  // NOTE: We honor preferred vector width before promoting to 512-bits.
29573 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
29574 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
29575 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
29576 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
29577 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
29578 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
29579 "Unexpected vector type");
29580 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
29581 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
29582 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29583 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
29584 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
29585 return DAG.getNode(ISD::TRUNCATE, dl, VT,
29586 DAG.getNode(Opc, dl, ExtVT, R, Amt));
29587 }
29588
29589 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
29590 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
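  // Each byte is sign/zero-extended into an i16 lane (unpack with itself, then
  // shift right by 8), multiplied by 2^(8 - Amt), and the high byte of the
  // product is the shifted result x >> Amt.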
29591 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
29592 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29593 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29594 !Subtarget.hasXOP()) {
29595 int NumElts = VT.getVectorNumElements();
29596 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
29597
29598 // Extend constant shift amount to vXi16 (it doesn't matter if the type
29599 // isn't legal).
29600 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29601 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
29602 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
29603 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
29604    assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
29605           "Constant build vector expected");
29606
29607 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
29608 bool IsSigned = Opc == ISD::SRA;
29609 R = DAG.getExtOrTrunc(IsSigned, R, dl, ExVT);
29610 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
29611 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
29612 return DAG.getZExtOrTrunc(R, dl, VT);
29613 }
29614
29615 SmallVector<SDValue, 16> LoAmt, HiAmt;
29616 for (int i = 0; i != NumElts; i += 16) {
29617 for (int j = 0; j != 8; ++j) {
29618 LoAmt.push_back(Amt.getOperand(i + j));
29619 HiAmt.push_back(Amt.getOperand(i + j + 8));
29620 }
29621 }
29622
29623 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
29624 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
29625 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
29626
29627 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
29628 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
29629 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
29630 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
29631 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
29632 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
29633 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
29634 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
29635 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
29636 }
29637
29638 if (VT == MVT::v16i8 ||
29639 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
29640 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
29641 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
29642
29643 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
29644 if (VT.is512BitVector()) {
29645 // On AVX512BW targets we make use of the fact that VSELECT lowers
29646 // to a masked blend which selects bytes based just on the sign bit
29647 // extracted to a mask.
29648 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
29649 V0 = DAG.getBitcast(VT, V0);
29650 V1 = DAG.getBitcast(VT, V1);
29651 Sel = DAG.getBitcast(VT, Sel);
29652 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
29653 ISD::SETGT);
29654 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
29655 } else if (Subtarget.hasSSE41()) {
29656 // On SSE41 targets we can use PBLENDVB which selects bytes based just
29657 // on the sign bit.
29658 V0 = DAG.getBitcast(VT, V0);
29659 V1 = DAG.getBitcast(VT, V1);
29660 Sel = DAG.getBitcast(VT, Sel);
29661 return DAG.getBitcast(SelVT,
29662 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
29663 }
29664 // On pre-SSE41 targets we test for the sign bit by comparing to
29665 // zero - a negative value will set all bits of the lanes to true
29666 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
29667 SDValue Z = DAG.getConstant(0, dl, SelVT);
29668 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
29669 return DAG.getSelect(dl, SelVT, C, V0, V1);
29670 };
29671
29672 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
29673 // We can safely do this using i16 shifts as we're only interested in
29674 // the 3 lower bits of each byte.
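    // (After "a << 5", bit 2 of each 3-bit amount sits in the byte's sign bit;
    // each "a += a" below moves the next lower bit up for the following select.)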
29675 Amt = DAG.getBitcast(ExtVT, Amt);
29676 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
29677 Amt = DAG.getBitcast(VT, Amt);
29678
29679 if (Opc == ISD::SHL || Opc == ISD::SRL) {
29680 // r = VSELECT(r, shift(r, 4), a);
29681 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
29682 R = SignBitSelect(VT, Amt, M, R);
29683
29684 // a += a
29685 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29686
29687 // r = VSELECT(r, shift(r, 2), a);
29688 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
29689 R = SignBitSelect(VT, Amt, M, R);
29690
29691 // a += a
29692 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29693
29694 // return VSELECT(r, shift(r, 1), a);
29695 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
29696 R = SignBitSelect(VT, Amt, M, R);
29697 return R;
29698 }
29699
29700 if (Opc == ISD::SRA) {
29701 // For SRA we need to unpack each byte to the higher byte of a i16 vector
29702 // so we can correctly sign extend. We don't care what happens to the
29703 // lower byte.
29704 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29705 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
29706 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
29707 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
29708 ALo = DAG.getBitcast(ExtVT, ALo);
29709 AHi = DAG.getBitcast(ExtVT, AHi);
29710 RLo = DAG.getBitcast(ExtVT, RLo);
29711 RHi = DAG.getBitcast(ExtVT, RHi);
29712
29713 // r = VSELECT(r, shift(r, 4), a);
29714 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
29715 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
29716 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29717 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29718
29719 // a += a
29720 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29721 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29722
29723 // r = VSELECT(r, shift(r, 2), a);
29724 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
29725 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
29726 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29727 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29728
29729 // a += a
29730 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
29731 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
29732
29733 // r = VSELECT(r, shift(r, 1), a);
29734 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
29735 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
29736 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
29737 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
29738
29739 // Logical shift the result back to the lower byte, leaving a zero upper
29740 // byte meaning that we can safely pack with PACKUSWB.
29741 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
29742 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
29743 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
29744 }
29745 }
29746
29747 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
29748 MVT ExtVT = MVT::v8i32;
29749 SDValue Z = DAG.getConstant(0, dl, VT);
29750 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
29751 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
29752 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
29753 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
29754 ALo = DAG.getBitcast(ExtVT, ALo);
29755 AHi = DAG.getBitcast(ExtVT, AHi);
29756 RLo = DAG.getBitcast(ExtVT, RLo);
29757 RHi = DAG.getBitcast(ExtVT, RHi);
29758 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
29759 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
29760 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
29761 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
29762 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
29763 }
29764
29765 if (VT == MVT::v8i16) {
29766 // If we have a constant shift amount, the non-SSE41 path is best as
29767    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
29768 bool UseSSE41 = Subtarget.hasSSE41() &&
29769                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
29770
29771 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
29772 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
29773 // the sign bit.
29774 if (UseSSE41) {
29775 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
29776 V0 = DAG.getBitcast(ExtVT, V0);
29777 V1 = DAG.getBitcast(ExtVT, V1);
29778 Sel = DAG.getBitcast(ExtVT, Sel);
29779 return DAG.getBitcast(
29780 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
29781 }
29782 // On pre-SSE41 targets we splat the sign bit - a negative value will
29783 // set all bits of the lanes to true and VSELECT uses that in
29784 // its OR(AND(V0,C),AND(V1,~C)) lowering.
29785 SDValue C =
29786 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
29787 return DAG.getSelect(dl, VT, C, V0, V1);
29788 };
29789
29790 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
29791 if (UseSSE41) {
29792 // On SSE41 targets we need to replicate the shift mask in both
29793 // bytes for PBLENDVB.
29794 Amt = DAG.getNode(
29795 ISD::OR, dl, VT,
29796 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
29797 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
29798 } else {
29799 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
29800 }
29801
29802 // r = VSELECT(r, shift(r, 8), a);
29803 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
29804 R = SignBitSelect(Amt, M, R);
29805
29806 // a += a
29807 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29808
29809 // r = VSELECT(r, shift(r, 4), a);
29810 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
29811 R = SignBitSelect(Amt, M, R);
29812
29813 // a += a
29814 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29815
29816 // r = VSELECT(r, shift(r, 2), a);
29817 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
29818 R = SignBitSelect(Amt, M, R);
29819
29820 // a += a
29821 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
29822
29823 // return VSELECT(r, shift(r, 1), a);
29824 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
29825 R = SignBitSelect(Amt, M, R);
29826 return R;
29827 }
29828
29829 // Decompose 256-bit shifts into 128-bit shifts.
29830 if (VT.is256BitVector())
29831 return splitVectorIntBinary(Op, DAG, dl);
29832
29833 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29834 return splitVectorIntBinary(Op, DAG, dl);
29835
29836 return SDValue();
29837}
29838
29839 static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
29840                                 SelectionDAG &DAG) {
29841 MVT VT = Op.getSimpleValueType();
29842 assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
29843 "Unexpected funnel shift opcode!");
29844
29845 SDLoc DL(Op);
29846 SDValue Op0 = Op.getOperand(0);
29847 SDValue Op1 = Op.getOperand(1);
29848 SDValue Amt = Op.getOperand(2);
29849 unsigned EltSizeInBits = VT.getScalarSizeInBits();
29850 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
29851
29852 if (VT.isVector()) {
29853 APInt APIntShiftAmt;
29854 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
29855 unsigned NumElts = VT.getVectorNumElements();
29856
29857 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
29858 if (IsFSHR)
29859 std::swap(Op0, Op1);
29860
29861 if (IsCstSplat) {
29862 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29863 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
29864 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
29865 {Op0, Op1, Imm}, DAG, Subtarget);
29866 }
29867 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
29868 {Op0, Op1, Amt}, DAG, Subtarget);
29869 }
29870 assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
29871 VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
29872 VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
29873 "Unexpected funnel shift type!");
29874
29875    // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
29876    // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
29877 if (IsCstSplat) {
29878 // TODO: Can't use generic expansion as UNDEF amt elements can be
29879 // converted to other values when folded to shift amounts, losing the
29880 // splat.
29881 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
29882 uint64_t ShXAmt = IsFSHR ? (EltSizeInBits - ShiftAmt) : ShiftAmt;
29883 uint64_t ShYAmt = IsFSHR ? ShiftAmt : (EltSizeInBits - ShiftAmt);
29884 assert((ShXAmt + ShYAmt) == EltSizeInBits && "Illegal funnel shift");
29885
29886 if (EltSizeInBits == 8 && ShXAmt > 1 &&
29887 (Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT))) {
29888 // For vXi8 cases on Subtargets that can perform VPCMOV/VPTERNLOG
29889 // bit-select - lower using vXi16 shifts and then perform the bitmask at
29890 // the original vector width to handle cases where we split.
29891 MVT WideVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29892 APInt MaskX = APInt::getHighBitsSet(8, 8 - ShXAmt);
29893 APInt MaskY = APInt::getLowBitsSet(8, 8 - ShYAmt);
29894 SDValue ShX =
29895 DAG.getNode(ISD::SHL, DL, WideVT, DAG.getBitcast(WideVT, Op0),
29896 DAG.getShiftAmountConstant(ShXAmt, WideVT, DL));
29897 SDValue ShY =
29898 DAG.getNode(ISD::SRL, DL, WideVT, DAG.getBitcast(WideVT, Op1),
29899 DAG.getShiftAmountConstant(ShYAmt, WideVT, DL));
29900 ShX = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShX),
29901 DAG.getConstant(MaskX, DL, VT));
29902 ShY = DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, ShY),
29903 DAG.getConstant(MaskY, DL, VT));
29904 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29905 }
29906
29907 SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, Op0,
29908 DAG.getShiftAmountConstant(ShXAmt, VT, DL));
29909 SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Op1,
29910 DAG.getShiftAmountConstant(ShYAmt, VT, DL));
29911 return DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
29912 }
29913
29914 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
29915 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
29916 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
29917
29918 // Constant vXi16 funnel shifts can be efficiently handled by default.
29919 if (IsCst && EltSizeInBits == 16)
29920 return SDValue();
29921
29922 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
29923 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
29924 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
29925
29926 // Split 256-bit integers on XOP/pre-AVX2 targets.
29927 // Split 512-bit integers on non 512-bit BWI targets.
29928 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
29929 !Subtarget.hasAVX2())) ||
29930 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
29931 EltSizeInBits < 32)) {
29932 // Pre-mask the amount modulo using the wider vector.
29933 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
29934 return splitVectorOp(Op, DAG, DL);
29935 }
29936
29937 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
29938 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
29939 int ScalarAmtIdx = -1;
29940 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
29941 // Uniform vXi16 funnel shifts can be efficiently handled by default.
29942 if (EltSizeInBits == 16)
29943 return SDValue();
29944
29945 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29946 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29947 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
29948 ScalarAmtIdx, Subtarget, DAG);
29949 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
29950 ScalarAmtIdx, Subtarget, DAG);
29951 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29952 }
29953 }
29954
29955 MVT WideSVT = MVT::getIntegerVT(
29956 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
29957 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
29958
29959    // If per-element shifts are legal, fall back to generic expansion.
29960 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
29961 return SDValue();
29962
29963 // Attempt to fold as:
29964 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
29965 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
29966 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
29967 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
29968 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
29969 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
29970 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
29971 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
29972 EltSizeInBits, DAG);
29973 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
29974 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
29975 if (!IsFSHR)
29976 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
29977 EltSizeInBits, DAG);
29978 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
29979 }
29980
29981 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
29982 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
29983 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
29984 SDValue Z = DAG.getConstant(0, DL, VT);
29985 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
29986 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
29987 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
29988 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
29989 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
29990 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
29991 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
29992 }
29993
29994 // Fallback to generic expansion.
29995 return SDValue();
29996 }
29997 assert(
29998 (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
29999 "Unexpected funnel shift type!");
30000
30001 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
30002 bool OptForSize = DAG.shouldOptForSize();
30003 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
30004
30005 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
30006 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
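  // e.g. i8 fshl(0x12, 0x34, 4): ((0x12 << 8) | 0x34) == 0x1234, shifted left
  // by 4 gives 0x12340, and bits [15:8] == 0x23 are the fshl result.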
30007 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
30008 !isa<ConstantSDNode>(Amt)) {
30009 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
30010 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
30011 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
30012 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
30013 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
30014 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
30015 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
30016 if (IsFSHR) {
30017 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
30018 } else {
30019 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
30020 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
30021 }
30022 return DAG.getZExtOrTrunc(Res, DL, VT);
30023 }
30024
30025 if (VT == MVT::i8 || ExpandFunnel)
30026 return SDValue();
30027
30028 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
30029 if (VT == MVT::i16) {
30030 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
30031 DAG.getConstant(15, DL, Amt.getValueType()));
30032 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
30033 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
30034 }
30035
30036 return Op;
30037}
30038
30039static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
30040 SelectionDAG &DAG) {
30041 MVT VT = Op.getSimpleValueType();
30042 assert(VT.isVector() && "Custom lowering only for vector rotates!");
30043
30044 SDLoc DL(Op);
30045 SDValue R = Op.getOperand(0);
30046 SDValue Amt = Op.getOperand(1);
30047 unsigned Opcode = Op.getOpcode();
30048 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30049 int NumElts = VT.getVectorNumElements();
30050 bool IsROTL = Opcode == ISD::ROTL;
30051
30052 // Check for constant splat rotation amount.
30053 APInt CstSplatValue;
30054 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
30055
30056 // Check for splat rotate by zero.
30057 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
30058 return R;
30059
30060 // AVX512 implicitly uses modulo rotation amounts.
30061 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
30062 // Attempt to rotate by immediate.
30063 if (IsCstSplat) {
30064 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
30065 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30066 return DAG.getNode(RotOpc, DL, VT, R,
30067 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30068 }
30069
30070 // Else, fall-back on VPROLV/VPRORV.
30071 return Op;
30072 }
30073
30074 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
30075 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
30076 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30077 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30078 }
30079
30080 SDValue Z = DAG.getConstant(0, DL, VT);
30081
30082 if (!IsROTL) {
30083    // If the ISD::ROTR amount is constant, we're always better off converting to
30084 // ISD::ROTL.
30085 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
30086 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
30087
30088    // XOP targets always prefer ISD::ROTL.
30089 if (Subtarget.hasXOP())
30090 return DAG.getNode(ISD::ROTL, DL, VT, R,
30091 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
30092 }
30093
30094 // Split 256-bit integers on XOP/pre-AVX2 targets.
30095 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
30096 return splitVectorIntBinary(Op, DAG, DL);
30097
30098 // XOP has 128-bit vector variable + immediate rotates.
30099 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
30100 // XOP implicitly uses modulo rotation amounts.
30101 if (Subtarget.hasXOP()) {
30102 assert(IsROTL && "Only ROTL expected");
30103 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
30104
30105 // Attempt to rotate by immediate.
30106 if (IsCstSplat) {
30107 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30108 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
30109 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
30110 }
30111
30112 // Use general rotate by variable (per-element).
30113 return Op;
30114 }
30115
30116  // Rotate by a uniform constant - expand back to shifts.
30117 // TODO: Can't use generic expansion as UNDEF amt elements can be converted
30118 // to other values when folded to shift amounts, losing the splat.
30119 if (IsCstSplat) {
30120 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
30121 uint64_t ShlAmt = IsROTL ? RotAmt : (EltSizeInBits - RotAmt);
30122 uint64_t SrlAmt = IsROTL ? (EltSizeInBits - RotAmt) : RotAmt;
30123 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, R,
30124 DAG.getShiftAmountConstant(ShlAmt, VT, DL));
30125 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, R,
30126 DAG.getShiftAmountConstant(SrlAmt, VT, DL));
30127 return DAG.getNode(ISD::OR, DL, VT, Shl, Srl);
30128 }
30129
30130 // Split 512-bit integers on non 512-bit BWI targets.
30131 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
30132 return splitVectorIntBinary(Op, DAG, DL);
30133
30134 assert(
30135 (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
30136 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
30137 Subtarget.hasAVX2()) ||
30138 ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
30139 "Only vXi32/vXi16/vXi8 vector rotates supported");
30140
30141 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
30142 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
30143
30144 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
30145 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30146
30147 // Attempt to fold as unpack(x,x) << zext(splat(y)):
30148 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30149 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30150 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
30151 int BaseRotAmtIdx = -1;
30152 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
30153 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
30154 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
30155 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
30156 }
30157 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
30158 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30159 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30160 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
30161 BaseRotAmtIdx, Subtarget, DAG);
30162 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
30163 BaseRotAmtIdx, Subtarget, DAG);
30164 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30165 }
30166 }
30167
30168 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30169 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
30170
30171 // Attempt to fold as unpack(x,x) << zext(y):
30172 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
30173 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
30174 // Const vXi16/vXi32 are excluded in favor of MUL-based lowering.
30175 if (!(ConstantAmt && EltSizeInBits != 8) &&
30176 !supportedVectorVarShift(VT, Subtarget, ShiftOpc) &&
30177 (ConstantAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc))) {
30178 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
30179 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
30180 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
30181 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
30182 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
30183 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
30184 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
30185 }
30186
30187 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
30188 // the amount bit.
30189 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
30190 if (EltSizeInBits == 8) {
30191 MVT WideVT =
30192 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
30193
30194 // Attempt to fold as:
30195 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
30196 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
30197 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
30198 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
30199 // If we're rotating by constant, just use default promotion.
30200 if (ConstantAmt)
30201 return SDValue();
30202 // See if we can perform this by widening to vXi16 or vXi32.
30203 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
30204 R = DAG.getNode(
30205 ISD::OR, DL, WideVT, R,
30206 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
30207 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
30208 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
30209 if (IsROTL)
30210 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
30211 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
30212 }
30213
30214 // We don't need ModuloAmt here as we just peek at individual bits.
30215 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30216 if (Subtarget.hasSSE41()) {
30217 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30218 // on the sign bit.
30219 V0 = DAG.getBitcast(VT, V0);
30220 V1 = DAG.getBitcast(VT, V1);
30221 Sel = DAG.getBitcast(VT, Sel);
30222 return DAG.getBitcast(SelVT,
30223 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
30224 }
30225 // On pre-SSE41 targets we test for the sign bit by comparing to
30226 // zero - a negative value will set all bits of the lanes to true
30227 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30228 SDValue Z = DAG.getConstant(0, DL, SelVT);
30229 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
30230 return DAG.getSelect(DL, SelVT, C, V0, V1);
30231 };
30232
30233 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
30234 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
30235 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30236 IsROTL = true;
30237 }
30238
30239 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
30240 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
30241
30242 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30243 // We can safely do this using i16 shifts as we're only interested in
30244 // the 3 lower bits of each byte.
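 // After the shift, bit 2 of the 3-bit amount sits in each byte's sign bit, so
 // the first blend keys the rot-by-4 stage off it; each subsequent 'a += a'
 // moves the next lower amount bit into the sign position for the rot-by-2 and
 // rot-by-1 stages.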
30245 Amt = DAG.getBitcast(ExtVT, Amt);
30246 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
30247 Amt = DAG.getBitcast(VT, Amt);
30248
30249 // r = VSELECT(r, rot(r, 4), a);
30250 SDValue M;
30251 M = DAG.getNode(
30252 ISD::OR, DL, VT,
30253 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
30254 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
30255 R = SignBitSelect(VT, Amt, M, R);
30256
30257 // a += a
30258 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30259
30260 // r = VSELECT(r, rot(r, 2), a);
30261 M = DAG.getNode(
30262 ISD::OR, DL, VT,
30263 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
30264 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
30265 R = SignBitSelect(VT, Amt, M, R);
30266
30267 // a += a
30268 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
30269
30270 // return VSELECT(r, rot(r, 1), a);
30271 M = DAG.getNode(
30272 ISD::OR, DL, VT,
30273 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
30274 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
30275 return SignBitSelect(VT, Amt, M, R);
30276 }
30277
30278 bool IsSplatAmt = DAG.isSplatValue(Amt);
30279 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
30280 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
30281
30282 // Fallback for splats + all supported variable shifts.
30283 // Also use this fallback for non-constant amounts with AVX2 vXi16.
30284 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
30285 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30286 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
30287 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
30288 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
30289 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
30290 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
30291 }
30292
30293 // Everything below assumes ISD::ROTL.
30294 if (!IsROTL) {
30295 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
30296 IsROTL = true;
30297 }
30298
30299 // ISD::ROT* uses modulo rotate amounts.
30300 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
30301
30302 assert(IsROTL && "Only ROTL supported");
30303
30304 // As with shifts, attempt to convert the rotation amount to a multiplication
30305 // factor, fallback to general expansion.
30306 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
30307 if (!Scale)
30308 return SDValue();
30309
30310 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
30311 if (EltSizeInBits == 16) {
30312 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
30313 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
30314 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
30315 }
30316
30317 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
30318 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
30319 // that can then be OR'd with the lower 32-bits.
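 // i.e. with Scale holding (1 << r) per lane, the 64-bit product x * (1 << r)
 // has (x << r) in its low 32 bits and (x >> (32 - r)) in its high 32 bits, so
 // OR'ing the two halves yields rotl(x, r).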
30320 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
30321 static const int OddMask[] = {1, -1, 3, -1};
30322 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
30323 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
30324
30325 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30326 DAG.getBitcast(MVT::v2i64, R),
30327 DAG.getBitcast(MVT::v2i64, Scale));
30328 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
30329 DAG.getBitcast(MVT::v2i64, R13),
30330 DAG.getBitcast(MVT::v2i64, Scale13));
30331 Res02 = DAG.getBitcast(VT, Res02);
30332 Res13 = DAG.getBitcast(VT, Res13);
30333
30334 return DAG.getNode(ISD::OR, DL, VT,
30335 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
30336 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
30337}
30338
30339/// Returns true if the operand type is exactly twice the native width, and
30340/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
30341/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
30342/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
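/// For example, an i64 atomic operation on a 32-bit target can be expanded with
/// cmpxchg8b, and an i128 operation on x86-64 with cmpxchg16b when available.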
30343bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
30344 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
30345
30346 if (OpWidth == 64)
30347 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
30348 if (OpWidth == 128)
30349 return Subtarget.canUseCMPXCHG16B();
30350
30351 return false;
30352}
30353
30354TargetLoweringBase::AtomicExpansionKind
30355X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
30356 Type *MemType = SI->getValueOperand()->getType();
30357
30358 bool NoImplicitFloatOps =
30359 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30360 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30361 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30362 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30363 return AtomicExpansionKind::None;
30364
30365 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
30366 : AtomicExpansionKind::None;
30367}
30368
30369// Note: this turns large loads into lock cmpxchg8b/16b.
30370// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
30371TargetLoweringBase::AtomicExpansionKind
30372X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
30373 Type *MemType = LI->getType();
30374
30375 // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
30376 // can use movq to do the load. If we have X87 we can load into an 80-bit
30377 // X87 register and store it to a stack temporary.
30378 bool NoImplicitFloatOps =
30379 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
30380 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
30381 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
30382 (Subtarget.hasSSE1() || Subtarget.hasX87()))
30383 return AtomicExpansionKind::None;
30384
30385 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30386 : AtomicExpansionKind::None;
30387}
30388
30389enum BitTestKind : unsigned {
30390 UndefBit,
30391 ConstantBit,
30392 NotConstantBit,
30393 ShiftBit,
30394 NotShiftBit
30395};
30396
30397static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
30398 using namespace llvm::PatternMatch;
30399 BitTestKind BTK = UndefBit;
30400 auto *C = dyn_cast<ConstantInt>(V);
30401 if (C) {
30402 // Check if V is a power of 2 or the NOT of a power of 2.
30403 if (isPowerOf2_64(C->getZExtValue()))
30404 BTK = ConstantBit;
30405 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
30406 BTK = NotConstantBit;
30407 return {V, BTK};
30408 }
30409
30410 // Check if V is some power of 2 pattern known to be non-zero
30411 auto *I = dyn_cast<Instruction>(V);
30412 if (I) {
30413 bool Not = false;
30414 // Check if we have a NOT
30415 Value *PeekI;
30416 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
30417 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
30418 Not = true;
30419 I = dyn_cast<Instruction>(PeekI);
30420
30421 // If I is constant, it will fold and we can evaluate later. If it's an
30422 // argument or something of that nature, we can't analyze.
30423 if (I == nullptr)
30424 return {nullptr, UndefBit};
30425 }
30426 // We can only use 1 << X without more sophisticated analysis. C << X where
30427 // C is a power of 2 but not 1 can result in zero which cannot be translated
30428 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
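 // e.g. for i32, (2 << 31) == 0 and (4 >> 3) == 0, so neither is guaranteed to
 // be a single set bit, whereas (1 << X) always is for an in-range X.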
30429 if (I->getOpcode() == Instruction::Shl) {
30430 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
30431 // -X` and some other provable power of 2 patterns that we can use CTZ on
30432 // may be profitable.
30433 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
30434 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
30435 // be provably a non-zero power of 2.
30436 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
30437 // transformable to bittest.
30438 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
30439 if (!ShiftVal)
30440 return {nullptr, UndefBit};
30441 if (ShiftVal->equalsInt(1))
30442 BTK = Not ? NotShiftBit : ShiftBit;
30443
30444 if (BTK == UndefBit)
30445 return {nullptr, UndefBit};
30446
30447 Value *BitV = I->getOperand(1);
30448
30449 Value *AndOp;
30450 const APInt *AndC;
30451 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
30452 // Read past a shift-mask instruction to find the real count
30453 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
30454 BitV = AndOp;
30455 }
30456 return {BitV, BTK};
30457 }
30458 }
30459 return {nullptr, UndefBit};
30460}
30461
30462TargetLowering::AtomicExpansionKind
30463X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
30464 using namespace llvm::PatternMatch;
30465 // If the atomicrmw's result isn't actually used, we can just add a "lock"
30466 // prefix to a normal instruction for these operations.
30467 if (AI->use_empty())
30468 return AtomicExpansionKind::None;
30469
30470 if (AI->getOperation() == AtomicRMWInst::Xor) {
30471 // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is
30472 // preferable to both `cmpxchg` and `btc`.
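 // (XOR and ADD agree on the sign bit because adding it cannot carry into any
 // lower bit, and a carry out of the top bit is simply discarded.)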
30473 if (match(AI->getOperand(1), m_SignMask()))
30474 return AtomicExpansionKind::None;
30475 }
30476
30477 // If the atomicrmw's result is used by a single bit AND, we may use
30478 // bts/btr/btc instruction for these operations.
30479 // Note: InstCombinePass can cause a de-optimization here. It replaces the
30480 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
30481 // (depending on CC). This pattern can only use bts/btr/btc but we don't
30482 // detect it.
30483 Instruction *I = AI->user_back();
30484 auto BitChange = FindSingleBitChange(AI->getValOperand());
30485 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
30486 I->getOpcode() != Instruction::And ||
30487 AI->getType()->getPrimitiveSizeInBits() == 8 ||
30488 AI->getParent() != I->getParent())
30489 return AtomicExpansionKind::CmpXChg;
30490
30491 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
30492
30493 // This is a redundant AND, it should get cleaned up elsewhere.
30494 if (AI == I->getOperand(OtherIdx))
30495 return AtomicExpansionKind::CmpXChg;
30496
30497 // The following instruction must be an AND with a single bit.
30498 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
30499 auto *C1 = cast<ConstantInt>(AI->getValOperand());
30500 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
30501 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
30502 return AtomicExpansionKind::CmpXChg;
30503 }
30504 if (AI->getOperation() == AtomicRMWInst::And) {
30505 return ~C1->getValue() == C2->getValue()
30506 ? AtomicExpansionKind::BitTestIntrinsic
30507 : AtomicExpansionKind::CmpXChg;
30508 }
30509 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
30510 : AtomicExpansionKind::CmpXChg;
30511 }
30512
30513 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
30514
30515 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
30516 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
30517 return AtomicExpansionKind::CmpXChg;
30518
30519 assert(BitChange.first != nullptr && BitTested.first != nullptr);
30520
30521 // If shift amounts are not the same we can't use BitTestIntrinsic.
30522 if (BitChange.first != BitTested.first)
30523 return AtomicExpansionKind::CmpXChg;
30524
30525 // For an atomic AND we need to be clearing exactly one bit and testing the
30526 // one bit that is cleared in the mask.
30527 if (AI->getOperation() == AtomicRMWInst::And)
30528 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
30529 ? AtomicExpansionKind::BitTestIntrinsic
30530 : AtomicExpansionKind::CmpXChg;
30531
30532 // For an atomic XOR/OR we need to be setting and testing the same bit.
30533 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
30534 ? AtomicExpansionKind::BitTestIntrinsic
30535 : AtomicExpansionKind::CmpXChg;
30536}
30537
30538void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
30539 IRBuilder<> Builder(AI);
30540 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30541 Intrinsic::ID IID_C;
30542 Intrinsic::ID IID_I;
30543 switch (AI->getOperation()) {
30544 default:
30545 llvm_unreachable("Unknown atomic operation");
30546 case AtomicRMWInst::Or:
30547 IID_C = Intrinsic::x86_atomic_bts;
30548 IID_I = Intrinsic::x86_atomic_bts_rm;
30549 break;
30550 case AtomicRMWInst::Xor:
30551 IID_C = Intrinsic::x86_atomic_btc;
30552 IID_I = Intrinsic::x86_atomic_btc_rm;
30553 break;
30554 case AtomicRMWInst::And:
30555 IID_C = Intrinsic::x86_atomic_btr;
30556 IID_I = Intrinsic::x86_atomic_btr_rm;
30557 break;
30558 }
30559 Instruction *I = AI->user_back();
30560 LLVMContext &Ctx = AI->getContext();
30561 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30562 PointerType::getUnqual(Ctx));
30563 Function *BitTest = nullptr;
30564 Value *Result = nullptr;
30565 auto BitTested = FindSingleBitChange(AI->getValOperand());
30566 assert(BitTested.first != nullptr);
30567
30568 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
30569 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
30570
30571 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
30572
30573 unsigned Imm = llvm::countr_zero(C->getZExtValue());
30574 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
30575 } else {
30576 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
30577
30578 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
30579
30580 Value *SI = BitTested.first;
30581 assert(SI != nullptr);
30582
30583 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
30584 // need to mask it.
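 // e.g. with a register bit index of 35, the memory form of BTS would address
 // bit 3 of the *following* i32 rather than wrapping, so for an i32 atomicrmw
 // the index is first reduced to 35 & 31 == 3.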
30585 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
30586 Value *BitPos =
30587 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
30588 // Todo(1): In many cases it may be provable that SI is less than
30589 // ShiftBits in which case this mask is unnecessary
30590 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
30591 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
30592 // favor of just a raw BT{S|R|C}.
30593
30594 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
30595 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
30596
30597 // If the result is only used for zero/non-zero status then we don't need to
30598 // shift the value back. Otherwise do so.
30599 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
30600 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
30601 if (ICmp->isEquality()) {
30602 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
30603 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
30604 if (C0 || C1) {
30605 assert(C0 == nullptr || C1 == nullptr);
30606 if ((C0 ? C0 : C1)->isZero())
30607 continue;
30608 }
30609 }
30610 }
30611 Result = Builder.CreateShl(Result, BitPos);
30612 break;
30613 }
30614 }
30615
30616 I->replaceAllUsesWith(Result);
30617 I->eraseFromParent();
30618 AI->eraseFromParent();
30619}
30620
30621static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
30622 using namespace llvm::PatternMatch;
30623 if (!AI->hasOneUse())
30624 return false;
30625
30626 Value *Op = AI->getOperand(1);
30627 ICmpInst::Predicate Pred;
30628 Instruction *I = AI->user_back();
30629 AtomicRMWInst::BinOp Opc = AI->getOperation();
30630 if (Opc == AtomicRMWInst::Add) {
30631 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
30632 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30633 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
30634 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30635 return Pred == CmpInst::ICMP_SLT;
30636 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30637 return Pred == CmpInst::ICMP_SGT;
30638 }
30639 return false;
30640 }
30641 if (Opc == AtomicRMWInst::Sub) {
30642 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30643 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30644 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
30645 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30646 return Pred == CmpInst::ICMP_SLT;
30647 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30648 return Pred == CmpInst::ICMP_SGT;
30649 }
30650 return false;
30651 }
30652 if ((Opc == AtomicRMWInst::Or &&
30653 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
30654 (Opc == AtomicRMWInst::And &&
30655 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
30656 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30657 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
30658 Pred == CmpInst::ICMP_SLT;
30659 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30660 return Pred == CmpInst::ICMP_SGT;
30661 return false;
30662 }
30663 if (Opc == AtomicRMWInst::Xor) {
30664 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
30665 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
30666 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
30667 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
30668 return Pred == CmpInst::ICMP_SLT;
30669 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
30670 return Pred == CmpInst::ICMP_SGT;
30671 }
30672 return false;
30673 }
30674
30675 return false;
30676}
30677
30678void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
30679 AtomicRMWInst *AI) const {
30680 IRBuilder<> Builder(AI);
30681 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30682 Instruction *TempI = nullptr;
30683 LLVMContext &Ctx = AI->getContext();
30684 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
30685 if (!ICI) {
30686 TempI = AI->user_back();
30687 assert(TempI->hasOneUse() && "Must have one use");
30688 ICI = cast<ICmpInst>(TempI->user_back());
30689 }
30690 X86::CondCode CC = X86::COND_INVALID;
30691 ICmpInst::Predicate Pred = ICI->getPredicate();
30692 switch (Pred) {
30693 default:
30694 llvm_unreachable("Not supported Pred");
30695 case CmpInst::ICMP_EQ:
30696 CC = X86::COND_E;
30697 break;
30698 case CmpInst::ICMP_NE:
30699 CC = X86::COND_NE;
30700 break;
30701 case CmpInst::ICMP_SLT:
30702 CC = X86::COND_S;
30703 break;
30704 case CmpInst::ICMP_SGT:
30705 CC = X86::COND_NS;
30706 break;
30707 }
30708 Intrinsic::ID IID;
30709 switch (AI->getOperation()) {
30710 default:
30711 llvm_unreachable("Unknown atomic operation");
30712 case AtomicRMWInst::Add:
30713 IID = Intrinsic::x86_atomic_add_cc;
30714 break;
30715 case AtomicRMWInst::Sub:
30716 IID = Intrinsic::x86_atomic_sub_cc;
30717 break;
30718 case AtomicRMWInst::Or:
30719 IID = Intrinsic::x86_atomic_or_cc;
30720 break;
30721 case AtomicRMWInst::And:
30722 IID = Intrinsic::x86_atomic_and_cc;
30723 break;
30724 case AtomicRMWInst::Xor:
30725 IID = Intrinsic::x86_atomic_xor_cc;
30726 break;
30727 }
30728 Function *CmpArith =
30729 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
30730 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
30731 PointerType::getUnqual(Ctx));
30732 Value *Call = Builder.CreateCall(
30733 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
30734 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
30735 ICI->replaceAllUsesWith(Result);
30736 ICI->eraseFromParent();
30737 if (TempI)
30738 TempI->eraseFromParent();
30739 AI->eraseFromParent();
30740}
30741
30742TargetLowering::AtomicExpansionKind
30743X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
30744 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30745 Type *MemType = AI->getType();
30746
30747 // If the operand is too big, we must see if cmpxchg8/16b is available
30748 // and default to library calls otherwise.
30749 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
30750 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
30751 : AtomicExpansionKind::None;
30752 }
30753
30754 AtomicRMWInst::BinOp Op = AI->getOperation();
30755 switch (Op) {
30756 case AtomicRMWInst::Xchg:
30757 return AtomicExpansionKind::None;
30758 case AtomicRMWInst::Add:
30759 case AtomicRMWInst::Sub:
30760 if (shouldExpandCmpArithRMWInIR(AI))
30761 return AtomicExpansionKind::CmpArithIntrinsic;
30762 // It's better to use xadd, xsub or xchg for these in other cases.
30763 return AtomicExpansionKind::None;
30764 case AtomicRMWInst::Or:
30765 case AtomicRMWInst::And:
30766 case AtomicRMWInst::Xor:
30767 if (shouldExpandCmpArithRMWInIR(AI))
30768 return AtomicExpansionKind::CmpArithIntrinsic;
30769 return shouldExpandLogicAtomicRMWInIR(AI);
30770 case AtomicRMWInst::Nand:
30771 case AtomicRMWInst::Max:
30772 case AtomicRMWInst::Min:
30773 case AtomicRMWInst::UMax:
30774 case AtomicRMWInst::UMin:
30775 case AtomicRMWInst::FAdd:
30776 case AtomicRMWInst::FSub:
30777 case AtomicRMWInst::FMax:
30778 case AtomicRMWInst::FMin:
30779 case AtomicRMWInst::UIncWrap:
30780 case AtomicRMWInst::UDecWrap:
30781 default:
30782 // These always require a non-trivial set of data operations on x86. We must
30783 // use a cmpxchg loop.
30784 return AtomicExpansionKind::CmpXChg;
30785 }
30786}
30787
30788LoadInst *
30789X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
30790 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
30791 Type *MemType = AI->getType();
30792 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
30793 // there is no benefit in turning such RMWs into loads, and it is actually
30794 // harmful as it introduces a mfence.
30795 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
30796 return nullptr;
30797
30798 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
30799 // lowering available in lowerAtomicArith.
30800 // TODO: push more cases through this path.
30801 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
30802 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
30803 AI->use_empty())
30804 return nullptr;
30805
30806 IRBuilder<> Builder(AI);
30807 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
30808 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30809 auto SSID = AI->getSyncScopeID();
30810 // We must restrict the ordering to avoid generating loads with Release or
30811 // ReleaseAcquire orderings.
30812 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
30813
30814 // Before the load we need a fence. Here is an example lifted from
30815 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
30816 // is required:
30817 // Thread 0:
30818 // x.store(1, relaxed);
30819 // r1 = y.fetch_add(0, release);
30820 // Thread 1:
30821 // y.fetch_add(42, acquire);
30822 // r2 = x.load(relaxed);
30823 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
30824 // lowered to just a load without a fence. A mfence flushes the store buffer,
30825 // making the optimization clearly correct.
30826 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
30827 // otherwise, we might be able to be more aggressive on relaxed idempotent
30828 // rmw. In practice, they do not look useful, so we don't try to be
30829 // especially clever.
30830 if (SSID == SyncScope::SingleThread)
30831 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
30832 // the IR level, so we must wrap it in an intrinsic.
30833 return nullptr;
30834
30835 if (!Subtarget.hasMFence())
30836 // FIXME: it might make sense to use a locked operation here but on a
30837 // different cache-line to prevent cache-line bouncing. In practice it
30838 // is probably a small win, and x86 processors without mfence are rare
30839 // enough that we do not bother.
30840 return nullptr;
30841
30842 Function *MFence =
30843 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
30844 Builder.CreateCall(MFence, {});
30845
30846 // Finally we can emit the atomic load.
30847 LoadInst *Loaded = Builder.CreateAlignedLoad(
30848 AI->getType(), AI->getPointerOperand(), AI->getAlign());
30849 Loaded->setAtomic(Order, SSID);
30850 AI->replaceAllUsesWith(Loaded);
30851 AI->eraseFromParent();
30852 return Loaded;
30853}
30854
30855/// Emit a locked operation on a stack location which does not change any
30856/// memory location, but does involve a lock prefix. Location is chosen to be
30857/// a) very likely accessed only by a single thread to minimize cache traffic,
30858/// and b) definitely dereferenceable. Returns the new Chain result.
30859static SDValue emitLockedStackOp(SelectionDAG &DAG,
30860 const X86Subtarget &Subtarget, SDValue Chain,
30861 const SDLoc &DL) {
30862 // Implementation notes:
30863 // 1) LOCK prefix creates a full read/write reordering barrier for memory
30864 // operations issued by the current processor. As such, the location
30865 // referenced is not relevant for the ordering properties of the instruction.
30866 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
30867 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
30868 // 2) Using an immediate operand appears to be the best encoding choice
30869 // here since it doesn't require an extra register.
30870 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
30871 // is small enough it might just be measurement noise.)
30872 // 4) When choosing offsets, there are several contributing factors:
30873 // a) If there's no redzone, we default to TOS. (We could allocate a cache
30874 // line aligned stack object to improve this case.)
30875 // b) To minimize our chances of introducing a false dependence, we prefer
30876 // to offset the stack usage from TOS slightly.
30877 // c) To minimize concerns about cross thread stack usage - in particular,
30878 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
30879 // captures state in the TOS frame and accesses it from many threads -
30880 // we want to use an offset such that the offset is in a distinct cache
30881 // line from the TOS frame.
30882 //
30883 // For a general discussion of the tradeoffs and benchmark results, see:
30884 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
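 // The node built below is effectively 'lock or dword ptr [esp/rsp + Disp], 0',
 // with Disp = -64 when a red zone is present and 0 otherwise.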
30885
30886 auto &MF = DAG.getMachineFunction();
30887 auto &TFL = *Subtarget.getFrameLowering();
30888 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
30889
30890 if (Subtarget.is64Bit()) {
30891 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30892 SDValue Ops[] = {
30893 DAG.getRegister(X86::RSP, MVT::i64), // Base
30894 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30895 DAG.getRegister(0, MVT::i64), // Index
30896 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30897 DAG.getRegister(0, MVT::i16), // Segment.
30898 Zero,
30899 Chain};
30900 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30901 MVT::Other, Ops);
30902 return SDValue(Res, 1);
30903 }
30904
30905 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
30906 SDValue Ops[] = {
30907 DAG.getRegister(X86::ESP, MVT::i32), // Base
30908 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
30909 DAG.getRegister(0, MVT::i32), // Index
30910 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
30911 DAG.getRegister(0, MVT::i16), // Segment.
30912 Zero,
30913 Chain
30914 };
30915 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
30916 MVT::Other, Ops);
30917 return SDValue(Res, 1);
30918}
30919
30920static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
30921 SelectionDAG &DAG) {
30922 SDLoc dl(Op);
30923 AtomicOrdering FenceOrdering =
30924 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
30925 SyncScope::ID FenceSSID =
30926 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
30927
30928 // The only fence that needs an instruction is a sequentially-consistent
30929 // cross-thread fence.
30930 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
30931 FenceSSID == SyncScope::System) {
30932 if (Subtarget.hasMFence())
30933 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
30934
30935 SDValue Chain = Op.getOperand(0);
30936 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
30937 }
30938
30939 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
30940 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
30941}
30942
30943static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
30944 SelectionDAG &DAG) {
30945 MVT T = Op.getSimpleValueType();
30946 SDLoc DL(Op);
30947 unsigned Reg = 0;
30948 unsigned size = 0;
30949 switch(T.SimpleTy) {
30950 default: llvm_unreachable("Invalid value type!");
30951 case MVT::i8: Reg = X86::AL; size = 1; break;
30952 case MVT::i16: Reg = X86::AX; size = 2; break;
30953 case MVT::i32: Reg = X86::EAX; size = 4; break;
30954 case MVT::i64:
30955 assert(Subtarget.is64Bit() && "Node not type legal!");
30956 Reg = X86::RAX; size = 8;
30957 break;
30958 }
30959 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
30960 Op.getOperand(2), SDValue());
30961 SDValue Ops[] = { cpIn.getValue(0),
30962 Op.getOperand(1),
30963 Op.getOperand(3),
30964 DAG.getTargetConstant(size, DL, MVT::i8),
30965 cpIn.getValue(1) };
30966 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
30967 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
30968 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
30969 Ops, T, MMO);
30970
30971 SDValue cpOut =
30972 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
30973 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
30974 MVT::i32, cpOut.getValue(2));
30975 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
30976
30977 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
30978 cpOut, Success, EFLAGS.getValue(1));
30979}
30980
30981// Create MOVMSKB, taking into account whether we need to split for AVX1.
30982static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
30983 const X86Subtarget &Subtarget) {
30984 MVT InVT = V.getSimpleValueType();
30985
30986 if (InVT == MVT::v64i8) {
30987 SDValue Lo, Hi;
30988 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
30989 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
30990 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
30991 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
30992 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
30993 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
30994 DAG.getConstant(32, DL, MVT::i8));
30995 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
30996 }
30997 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30998 SDValue Lo, Hi;
30999 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
31000 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
31001 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
31002 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
31003 DAG.getConstant(16, DL, MVT::i8));
31004 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
31005 }
31006
31007 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31008}
31009
31010static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
31011 SelectionDAG &DAG) {
31012 SDValue Src = Op.getOperand(0);
31013 MVT SrcVT = Src.getSimpleValueType();
31014 MVT DstVT = Op.getSimpleValueType();
31015
31016 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
31017 // half to v32i1 and concatenating the result.
31018 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
31019 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
31020 assert(Subtarget.hasBWI() && "Expected BWI target");
31021 SDLoc dl(Op);
31022 SDValue Lo, Hi;
31023 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
31024 Lo = DAG.getBitcast(MVT::v32i1, Lo);
31025 Hi = DAG.getBitcast(MVT::v32i1, Hi);
31026 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
31027 }
31028
31029 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
31030 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
31031 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
31032 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
31033 SDLoc DL(Op);
31034 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
31035 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31036 return DAG.getZExtOrTrunc(V, DL, DstVT);
31037 }
31038
31039 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
31040 SrcVT == MVT::i64) && "Unexpected VT!");
31041
31042 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
31043 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
31044 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
31045 // This conversion needs to be expanded.
31046 return SDValue();
31047
31048 SDLoc dl(Op);
31049 if (SrcVT.isVector()) {
31050 // Widen the input vector in the case of MVT::v2i32.
31051 // Example: from MVT::v2i32 to MVT::v4i32.
31052 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
31053 SrcVT.getVectorNumElements() * 2);
31054 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
31055 DAG.getUNDEF(SrcVT));
31056 } else {
31057 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
31058 "Unexpected source type in LowerBITCAST");
31059 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
31060 }
31061
31062 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
31063 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
31064
31065 if (DstVT == MVT::x86mmx)
31066 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
31067
31068 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
31069 DAG.getIntPtrConstant(0, dl));
31070}
31071
31072/// Compute the horizontal sum of bytes in V for the elements of VT.
31073///
31074/// Requires V to be a byte vector and VT to be an integer vector type with
31075/// wider elements than V's type. The width of the elements of VT determines
31076/// how many bytes of V are summed horizontally to produce each element of the
31077/// result.
31078static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
31079 const X86Subtarget &Subtarget,
31080 SelectionDAG &DAG) {
31081 SDLoc DL(V);
31082 MVT ByteVecVT = V.getSimpleValueType();
31083 MVT EltVT = VT.getVectorElementType();
31084 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
31085 "Expected value to have byte element type.");
31086 assert(EltVT != MVT::i8 &&
31087 "Horizontal byte sum only makes sense for wider elements!");
31088 unsigned VecSize = VT.getSizeInBits();
31089 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
31090
31091 // The PSADBW instruction horizontally adds all bytes and leaves the result
31092 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
31093 if (EltVT == MVT::i64) {
31094 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
31095 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31096 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
31097 return DAG.getBitcast(VT, V);
31098 }
31099
31100 if (EltVT == MVT::i32) {
31101 // We unpack the low half and high half into i32s interleaved with zeros so
31102 // that we can use PSADBW to horizontally sum them. The most useful part of
31103 // this is that it lines up the results of two PSADBW instructions to be
31104 // two v2i64 vectors which concatenated are the 4 population counts. We can
31105 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
31106 SDValue Zeros = DAG.getConstant(0, DL, VT);
31107 SDValue V32 = DAG.getBitcast(VT, V);
31108 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
31109 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
31110
31111 // Do the horizontal sums into two v2i64s.
31112 Zeros = DAG.getConstant(0, DL, ByteVecVT);
31113 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
31114 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31115 DAG.getBitcast(ByteVecVT, Low), Zeros);
31116 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
31117 DAG.getBitcast(ByteVecVT, High), Zeros);
31118
31119 // Merge them together.
31120 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
31121 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
31122 DAG.getBitcast(ShortVecVT, Low),
31123 DAG.getBitcast(ShortVecVT, High));
31124
31125 return DAG.getBitcast(VT, V);
31126 }
31127
31128 // The only element type left is i16.
31129 assert(EltVT == MVT::i16 && "Unknown how to handle type");
31130
31131 // To obtain pop count for each i16 element starting from the pop count for
31132 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
31133 // right by 8. It is important to shift as i16s as i8 vector shift isn't
31134 // directly supported.
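 // e.g. an i16 lane holding 0x0302 (3 bits set in its high byte, 2 in its low
 // byte) becomes 0x0200 after the shift, 0x0502 after the i8 add, and 0x0005
 // after the final i16 shift right, i.e. the 3 + 2 = 5 total.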
31135 SDValue ShifterV = DAG.getConstant(8, DL, VT);
31136 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31137 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
31138 DAG.getBitcast(ByteVecVT, V));
31139 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
31140}
31141
31142static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
31143 const X86Subtarget &Subtarget,
31144 SelectionDAG &DAG) {
31145 MVT VT = Op.getSimpleValueType();
31146 MVT EltVT = VT.getVectorElementType();
31147 int NumElts = VT.getVectorNumElements();
31148 (void)EltVT;
31149 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
31150
31151 // Implement a lookup table in register by using an algorithm based on:
31152 // http://wm.ite.pl/articles/sse-popcount.html
31153 //
31154 // The general idea is that every lower byte nibble in the input vector is an
31155 // index into an in-register pre-computed pop count table. We then split up
31156 // the input vector into two new ones: (1) a vector with only the shifted-right
31157 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
31158 // masked out higher ones) for each byte. PSHUFB is used separately with both
31159 // to index the in-register table. Next, both are added and the result is an
31160 // i8 vector where each element contains the pop count for its input byte.
31161 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
31162 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
31163 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
31164 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
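 // e.g. popcount(0xE9): the high nibble 0xE looks up LUT[0xE] == 3, the low
 // nibble 0x9 looks up LUT[0x9] == 2, and the sum 5 is the number of set bits.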
31165
31166 SmallVector<SDValue, 64> LUTVec;
31167 for (int i = 0; i < NumElts; ++i)
31168 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
31169 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
31170 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
31171
31172 // High nibbles
31173 SDValue FourV = DAG.getConstant(4, DL, VT);
31174 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
31175
31176 // Low nibbles
31177 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
31178
31179 // The input vector is used as the shuffle mask that index elements into the
31180 // LUT. After counting low and high nibbles, add the vector to obtain the
31181 // final pop count per i8 element.
31182 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
31183 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
31184 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
31185}
31186
31187// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
31188// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
31189static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL,
31190 const X86Subtarget &Subtarget,
31191 SelectionDAG &DAG) {
31192 MVT VT = Op.getSimpleValueType();
31193 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
31194 "Unknown CTPOP type to handle");
31195 SDValue Op0 = Op.getOperand(0);
31196
31197 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
31198 if (Subtarget.hasVPOPCNTDQ()) {
31199 unsigned NumElems = VT.getVectorNumElements();
31200 assert((VT.getVectorElementType() == MVT::i8 ||
31201 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
31202 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
31203 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
31204 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
31205 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
31206 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
31207 }
31208 }
31209
31210 // Decompose 256-bit ops into smaller 128-bit ops.
31211 if (VT.is256BitVector() && !Subtarget.hasInt256())
31212 return splitVectorIntUnary(Op, DAG, DL);
31213
31214 // Decompose 512-bit ops into smaller 256-bit ops.
31215 if (VT.is512BitVector() && !Subtarget.hasBWI())
31216 return splitVectorIntUnary(Op, DAG, DL);
31217
31218 // For element types greater than i8, do vXi8 pop counts and a bytesum.
31219 if (VT.getScalarType() != MVT::i8) {
31220 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31221 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
31222 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
31223 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
31224 }
31225
31226 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
31227 if (!Subtarget.hasSSSE3())
31228 return SDValue();
31229
31230 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
31231}
31232
31233static SDValue LowerCTPOP(SDValue N, const X86Subtarget &Subtarget,
31234 SelectionDAG &DAG) {
31235 MVT VT = N.getSimpleValueType();
31236 SDValue Op = N.getOperand(0);
31237 SDLoc DL(N);
31238
31239 if (VT.isScalarInteger()) {
31240 // Compute the lower/upper bounds of the active bits of the value,
31241 // allowing us to shift the active bits down if necessary to fit into the
31242 // special cases below.
31243 KnownBits Known = DAG.computeKnownBits(Op);
31244 unsigned LZ = Known.countMinLeadingZeros();
31245 unsigned TZ = Known.countMinTrailingZeros();
31246 assert((LZ + TZ) < Known.getBitWidth() && "Illegal shifted mask");
31247 unsigned ActiveBits = Known.getBitWidth() - LZ;
31248 unsigned ShiftedActiveBits = Known.getBitWidth() - (LZ + TZ);
31249
31250 // i2 CTPOP - "ctpop(x) --> sub(x, (x >> 1))".
31251 if (ShiftedActiveBits <= 2) {
31252 if (ActiveBits > 2)
31253 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31254 DAG.getShiftAmountConstant(TZ, VT, DL));
31255 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31256 Op = DAG.getNode(ISD::SUB, DL, MVT::i32, Op,
31257 DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31258 DAG.getShiftAmountConstant(1, VT, DL)));
31259 return DAG.getZExtOrTrunc(Op, DL, VT);
31260 }
31261
31262 // i3 CTPOP - perform LUT into i32 integer.
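 // The constant below packs ctpop(7)..ctpop(0) as 2-bit fields
 // (11 10 10 01 10 01 01 00); shifting x left by 1 turns it into a bit offset,
 // and the masked right shift of the constant extracts ctpop(x).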
31263 if (ShiftedActiveBits <= 3) {
31264 if (ActiveBits > 3)
31265 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31266 DAG.getShiftAmountConstant(TZ, VT, DL));
31267 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31268 Op = DAG.getNode(ISD::SHL, DL, MVT::i32, Op,
31269 DAG.getShiftAmountConstant(1, VT, DL));
31270 Op = DAG.getNode(ISD::SRL, DL, MVT::i32,
31271 DAG.getConstant(0b1110100110010100U, DL, MVT::i32), Op);
31272 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op,
31273 DAG.getConstant(0x3, DL, MVT::i32));
31274 return DAG.getZExtOrTrunc(Op, DL, VT);
31275 }
31276
31277 // i4 CTPOP - perform LUT into i64 integer.
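 // Here nibble i of 0x4332322132212110 holds ctpop(i); multiplying x by 4 and
 // shifting the LUT right by that amount brings ctpop(x) into the low bits,
 // where the 0x7 mask extracts it (the maximum value is 4).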
31278 if (ShiftedActiveBits <= 4 &&
31279 DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64)) {
31280 SDValue LUT = DAG.getConstant(0x4332322132212110ULL, DL, MVT::i64);
31281 if (ActiveBits > 4)
31282 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31283 DAG.getShiftAmountConstant(TZ, VT, DL));
31284 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31285 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31286 DAG.getConstant(4, DL, MVT::i32));
31287 Op = DAG.getNode(ISD::SRL, DL, MVT::i64, LUT,
31288 DAG.getShiftAmountOperand(MVT::i64, Op));
31289 Op = DAG.getNode(ISD::AND, DL, MVT::i64, Op,
31290 DAG.getConstant(0x7, DL, MVT::i64));
31291 return DAG.getZExtOrTrunc(Op, DL, VT);
31292 }
31293
31294 // i8 CTPOP - with efficient i32 MUL, then attempt multiply-mask-multiply.
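 // Multiplying the 8-bit value by 0x08040201 places non-overlapping copies of
 // it at bit offsets 0, 9, 18 and 27; after '>> 3' and masking with 0x11111111
 // each nibble holds one bit of the input, and the final multiply by 0x11111111
 // sums those nibbles into bits 31:28, from which '>> 28' extracts the count.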
31295 if (ShiftedActiveBits <= 8) {
31296 SDValue Mask11 = DAG.getConstant(0x11111111U, DL, MVT::i32);
31297 if (ActiveBits > 8)
31298 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
31299 DAG.getShiftAmountConstant(TZ, VT, DL));
31300 Op = DAG.getZExtOrTrunc(Op, DL, MVT::i32);
31301 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op,
31302 DAG.getConstant(0x08040201U, DL, MVT::i32));
31303 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31304 DAG.getShiftAmountConstant(3, MVT::i32, DL));
31305 Op = DAG.getNode(ISD::AND, DL, MVT::i32, Op, Mask11);
31306 Op = DAG.getNode(ISD::MUL, DL, MVT::i32, Op, Mask11);
31307 Op = DAG.getNode(ISD::SRL, DL, MVT::i32, Op,
31308 DAG.getShiftAmountConstant(28, MVT::i32, DL));
31309 return DAG.getZExtOrTrunc(Op, DL, VT);
31310 }
31311
31312 return SDValue(); // fallback to generic expansion.
31313 }
31314
31315 assert(VT.isVector() &&
31316 "We only do custom lowering for vector population count.");
31317 return LowerVectorCTPOP(N, DL, Subtarget, DAG);
31318}
31319
31320static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
31321 MVT VT = Op.getSimpleValueType();
31322 SDValue In = Op.getOperand(0);
31323 SDLoc DL(Op);
31324
31325 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
31326 // perform the BITREVERSE.
31327 if (!VT.isVector()) {
31328 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31329 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31330 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
31331 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
31332 DAG.getIntPtrConstant(0, DL));
31333 }
31334
31335 int NumElts = VT.getVectorNumElements();
31336 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
31337
31338 // Decompose 256-bit ops into smaller 128-bit ops.
31339 if (VT.is256BitVector())
31340 return splitVectorIntUnary(Op, DAG, DL);
31341
31342 assert(VT.is128BitVector() &&
31343 "Only 128-bit vector bitreverse lowering supported.");
31344
31345 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
31346 // perform the BSWAP in the shuffle.
31347 // It's best to shuffle using the second operand as this will implicitly allow
31348 // memory folding for multiple vectors.
31349 SmallVector<SDValue, 16> MaskElts;
31350 for (int i = 0; i != NumElts; ++i) {
31351 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
31352 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
31353 int PermuteByte = SourceByte | (2 << 5);
31354 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
31355 }
31356 }
31357
31358 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
31359 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
31360 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
31361 Res, Mask);
31362 return DAG.getBitcast(VT, Res);
31363}
31364
31365static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
31366 SelectionDAG &DAG) {
31367 MVT VT = Op.getSimpleValueType();
31368
31369 if (Subtarget.hasXOP() && !VT.is512BitVector())
31370 return LowerBITREVERSE_XOP(Op, DAG);
31371
31372 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
31373
31374 SDValue In = Op.getOperand(0);
31375 SDLoc DL(Op);
31376
31377 // Split 512-bit ops without BWI so that we can still use the PSHUFB lowering.
31378 if (VT.is512BitVector() && !Subtarget.hasBWI())
31379 return splitVectorIntUnary(Op, DAG, DL);
31380
31381 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
31382 if (VT.is256BitVector() && !Subtarget.hasInt256())
31383 return splitVectorIntUnary(Op, DAG, DL);
31384
31385 // Lower i8/i16/i32/i64 as vXi8 BITREVERSE + BSWAP
31386 if (!VT.isVector()) {
31387 assert(
31388 (VT == MVT::i32 || VT == MVT::i64 || VT == MVT::i16 || VT == MVT::i8) &&
31389 "Only tested for i8/i16/i32/i64");
31390 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
31391 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
31392 Res = DAG.getNode(ISD::BITREVERSE, DL, MVT::v16i8,
31393 DAG.getBitcast(MVT::v16i8, Res));
31394 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
31395 DAG.getBitcast(VecVT, Res), DAG.getIntPtrConstant(0, DL));
31396 return (VT == MVT::i8) ? Res : DAG.getNode(ISD::BSWAP, DL, VT, Res);
31397 }
31398
31399 assert(VT.isVector() && VT.getSizeInBits() >= 128);
31400
31401 // Lower vXi16/vXi32/vXi64 as BSWAP + vXi8 BITREVERSE.
31402 if (VT.getScalarType() != MVT::i8) {
31403 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
31404 SDValue Res = DAG.getNode(ISD::BSWAP, DL, VT, In);
31405 Res = DAG.getBitcast(ByteVT, Res);
31406 Res = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Res);
31407 return DAG.getBitcast(VT, Res);
31408 }
31409 assert(VT.isVector() && VT.getScalarType() == MVT::i8 &&
31410 "Only byte vector BITREVERSE supported");
31411
31412 unsigned NumElts = VT.getVectorNumElements();
31413
31414 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
31415 if (Subtarget.hasGFNI()) {
31416 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
31417 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
31418 Matrix = DAG.getBitcast(VT, Matrix);
31419 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
31420 DAG.getTargetConstant(0, DL, MVT::i8));
31421 }
31422
31423 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
31424 // two nibbles and a PSHUFB lookup to find the bitreverse of each
31425 // 0-15 value (moved to the other nibble).
31426 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
31427 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
31428 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
31429
31430 const int LoLUT[16] = {
31431 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
31432 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
31433 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
31434 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
31435 const int HiLUT[16] = {
31436 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
31437 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
31438 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
31439 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
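 // e.g. reversing 0xD2 (0b11010010): LoLUT[0x2] == 0x40 moves the reversed low
 // nibble into the high nibble, HiLUT[0xD] == 0x0B moves the reversed high
 // nibble into the low nibble, and OR'ing gives 0x4B (0b01001011).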
31440
31441 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
31442 for (unsigned i = 0; i < NumElts; ++i) {
31443 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
31444 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
31445 }
31446
31447 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
31448 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
31449 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
31450 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
31451 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31452}
31453
31454static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
31455 SelectionDAG &DAG) {
31456 SDLoc DL(Op);
31457 SDValue X = Op.getOperand(0);
31458 MVT VT = Op.getSimpleValueType();
31459
31460 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
31461 if (VT == MVT::i8 ||
31462 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
31463 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31464 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
31465 DAG.getConstant(0, DL, MVT::i8));
31466 // Copy the inverse of the parity flag into a register with setcc.
31467 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31468 // Extend to the original type.
31469 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31470 }
31471
31472 // If we have POPCNT, use the default expansion.
31473 if (Subtarget.hasPOPCNT())
31474 return SDValue();
31475
31476 if (VT == MVT::i64) {
31477 // Xor the high and low 32-bit halves together using a 32-bit operation.
31478 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
31479 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
31480 DAG.getConstant(32, DL, MVT::i8)));
31481 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
31482 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
31483 }
31484
31485 if (VT != MVT::i16) {
31486 // Xor the high and low 16-bits together using a 32-bit operation.
31487 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
31488 DAG.getConstant(16, DL, MVT::i8));
31489 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
31490 } else {
31491 // If the input is 16-bits, we need to extend to use an i32 shift below.
31492 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
31493 }
31494
31495 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
31496 // This should allow an h-reg to be used to save a shift.
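 // e.g. with the intermediate value in EAX this can become 'xor al, ah'
 // followed by 'setnp al', since PF reflects only the low 8 bits of the result.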
31497 SDValue Hi = DAG.getNode(
31498 ISD::TRUNCATE, DL, MVT::i8,
31499 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
31500 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
31501 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
31502 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
31503
31504 // Copy the inverse of the parity flag into a register with setcc.
31505 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
31506 // Extend to the original type.
31507 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
31508}
31509
31510static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
31511 const X86Subtarget &Subtarget) {
31512 unsigned NewOpc = 0;
31513 switch (N->getOpcode()) {
31514 case ISD::ATOMIC_LOAD_ADD:
31515 NewOpc = X86ISD::LADD;
31516 break;
31517 case ISD::ATOMIC_LOAD_SUB:
31518 NewOpc = X86ISD::LSUB;
31519 break;
31520 case ISD::ATOMIC_LOAD_OR:
31521 NewOpc = X86ISD::LOR;
31522 break;
31523 case ISD::ATOMIC_LOAD_XOR:
31524 NewOpc = X86ISD::LXOR;
31525 break;
31526 case ISD::ATOMIC_LOAD_AND:
31527 NewOpc = X86ISD::LAND;
31528 break;
31529 default:
31530 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
31531 }
31532
31533 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
31534
31535 return DAG.getMemIntrinsicNode(
31536 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
31537 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
31538 /*MemVT=*/N->getSimpleValueType(0), MMO);
31539}
31540
31541/// Lower atomic_load_ops into LOCK-prefixed operations.
31542static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
31543 const X86Subtarget &Subtarget) {
31544 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
31545 SDValue Chain = N->getOperand(0);
31546 SDValue LHS = N->getOperand(1);
31547 SDValue RHS = N->getOperand(2);
31548 unsigned Opc = N->getOpcode();
31549 MVT VT = N->getSimpleValueType(0);
31550 SDLoc DL(N);
31551
31552 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
31553 // can only be lowered when the result is unused. They should have already
31554 // been transformed into a cmpxchg loop in AtomicExpand.
31555 if (N->hasAnyUseOfValue(0)) {
31556 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
31557 // select LXADD if LOCK_SUB can't be selected.
31558 // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we
31559 // can use LXADD as opposed to cmpxchg.
31560 if (Opc == ISD::ATOMIC_LOAD_SUB ||
31561 (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS)))
31562 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
31563 DAG.getNegative(RHS, DL, VT), AN->getMemOperand());
31564
31565 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
31566 "Used AtomicRMW ops other than Add should have been expanded!");
31567 return N;
31568 }
31569
31570 // Specialized lowering for the canonical form of an idempotent atomicrmw.
31571 // The core idea here is that since the memory location isn't actually
31572 // changing, all we need is a lowering for the *ordering* impacts of the
31573 // atomicrmw. As such, we can choose a different operation and memory
31574 // location to minimize impact on other code.
31575 // The above holds unless the node is marked volatile in which
31576 // case it needs to be preserved according to the langref.
31577 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS) && !AN->isVolatile()) {
31578 // On X86, the only ordering which actually requires an instruction is
31579 // seq_cst that isn't SingleThread; everything else just needs to be preserved
31580 // during codegen and then dropped. Note that we expect (but don't assume),
31581 // that orderings other than seq_cst and acq_rel have been canonicalized to
31582 // a store or load.
31583 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
31584 AN->getSyncScopeID() == SyncScope::System) {
31585 // Prefer a locked operation against a stack location to minimize cache
31586 // traffic. This assumes that stack locations are very likely to be
31587 // accessed only by the owning thread.
31588 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
31589 assert(!N->hasAnyUseOfValue(0));
31590 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31591 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31592 DAG.getUNDEF(VT), NewChain);
31593 }
31594 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
31595 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
31596 assert(!N->hasAnyUseOfValue(0));
31597 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31598 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31599 DAG.getUNDEF(VT), NewChain);
31600 }
31601
31602 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
31603 // RAUW the chain, but don't worry about the result, as it's unused.
31604 assert(!N->hasAnyUseOfValue(0));
31605 // NOTE: The getUNDEF is needed to give something for the unused result 0.
31606 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
31607 DAG.getUNDEF(VT), LockOp.getValue(1));
31608}
31609
31610static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
31611 const X86Subtarget &Subtarget) {
31612 auto *Node = cast<AtomicSDNode>(Op.getNode());
31613 SDLoc dl(Node);
31614 EVT VT = Node->getMemoryVT();
31615
31616 bool IsSeqCst =
31617 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
31618 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
31619
31620 // If this store is not sequentially consistent and the type is legal
31621 // we can just keep it.
31622 if (!IsSeqCst && IsTypeLegal)
31623 return Op;
31624
31625 if (VT == MVT::i64 && !IsTypeLegal) {
31626 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
31627 // is enabled.
31628 bool NoImplicitFloatOps =
31629 DAG.getMachineFunction().getFunction().hasFnAttribute(
31630 Attribute::NoImplicitFloat);
31631 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
31632 SDValue Chain;
31633 if (Subtarget.hasSSE1()) {
31634 SDValue SclToVec =
31635 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
31636 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
31637 SclToVec = DAG.getBitcast(StVT, SclToVec);
31638 SDVTList Tys = DAG.getVTList(MVT::Other);
31639 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
31640 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
31641 MVT::i64, Node->getMemOperand());
31642 } else if (Subtarget.hasX87()) {
31643 // First load this into an 80-bit X87 register using a stack temporary.
31644 // This will put the whole integer into the significand.
31645 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
31646 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
31647 MachinePointerInfo MPI =
31648 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
31649 Chain = DAG.getStore(Node->getChain(), dl, Node->getVal(), StackPtr,
31650 MPI, MaybeAlign(), MachineMemOperand::MOVolatile);
31651 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
31652 SDValue LdOps[] = {Chain, StackPtr};
31653 SDValue Value = DAG.getMemIntrinsicNode(
31654 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
31655 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
31656 Chain = Value.getValue(1);
31657
31658 // Now use an FIST to do the atomic store.
31659 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
31660 Chain =
31661 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
31662 StoreOps, MVT::i64, Node->getMemOperand());
31663 }
31664
31665 if (Chain) {
31666 // If this is a sequentially consistent store, also emit an appropriate
31667 // barrier.
31668 if (IsSeqCst)
31669 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
31670
31671 return Chain;
31672 }
31673 }
31674 }
31675
31676 // Convert seq_cst store -> xchg
31677 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
31678 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
31679 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, Node->getMemoryVT(),
31680 Node->getOperand(0), Node->getOperand(2),
31681 Node->getOperand(1), Node->getMemOperand());
31682 return Swap.getValue(1);
31683}
31684
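/// Lower UADDO_CARRY/SADDO_CARRY/USUBO_CARRY/SSUBO_CARRY by materializing the
/// incoming carry into EFLAGS, emitting X86ISD::ADC/SBB for the arithmetic,
/// and producing the outgoing carry/overflow result with a SETCC.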
31685 static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
31686 SDNode *N = Op.getNode();
31687 MVT VT = N->getSimpleValueType(0);
31688 unsigned Opc = Op.getOpcode();
31689
31690 // Let legalize expand this if it isn't a legal type yet.
31691 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31692 return SDValue();
31693
31694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
31695 SDLoc DL(N);
31696
31697 // Set the carry flag.
31698 SDValue Carry = Op.getOperand(2);
31699 EVT CarryVT = Carry.getValueType();
31700 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
31701 Carry, DAG.getAllOnesConstant(DL, CarryVT));
31702
31703 bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY;
31704 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
31705 Op.getOperand(0), Op.getOperand(1),
31706 Carry.getValue(1));
31707
31708 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
31709 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
31710 Sum.getValue(1), DL, DAG);
31711 if (N->getValueType(1) == MVT::i1)
31712 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
31713
31714 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
31715}
31716
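/// Lower ISD::FSINCOS on 64-bit Darwin by calling the __sincos_stret runtime
/// entry point, which returns both the sine and cosine in XMM registers.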
31717static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
31718 SelectionDAG &DAG) {
31719 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
31720
31721 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
31722 // which returns the values as { float, float } (in XMM0) or
31723 // { double, double } (which is returned in XMM0, XMM1).
31724 SDLoc dl(Op);
31725 SDValue Arg = Op.getOperand(0);
31726 EVT ArgVT = Arg.getValueType();
31727 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
31728
31729 TargetLowering::ArgListTy Args;
31730 TargetLowering::ArgListEntry Entry;
31731
31732 Entry.Node = Arg;
31733 Entry.Ty = ArgTy;
31734 Entry.IsSExt = false;
31735 Entry.IsZExt = false;
31736 Args.push_back(Entry);
31737
31738 bool isF64 = ArgVT == MVT::f64;
31739 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
31740 // the small struct {f32, f32} is returned in (eax, edx). For f64,
31741 // the results are returned via SRet in memory.
31742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31743 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
31744 const char *LibcallName = TLI.getLibcallName(LC);
31745 SDValue Callee =
31746 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
31747
31748 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
31749 : (Type *)FixedVectorType::get(ArgTy, 4);
31750
31751 TargetLowering::CallLoweringInfo CLI(DAG);
31752 CLI.setDebugLoc(dl)
31753 .setChain(DAG.getEntryNode())
31754 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
31755
31756 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
31757
31758 if (isF64)
31759 // Returned in xmm0 and xmm1.
31760 return CallResult.first;
31761
31762 // Returned in bits 0:31 and 32:64 xmm0.
31763 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31764 CallResult.first, DAG.getIntPtrConstant(0, dl));
31765 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
31766 CallResult.first, DAG.getIntPtrConstant(1, dl));
31767 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
31768 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
31769}
31770
31771/// Widen a vector input to a vector of NVT. The
31772/// input vector must have the same element type as NVT.
31773 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
31774 bool FillWithZeroes = false) {
31775 // Check if InOp already has the right width.
31776 MVT InVT = InOp.getSimpleValueType();
31777 if (InVT == NVT)
31778 return InOp;
31779
31780 if (InOp.isUndef())
31781 return DAG.getUNDEF(NVT);
31782
31783 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
31784 "input and widen element type must match");
31785
31786 unsigned InNumElts = InVT.getVectorNumElements();
31787 unsigned WidenNumElts = NVT.getVectorNumElements();
31788 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
31789 "Unexpected request for vector widening");
31790
31791 SDLoc dl(InOp);
31792 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
31793 InOp.getNumOperands() == 2) {
31794 SDValue N1 = InOp.getOperand(1);
31795 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
31796 N1.isUndef()) {
31797 InOp = InOp.getOperand(0);
31798 InVT = InOp.getSimpleValueType();
31799 InNumElts = InVT.getVectorNumElements();
31800 }
31801 }
31802 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
31803 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
31804 SmallVector<SDValue, 16> Ops;
31805 for (unsigned i = 0; i < InNumElts; ++i)
31806 Ops.push_back(InOp.getOperand(i));
31807
31808 EVT EltVT = InOp.getOperand(0).getValueType();
31809
31810 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
31811 DAG.getUNDEF(EltVT);
31812 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
31813 Ops.push_back(FillVal);
31814 return DAG.getBuildVector(NVT, dl, Ops);
31815 }
31816 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
31817 DAG.getUNDEF(NVT);
31818 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
31819 InOp, DAG.getIntPtrConstant(0, dl));
31820}
31821
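/// Lower ISD::MSCATTER for AVX-512. Without VLX, narrow data and index vectors
/// are widened to 512 bits, with the extra mask lanes zeroed so the added
/// elements are never stored.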
31822 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
31823 SelectionDAG &DAG) {
31824 assert(Subtarget.hasAVX512() &&
31825 "MGATHER/MSCATTER are supported on AVX-512 arch only");
31826
31827 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
31828 SDValue Src = N->getValue();
31829 MVT VT = Src.getSimpleValueType();
31830 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
31831 SDLoc dl(Op);
31832
31833 SDValue Scale = N->getScale();
31834 SDValue Index = N->getIndex();
31835 SDValue Mask = N->getMask();
31836 SDValue Chain = N->getChain();
31837 SDValue BasePtr = N->getBasePtr();
31838
31839 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
31840 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
31841 // If the index is v2i64 and we have VLX we can use xmm for data and index.
31842 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
31843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31844 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
31845 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
31846 SDVTList VTs = DAG.getVTList(MVT::Other);
31847 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31848 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31849 N->getMemoryVT(), N->getMemOperand());
31850 }
31851 return SDValue();
31852 }
31853
31854 MVT IndexVT = Index.getSimpleValueType();
31855
31856 // If the index is v2i32, we're being called by type legalization and we
31857 // should just let the default handling take care of it.
31858 if (IndexVT == MVT::v2i32)
31859 return SDValue();
31860
31861 // If we don't have VLX and neither the data nor the index is 512 bits, we
31862 // need to widen until one is.
31863 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
31864 !Index.getSimpleValueType().is512BitVector()) {
31865 // Determine how much we need to widen by to get a 512-bit type.
31866 unsigned Factor = std::min(512/VT.getSizeInBits(),
31867 512/IndexVT.getSizeInBits());
31868 unsigned NumElts = VT.getVectorNumElements() * Factor;
31869
31870 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
31871 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
31872 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
31873
31874 Src = ExtendToType(Src, VT, DAG);
31875 Index = ExtendToType(Index, IndexVT, DAG);
31876 Mask = ExtendToType(Mask, MaskVT, DAG, true);
31877 }
31878
31879 SDVTList VTs = DAG.getVTList(MVT::Other);
31880 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
31881 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
31882 N->getMemoryVT(), N->getMemOperand());
31883}
31884
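/// Lower ISD::MLOAD. AVX (non-i1 mask) masked loads only support a zero
/// passthru, so other passthru values are handled with an explicit blend;
/// AVX-512 loads without VLX are widened to 512 bits and the result is
/// extracted back to the original width.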
31885static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
31886 SelectionDAG &DAG) {
31887
31888 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
31889 MVT VT = Op.getSimpleValueType();
31890 MVT ScalarVT = VT.getScalarType();
31891 SDValue Mask = N->getMask();
31892 MVT MaskVT = Mask.getSimpleValueType();
31893 SDValue PassThru = N->getPassThru();
31894 SDLoc dl(Op);
31895
31896 // Handle AVX masked loads which don't support passthru other than 0.
31897 if (MaskVT.getVectorElementType() != MVT::i1) {
31898 // We also allow undef in the isel pattern.
31899 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
31900 return Op;
31901
31902 SDValue NewLoad = DAG.getMaskedLoad(
31903 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31904 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
31905 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
31906 N->isExpandingLoad());
31907 // Emit a blend.
31908 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
31909 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
31910 }
31911
31912 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
31913 "Expanding masked load is supported on AVX-512 target only!");
31914
31915 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
31916 "Expanding masked load is supported for 32 and 64-bit types only!");
31917
31918 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31919 "Cannot lower masked load op.");
31920
31921 assert((ScalarVT.getSizeInBits() >= 32 ||
31922 (Subtarget.hasBWI() &&
31923 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31924 "Unsupported masked load op.");
31925
31926 // This operation is legal for targets with VLX, but without
31927 // VLX the vector should be widened to 512 bit
31928 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
31929 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31930 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
31931
31932 // Mask element has to be i1.
31933 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31934 "Unexpected mask type");
31935
31936 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31937
31938 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31939 SDValue NewLoad = DAG.getMaskedLoad(
31940 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
31941 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
31942 N->getExtensionType(), N->isExpandingLoad());
31943
31944 SDValue Extract =
31945 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
31946 DAG.getIntPtrConstant(0, dl));
31947 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
31948 return DAG.getMergeValues(RetOps, dl);
31949}
31950
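/// Lower ISD::MSTORE on AVX-512 targets without VLX by widening the data and
/// mask to 512 bits; the extra mask lanes are zero, so no additional elements
/// are written.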
31951static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
31952 SelectionDAG &DAG) {
31953 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
31954 SDValue DataToStore = N->getValue();
31955 MVT VT = DataToStore.getSimpleValueType();
31956 MVT ScalarVT = VT.getScalarType();
31957 SDValue Mask = N->getMask();
31958 SDLoc dl(Op);
31959
31960 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
31961 "Compressing masked store is supported on AVX-512 target only!");
31962
31963 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
31964 "Compressing masked store is supported for 32 and 64-bit types only!");
31965
31966 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
31967 "Cannot lower masked store op.");
31968
31969 assert((ScalarVT.getSizeInBits() >= 32 ||
31970 (Subtarget.hasBWI() &&
31971 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
31972 "Unsupported masked store op.");
31973
31974 // This operation is legal for targets with VLX, but without
31975 // VLX the vector should be widened to 512 bit
31976 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
31977 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
31978
31979 // Mask element has to be i1.
31980 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
31981 "Unexpected mask type");
31982
31983 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
31984
31985 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
31986 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
31987 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
31988 N->getOffset(), Mask, N->getMemoryVT(),
31989 N->getMemOperand(), N->getAddressingMode(),
31990 N->isTruncatingStore(), N->isCompressingStore());
31991}
31992
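/// Lower ISD::MGATHER. Without VLX, the passthru and index are widened to 512
/// bits; an undef passthru is replaced with zero to break the false dependency
/// on the destination register.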
31993static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
31994 SelectionDAG &DAG) {
31995 assert(Subtarget.hasAVX2() &&
31996 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
31997
31998 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
31999 SDLoc dl(Op);
32000 MVT VT = Op.getSimpleValueType();
32001 SDValue Index = N->getIndex();
32002 SDValue Mask = N->getMask();
32003 SDValue PassThru = N->getPassThru();
32004 MVT IndexVT = Index.getSimpleValueType();
32005
32006 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
32007
32008 // If the index is v2i32, we're being called by type legalization.
32009 if (IndexVT == MVT::v2i32)
32010 return SDValue();
32011
32012 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
32013 // need to widen until one is.
32014 MVT OrigVT = VT;
32015 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32016 !IndexVT.is512BitVector()) {
32017 // Determine how much we need to widen by to get a 512-bit type.
32018 unsigned Factor = std::min(512/VT.getSizeInBits(),
32019 512/IndexVT.getSizeInBits());
32020
32021 unsigned NumElts = VT.getVectorNumElements() * Factor;
32022
32023 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32024 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32025 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32026
32027 PassThru = ExtendToType(PassThru, VT, DAG);
32028 Index = ExtendToType(Index, IndexVT, DAG);
32029 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32030 }
32031
32032 // Break dependency on the data register.
32033 if (PassThru.isUndef())
32034 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
32035
32036 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
32037 N->getScale() };
32038 SDValue NewGather = DAG.getMemIntrinsicNode(
32039 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
32040 N->getMemOperand());
32041 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
32042 NewGather, DAG.getIntPtrConstant(0, dl));
32043 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
32044}
32045
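/// Lower ISD::ADDRSPACECAST between x86 pointer address spaces by
/// zero-extending unsigned 32-bit pointers (PTR32_UPTR) to 64 bits,
/// sign-extending other 32-bit pointers, or truncating 64-bit pointers to 32
/// bits.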
32046 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
32047 SDLoc dl(Op);
32048 SDValue Src = Op.getOperand(0);
32049 MVT DstVT = Op.getSimpleValueType();
32050
32051 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
32052 unsigned SrcAS = N->getSrcAddressSpace();
32053
32054 assert(SrcAS != N->getDestAddressSpace() &&
32055 "addrspacecast must be between different address spaces");
32056
32057 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
32058 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
32059 } else if (DstVT == MVT::i64) {
32060 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
32061 } else if (DstVT == MVT::i32) {
32062 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
32063 } else {
32064 report_fatal_error("Bad address space in addrspacecast");
32065 }
32066 return Op;
32067}
32068
32069SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
32070 SelectionDAG &DAG) const {
32071 // TODO: Eventually, the lowering of these nodes should be informed by or
32072 // deferred to the GC strategy for the function in which they appear. For
32073 // now, however, they must be lowered to something. Since they are logically
32074 // no-ops in the case of a null GC strategy (or a GC strategy which does not
32075 // require special handling for these nodes), lower them as literal NOOPs for
32076 // the time being.
32077 SmallVector<SDValue, 2> Ops;
32078 Ops.push_back(Op.getOperand(0));
32079 if (Op->getGluedNode())
32080 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
32081
32082 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
32083 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
32084}
32085
32086// Custom split CVTPS2PH with wide types.
32087 static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
32088 SDLoc dl(Op);
32089 EVT VT = Op.getValueType();
32090 SDValue Lo, Hi;
32091 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
32092 EVT LoVT, HiVT;
32093 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32094 SDValue RC = Op.getOperand(1);
32095 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
32096 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
32097 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32098}
32099
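/// Lower ISD::PREFETCH. Instruction (non-data) prefetches are only kept when
/// the target has PREFETCHI; otherwise the node is dropped and only its chain
/// is preserved.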
32100 static SDValue LowerPREFETCH(SDValue Op, const X86Subtarget &Subtarget,
32101 SelectionDAG &DAG) {
32102 unsigned IsData = Op.getConstantOperandVal(4);
32103
32104 // We don't support non-data prefetch without PREFETCHI.
32105 // Just preserve the chain.
32106 if (!IsData && !Subtarget.hasPREFETCHI())
32107 return Op.getOperand(0);
32108
32109 return Op;
32110}
32111
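/// Scan the inline-asm strings for the instruction mnemonic that uses operand
/// OpNo (e.g. "call" in "call dword ptr ${0:P}"), returning an empty StringRef
/// if no match is found.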
32112 static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
32113 unsigned OpNo) {
32114 const APInt Operand(32, OpNo);
32115 std::string OpNoStr = llvm::toString(Operand, 10, false);
32116 std::string Str(" $");
32117
32118 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
32119 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
32120
32121 auto I = StringRef::npos;
32122 for (auto &AsmStr : AsmStrs) {
32123 // Match the OpNo string. We should match exactly to avoid matching a
32124 // sub-string, e.g. "$12" contains "$1".
32125 if (AsmStr.ends_with(OpNoStr1))
32126 I = AsmStr.size() - OpNoStr1.size();
32127
32128 // Get the index of operand in AsmStr.
32129 if (I == StringRef::npos)
32130 I = AsmStr.find(OpNoStr1 + ",");
32131 if (I == StringRef::npos)
32132 I = AsmStr.find(OpNoStr2);
32133
32134 if (I == StringRef::npos)
32135 continue;
32136
32137 assert(I > 0 && "Unexpected inline asm string!");
32138 // Remove the operand string and label (if it exists).
32139 // For example:
32140 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
32141 // ==>
32142 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
32143 // ==>
32144 // "call dword ptr "
32145 auto TmpStr = AsmStr.substr(0, I);
32146 I = TmpStr.rfind(':');
32147 if (I != StringRef::npos)
32148 TmpStr = TmpStr.substr(I + 1);
32149 return TmpStr.take_while(llvm::isAlpha);
32150 }
32151
32152 return StringRef();
32153}
32154
32155 bool X86TargetLowering::isInlineAsmTargetBranch(
32156 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
32157 // In a __asm block, __asm inst foo where inst is CALL or JMP should be
32158 // changed from indirect TargetLowering::C_Memory to direct
32159 // TargetLowering::C_Address.
32160 // We don't need to special case LOOP* and Jcc, which cannot target a memory
32161 // location.
32162 StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo);
32163 return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
32164}
32165
32166/// Provide custom lowering hooks for some operations.
32167 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
32168 switch (Op.getOpcode()) {
32169 // clang-format off
32170 default: llvm_unreachable("Should not custom lower this!");
32171 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
32172 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
32173 return LowerCMP_SWAP(Op, Subtarget, DAG);
32174 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
32175 case ISD::ATOMIC_LOAD_ADD:
32176 case ISD::ATOMIC_LOAD_SUB:
32177 case ISD::ATOMIC_LOAD_OR:
32178 case ISD::ATOMIC_LOAD_XOR:
32179 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
32180 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
32181 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
32182 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
32183 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
32184 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
32185 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
32186 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
32187 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
32188 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
32189 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
32190 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
32191 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
32192 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
32193 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
32194 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
32195 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
32196 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
32197 case ISD::SHL_PARTS:
32198 case ISD::SRA_PARTS:
32199 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
32200 case ISD::FSHL:
32201 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
32202 case ISD::STRICT_SINT_TO_FP:
32203 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
32204 case ISD::STRICT_UINT_TO_FP:
32205 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
32206 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
32207 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
32208 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
32209 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
32212 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
32213 case ISD::FP_TO_SINT:
32215 case ISD::FP_TO_UINT:
32214 case ISD::STRICT_FP_TO_SINT:
32215 case ISD::FP_TO_UINT:
32217 case ISD::FP_TO_SINT_SAT:
32218 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
32219 case ISD::FP_EXTEND:
32220 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
32221 case ISD::FP_ROUND:
32222 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
32223 case ISD::FP16_TO_FP:
32224 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
32225 case ISD::FP_TO_FP16:
32226 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
32227 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
32228 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
32229 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
32230 case ISD::FADD:
32231 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
32232 case ISD::FROUND: return LowerFROUND(Op, DAG);
32233 case ISD::FABS:
32234 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
32235 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
32236 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
32237 case ISD::LRINT:
32238 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
32239 case ISD::SETCC:
32240 case ISD::STRICT_FSETCC:
32241 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
32242 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
32243 case ISD::SELECT: return LowerSELECT(Op, DAG);
32244 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
32245 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
32246 case ISD::VASTART: return LowerVASTART(Op, DAG);
32247 case ISD::VAARG: return LowerVAARG(Op, DAG);
32248 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
32249 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
32250 case ISD::INTRINSIC_VOID:
32251 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
32252 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
32253 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
32254 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
32255 case ISD::FRAME_TO_ARGS_OFFSET:
32256 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
32257 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
32258 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
32259 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
32260 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
32261 case ISD::EH_SJLJ_SETUP_DISPATCH:
32262 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
32263 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
32264 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
32265 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
32266 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
32267 case ISD::GET_FPENV_MEM: return LowerGET_FPENV_MEM(Op, DAG);
32268 case ISD::SET_FPENV_MEM: return LowerSET_FPENV_MEM(Op, DAG);
32269 case ISD::RESET_FPENV: return LowerRESET_FPENV(Op, DAG);
32270 case ISD::CTLZ:
32271 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
32272 case ISD::CTTZ:
32273 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
32274 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
32275 case ISD::MULHS:
32276 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
32277 case ISD::ROTL:
32278 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
32279 case ISD::SRA:
32280 case ISD::SRL:
32281 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
32282 case ISD::SADDO:
32283 case ISD::UADDO:
32284 case ISD::SSUBO:
32285 case ISD::USUBO: return LowerXALUO(Op, DAG);
32286 case ISD::SMULO:
32287 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
32288 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
32289 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
32290 case ISD::SADDO_CARRY:
32291 case ISD::SSUBO_CARRY:
32292 case ISD::UADDO_CARRY:
32293 case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG);
32294 case ISD::ADD:
32295 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
32296 case ISD::UADDSAT:
32297 case ISD::SADDSAT:
32298 case ISD::USUBSAT:
32299 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
32300 case ISD::SMAX:
32301 case ISD::SMIN:
32302 case ISD::UMAX:
32303 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
32304 case ISD::FMINIMUM:
32305 case ISD::FMAXIMUM:
32306 return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG);
32307 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
32308 case ISD::ABDS:
32309 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
32310 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
32311 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
32312 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
32313 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
32314 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
32315 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
32316 case ISD::GC_TRANSITION_START:
32317 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
32318 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
32319 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
32320 case ISD::PREFETCH: return LowerPREFETCH(Op, Subtarget, DAG);
32321 // clang-format on
32322 }
32323}
32324
32325/// Replace a node with an illegal result type with a new node built out of
32326/// custom code.
32327 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
32328 SmallVectorImpl<SDValue> &Results,
32329 SelectionDAG &DAG) const {
32330 SDLoc dl(N);
32331 switch (N->getOpcode()) {
32332 default:
32333#ifndef NDEBUG
32334 dbgs() << "ReplaceNodeResults: ";
32335 N->dump(&DAG);
32336#endif
32337 llvm_unreachable("Do not know how to custom type legalize this operation!");
32338 case X86ISD::CVTPH2PS: {
32339 EVT VT = N->getValueType(0);
32340 SDValue Lo, Hi;
32341 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
32342 EVT LoVT, HiVT;
32343 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32344 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
32345 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
32346 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32347 Results.push_back(Res);
32348 return;
32349 }
32350 case X86ISD::STRICT_CVTPH2PS: {
32351 EVT VT = N->getValueType(0);
32352 SDValue Lo, Hi;
32353 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
32354 EVT LoVT, HiVT;
32355 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
32356 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
32357 {N->getOperand(0), Lo});
32358 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
32359 {N->getOperand(0), Hi});
32360 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32361 Lo.getValue(1), Hi.getValue(1));
32362 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32363 Results.push_back(Res);
32364 Results.push_back(Chain);
32365 return;
32366 }
32367 case X86ISD::CVTPS2PH:
32368 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
32369 return;
32370 case ISD::CTPOP: {
32371 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
32372 // If we have at most 32 active bits, then perform as i32 CTPOP.
32373 // TODO: Perform this in generic legalizer?
32374 KnownBits Known = DAG.computeKnownBits(N->getOperand(0));
32375 unsigned LZ = Known.countMinLeadingZeros();
32376 unsigned TZ = Known.countMinTrailingZeros();
32377 if ((LZ + TZ) >= 32) {
32378 SDValue Op = DAG.getNode(ISD::SRL, dl, MVT::i64, N->getOperand(0),
32379 DAG.getShiftAmountConstant(TZ, MVT::i64, dl));
32380 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Op);
32381 Op = DAG.getNode(ISD::CTPOP, dl, MVT::i32, Op);
32382 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Op);
32383 Results.push_back(Op);
32384 return;
32385 }
32386 // Use a v2i64 if possible.
32387 bool NoImplicitFloatOps =
32388 DAG.getMachineFunction().getFunction().hasFnAttribute(
32389 Attribute::NoImplicitFloat);
32390 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
32391 SDValue Wide =
32392 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
32393 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
32394 // Bit count should fit in 32-bits, extract it as that and then zero
32395 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
32396 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
32397 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
32398 DAG.getIntPtrConstant(0, dl));
32399 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
32400 Results.push_back(Wide);
32401 }
32402 return;
32403 }
32404 case ISD::MUL: {
32405 EVT VT = N->getValueType(0);
32406 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32407 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
32408 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
32409 // elements are needed.
32410 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
32411 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
32412 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
32413 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
32414 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32415 unsigned NumConcats = 16 / VT.getVectorNumElements();
32416 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32417 ConcatOps[0] = Res;
32418 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
32419 Results.push_back(Res);
32420 return;
32421 }
32422 case ISD::SMULO:
32423 case ISD::UMULO: {
32424 EVT VT = N->getValueType(0);
32425 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32426 VT == MVT::v2i32 && "Unexpected VT!");
32427 bool IsSigned = N->getOpcode() == ISD::SMULO;
32428 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
32429 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
32430 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
32431 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
32432 // Extract the high 32 bits from each result using PSHUFD.
32433 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
32434 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
32435 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
32436 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
32437 DAG.getIntPtrConstant(0, dl));
32438
32439 // Truncate the low bits of the result. This will become PSHUFD.
32440 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32441
32442 SDValue HiCmp;
32443 if (IsSigned) {
32444 // SMULO overflows if the high bits don't match the sign of the low.
32445 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
32446 } else {
32447 // UMULO overflows if the high bits are non-zero.
32448 HiCmp = DAG.getConstant(0, dl, VT);
32449 }
32450 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
32451
32452 // Widen the result by padding with undef.
32453 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32454 DAG.getUNDEF(VT));
32455 Results.push_back(Res);
32456 Results.push_back(Ovf);
32457 return;
32458 }
32459 case X86ISD::VPMADDWD: {
32460 // Legalize types for X86ISD::VPMADDWD by widening.
32461 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32462
32463 EVT VT = N->getValueType(0);
32464 EVT InVT = N->getOperand(0).getValueType();
32465 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
32466 "Expected a VT that divides into 128 bits.");
32467 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32468 "Unexpected type action!");
32469 unsigned NumConcat = 128 / InVT.getSizeInBits();
32470
32471 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
32472 InVT.getVectorElementType(),
32473 NumConcat * InVT.getVectorNumElements());
32474 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
32475 VT.getVectorElementType(),
32476 NumConcat * VT.getVectorNumElements());
32477
32478 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
32479 Ops[0] = N->getOperand(0);
32480 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32481 Ops[0] = N->getOperand(1);
32482 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
32483
32484 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
32485 Results.push_back(Res);
32486 return;
32487 }
32488 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
32489 case X86ISD::FMINC:
32490 case X86ISD::FMIN:
32491 case X86ISD::FMAXC:
32492 case X86ISD::FMAX: {
32493 EVT VT = N->getValueType(0);
32494 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
32495 SDValue UNDEF = DAG.getUNDEF(VT);
32496 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32497 N->getOperand(0), UNDEF);
32498 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
32499 N->getOperand(1), UNDEF);
32500 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
32501 return;
32502 }
32503 case ISD::SDIV:
32504 case ISD::UDIV:
32505 case ISD::SREM:
32506 case ISD::UREM: {
32507 EVT VT = N->getValueType(0);
32508 if (VT.isVector()) {
32509 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32510 "Unexpected type action!");
32511 // If this RHS is a constant splat vector we can widen this and let
32512 // division/remainder by constant optimize it.
32513 // TODO: Can we do something for non-splat?
32514 APInt SplatVal;
32515 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
32516 unsigned NumConcats = 128 / VT.getSizeInBits();
32517 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
32518 Ops0[0] = N->getOperand(0);
32519 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
32520 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
32521 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
32522 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
32523 Results.push_back(Res);
32524 }
32525 return;
32526 }
32527
32528 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
32529 Results.push_back(V);
32530 return;
32531 }
32532 case ISD::TRUNCATE: {
32533 MVT VT = N->getSimpleValueType(0);
32534 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
32535 return;
32536
32537 // The generic legalizer will try to widen the input type to the same
32538 // number of elements as the widened result type. But this isn't always
32539 // the best thing so do some custom legalization to avoid some cases.
32540 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
32541 SDValue In = N->getOperand(0);
32542 EVT InVT = In.getValueType();
32543 EVT InEltVT = InVT.getVectorElementType();
32544 EVT EltVT = VT.getVectorElementType();
32545 unsigned MinElts = VT.getVectorNumElements();
32546 unsigned WidenNumElts = WidenVT.getVectorNumElements();
32547 unsigned InBits = InVT.getSizeInBits();
32548
32549 // See if there are sufficient leading bits to perform a PACKUS/PACKSS.
32550 unsigned PackOpcode;
32551 if (SDValue Src =
32552 matchTruncateWithPACK(PackOpcode, VT, In, dl, DAG, Subtarget)) {
32553 if (SDValue Res = truncateVectorWithPACK(PackOpcode, VT, Src,
32554 dl, DAG, Subtarget)) {
32555 Res = widenSubVector(WidenVT, Res, false, Subtarget, DAG, dl);
32556 Results.push_back(Res);
32557 return;
32558 }
32559 }
32560
32561 if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
32562 // 128 bit and smaller inputs should avoid truncation altogether and
32563 // use a shuffle.
32564 if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
32565 int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
32566 SmallVector<int, 16> TruncMask(WidenNumElts, -1);
32567 for (unsigned I = 0; I < MinElts; ++I)
32568 TruncMask[I] = Scale * I;
32569 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
32570 assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
32571 "Illegal vector type in truncation");
32572 WidenIn = DAG.getBitcast(WidenVT, WidenIn);
32573 Results.push_back(
32574 DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
32575 return;
32576 }
32577 }
32578
32579 // With AVX512 there are some cases that can use a target specific
32580 // truncate node to go from 256/512 to less than 128 with zeros in the
32581 // upper elements of the 128 bit result.
32582 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
32583 // We can use VTRUNC directly if for 256 bits with VLX or for any 512.
32584 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
32585 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32586 return;
32587 }
32588 // There's one case we can widen to 512 bits and use VTRUNC.
32589 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
32590 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
32591 DAG.getUNDEF(MVT::v4i64));
32592 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
32593 return;
32594 }
32595 }
32596 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
32597 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
32598 isTypeLegal(MVT::v4i64)) {
32599 // Input needs to be split and output needs to be widened. Let's use two
32600 // VTRUNCs, and shuffle their results together into the wider type.
32601 SDValue Lo, Hi;
32602 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
32603
32604 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
32605 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
32606 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
32607 { 0, 1, 2, 3, 16, 17, 18, 19,
32608 -1, -1, -1, -1, -1, -1, -1, -1 });
32609 Results.push_back(Res);
32610 return;
32611 }
32612
32613 // Attempt to widen the truncation input vector to let LowerTRUNCATE handle
32614 // this via type legalization.
32615 if ((InEltVT == MVT::i16 || InEltVT == MVT::i32 || InEltVT == MVT::i64) &&
32616 (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32) &&
32617 (!Subtarget.hasSSSE3() ||
32618 (!isTypeLegal(InVT) &&
32619 !(MinElts <= 4 && InEltVT == MVT::i64 && EltVT == MVT::i8)))) {
32620 SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl,
32621 InEltVT.getSizeInBits() * WidenNumElts);
32622 Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, WidenVT, WidenIn));
32623 return;
32624 }
32625
32626 return;
32627 }
32628 case ISD::ANY_EXTEND:
32629 // Right now, only MVT::v8i8 has Custom action for an illegal type.
32630 // It's intended to custom handle the input type.
32631 assert(N->getValueType(0) == MVT::v8i8 &&
32632 "Do not know how to legalize this Node");
32633 return;
32634 case ISD::SIGN_EXTEND:
32635 case ISD::ZERO_EXTEND: {
32636 EVT VT = N->getValueType(0);
32637 SDValue In = N->getOperand(0);
32638 EVT InVT = In.getValueType();
32639 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
32640 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
32641 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
32642 "Unexpected type action!");
32643 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
32644 // Custom split this so we can extend i8/i16->i32 invec. This is better
32645 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using sra,
32646 // followed by an extend from i32 to i64 using pcmpgt. By custom splitting
32647 // we allow the sra from the extend to i32 to be shared by the split.
32648 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
32649
32650 // Fill a vector with sign bits for each element.
32651 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
32652 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
32653
32654 // Create an unpackl and unpackh to interleave the sign bits then bitcast
32655 // to v2i64.
32656 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32657 {0, 4, 1, 5});
32658 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
32659 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
32660 {2, 6, 3, 7});
32661 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
32662
32663 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32664 Results.push_back(Res);
32665 return;
32666 }
32667
32668 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
32669 if (!InVT.is128BitVector()) {
32670 // Not a 128 bit vector, but maybe type legalization will promote
32671 // it to 128 bits.
32672 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
32673 return;
32674 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
32675 if (!InVT.is128BitVector())
32676 return;
32677
32678 // Promote the input to 128 bits. Type legalization will turn this into
32679 // zext_inreg/sext_inreg.
32680 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
32681 }
32682
32683 // Perform custom splitting instead of the two stage extend we would get
32684 // by default.
32685 EVT LoVT, HiVT;
32686 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
32687 assert(isTypeLegal(LoVT) && "Split VT not legal?");
32688
32689 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
32690
32691 // We need to shift the input over by half the number of elements.
32692 unsigned NumElts = InVT.getVectorNumElements();
32693 unsigned HalfNumElts = NumElts / 2;
32694 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
32695 for (unsigned i = 0; i != HalfNumElts; ++i)
32696 ShufMask[i] = i + HalfNumElts;
32697
32698 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
32699 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
32700
32701 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
32702 Results.push_back(Res);
32703 }
32704 return;
32705 }
32706 case ISD::FP_TO_SINT:
32707 case ISD::STRICT_FP_TO_SINT:
32708 case ISD::FP_TO_UINT:
32709 case ISD::STRICT_FP_TO_UINT: {
32710 bool IsStrict = N->isStrictFPOpcode();
32711 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
32712 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
32713 EVT VT = N->getValueType(0);
32714 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32715 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
32716 EVT SrcVT = Src.getValueType();
32717
32718 SDValue Res;
32719 if (isSoftF16(SrcVT, Subtarget)) {
32720 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
32721 if (IsStrict) {
32722 Res =
32723 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
32724 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
32725 {NVT, MVT::Other}, {Chain, Src})});
32726 Chain = Res.getValue(1);
32727 } else {
32728 Res = DAG.getNode(N->getOpcode(), dl, VT,
32729 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
32730 }
32731 Results.push_back(Res);
32732 if (IsStrict)
32733 Results.push_back(Chain);
32734
32735 return;
32736 }
32737
32738 if (VT.isVector() && Subtarget.hasFP16() &&
32739 SrcVT.getVectorElementType() == MVT::f16) {
32740 EVT EleVT = VT.getVectorElementType();
32741 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
32742
32743 if (SrcVT != MVT::v8f16) {
32744 SDValue Tmp =
32745 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
32746 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
32747 Ops[0] = Src;
32748 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
32749 }
32750
32751 if (IsStrict) {
32752 unsigned Opc =
32753 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32754 Res =
32755 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
32756 Chain = Res.getValue(1);
32757 } else {
32758 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32759 Res = DAG.getNode(Opc, dl, ResVT, Src);
32760 }
32761
32762 // TODO: Need to add exception check code for strict FP.
32763 if (EleVT.getSizeInBits() < 16) {
32764 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
32765 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
32766
32767 // Now widen to 128 bits.
32768 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
32769 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
32770 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
32771 ConcatOps[0] = Res;
32772 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32773 }
32774
32775 Results.push_back(Res);
32776 if (IsStrict)
32777 Results.push_back(Chain);
32778
32779 return;
32780 }
32781
32782 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
32783 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32784 "Unexpected type action!");
32785
32786 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
32787 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
32788 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
32789 VT.getVectorNumElements());
32790 SDValue Res;
32791 SDValue Chain;
32792 if (IsStrict) {
32793 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
32794 {N->getOperand(0), Src});
32795 Chain = Res.getValue(1);
32796 } else
32797 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
32798
32799 // Preserve what we know about the size of the original result. If the
32800 // result is v2i32, we have to manually widen the assert.
32801 if (PromoteVT == MVT::v2i32)
32802 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
32803 DAG.getUNDEF(MVT::v2i32));
32804
32805 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
32806 Res.getValueType(), Res,
32807 DAG.getValueType(VT.getVectorElementType()));
32808
32809 if (PromoteVT == MVT::v2i32)
32810 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
32811 DAG.getIntPtrConstant(0, dl));
32812
32813 // Truncate back to the original width.
32814 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
32815
32816 // Now widen to 128 bits.
32817 unsigned NumConcats = 128 / VT.getSizeInBits();
32818 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
32819 VT.getVectorNumElements() * NumConcats);
32820 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
32821 ConcatOps[0] = Res;
32822 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
32823 Results.push_back(Res);
32824 if (IsStrict)
32825 Results.push_back(Chain);
32826 return;
32827 }
32828
32829
32830 if (VT == MVT::v2i32) {
32831 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
32832 "Strict unsigned conversion requires AVX512");
32833 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32834 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
32835 "Unexpected type action!");
32836 if (Src.getValueType() == MVT::v2f64) {
32837 if (!IsSigned && !Subtarget.hasAVX512()) {
32838 SDValue Res =
32839 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
32840 Results.push_back(Res);
32841 return;
32842 }
32843
32844 unsigned Opc;
32845 if (IsStrict)
32846 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32847 else
32848 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32849
32850 // If we have VLX we can emit a target specific FP_TO_UINT node.
32851 if (!IsSigned && !Subtarget.hasVLX()) {
32852 // Otherwise we can defer to the generic legalizer which will widen
32853 // the input as well. This will be further widened during op
32854 // legalization to v8i32<-v8f64.
32855 // For strict nodes we'll need to widen ourselves.
32856 // FIXME: Fix the type legalizer to safely widen strict nodes?
32857 if (!IsStrict)
32858 return;
32859 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
32860 DAG.getConstantFP(0.0, dl, MVT::v2f64));
32861 Opc = N->getOpcode();
32862 }
32863 SDValue Res;
32864 SDValue Chain;
32865 if (IsStrict) {
32866 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
32867 {N->getOperand(0), Src});
32868 Chain = Res.getValue(1);
32869 } else {
32870 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
32871 }
32872 Results.push_back(Res);
32873 if (IsStrict)
32874 Results.push_back(Chain);
32875 return;
32876 }
32877
32878 // Custom widen strict v2f32->v2i32 by padding with zeros.
32879 // FIXME: Should generic type legalizer do this?
32880 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
32881 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
32882 DAG.getConstantFP(0.0, dl, MVT::v2f32));
32883 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
32884 {N->getOperand(0), Src});
32885 Results.push_back(Res);
32886 Results.push_back(Res.getValue(1));
32887 return;
32888 }
32889
32890 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
32891 // so early out here.
32892 return;
32893 }
32894
32895 assert(!VT.isVector() && "Vectors should have been handled above!");
32896
32897 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
32898 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
32899 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
32900 assert(!Subtarget.is64Bit() && "i64 should be legal");
32901 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
32902 // If we use a 128-bit result we might need to use a target specific node.
32903 unsigned SrcElts =
32904 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
32905 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
32906 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
32907 unsigned Opc = N->getOpcode();
32908 if (NumElts != SrcElts) {
32909 if (IsStrict)
32910 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
32911 else
32912 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
32913 }
32914
32915 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
32916 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
32917 DAG.getConstantFP(0.0, dl, VecInVT), Src,
32918 ZeroIdx);
32919 SDValue Chain;
32920 if (IsStrict) {
32921 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
32922 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
32923 Chain = Res.getValue(1);
32924 } else
32925 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
32926 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
32927 Results.push_back(Res);
32928 if (IsStrict)
32929 Results.push_back(Chain);
32930 return;
32931 }
32932
32933 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
32934 SDValue Chain;
32935 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
32936 Results.push_back(V);
32937 if (IsStrict)
32938 Results.push_back(Chain);
32939 return;
32940 }
32941
32942 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
32943 Results.push_back(V);
32944 if (IsStrict)
32945 Results.push_back(Chain);
32946 }
32947 return;
32948 }
32949 case ISD::LRINT:
32950 case ISD::LLRINT: {
32951 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
32952 Results.push_back(V);
32953 return;
32954 }
32955
32956 case ISD::SINT_TO_FP:
32957 case ISD::STRICT_SINT_TO_FP:
32958 case ISD::UINT_TO_FP:
32959 case ISD::STRICT_UINT_TO_FP: {
32960 bool IsStrict = N->isStrictFPOpcode();
32961 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
32962 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
32963 EVT VT = N->getValueType(0);
32964 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
32965 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
32966 Subtarget.hasVLX()) {
32967 if (Src.getValueType().getVectorElementType() == MVT::i16)
32968 return;
32969
32970 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
32971 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
32972 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
32973 : DAG.getUNDEF(MVT::v2i32));
32974 if (IsStrict) {
32975 unsigned Opc =
32976 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
32977 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
32978 {N->getOperand(0), Src});
32979 Results.push_back(Res);
32980 Results.push_back(Res.getValue(1));
32981 } else {
32982 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
32983 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
32984 }
32985 return;
32986 }
32987 if (VT != MVT::v2f32)
32988 return;
32989 EVT SrcVT = Src.getValueType();
32990 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
32991 if (IsStrict) {
32992 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
32993 : X86ISD::STRICT_CVTUI2P;
32994 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
32995 {N->getOperand(0), Src});
32996 Results.push_back(Res);
32997 Results.push_back(Res.getValue(1));
32998 } else {
32999 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33000 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33001 }
33002 return;
33003 }
33004 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33005 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
33006 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33007 SDValue One = DAG.getConstant(1, dl, SrcVT);
33008 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33009 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33010 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33011 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33012 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33013 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33014 for (int i = 0; i != 2; ++i) {
33015 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33016 SignSrc, DAG.getIntPtrConstant(i, dl));
33017 if (IsStrict)
33018 SignCvts[i] =
33019 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33020 {N->getOperand(0), Elt});
33021 else
33022 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33023 };
33024 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33025 SDValue Slow, Chain;
33026 if (IsStrict) {
33027 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33028 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33029 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33030 {Chain, SignCvt, SignCvt});
33031 Chain = Slow.getValue(1);
33032 } else {
33033 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33034 }
33035 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33036 IsNeg =
33037 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33038 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33039 Results.push_back(Cvt);
33040 if (IsStrict)
33041 Results.push_back(Chain);
33042 return;
33043 }
33044
33045 if (SrcVT != MVT::v2i32)
33046 return;
33047
33048 if (IsSigned || Subtarget.hasAVX512()) {
33049 if (!IsStrict)
33050 return;
33051
33052 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33053 // FIXME: Should generic type legalizer do this?
33054 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33055 DAG.getConstant(0, dl, MVT::v2i32));
33056 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33057 {N->getOperand(0), Src});
33058 Results.push_back(Res);
33059 Results.push_back(Res.getValue(1));
33060 return;
33061 }
33062
33063 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33064 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
33065 SDValue VBias = DAG.getConstantFP(
33066 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
33067 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
33068 DAG.getBitcast(MVT::v2i64, VBias));
33069 Or = DAG.getBitcast(MVT::v2f64, Or);
33070 if (IsStrict) {
33071 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
33072 {N->getOperand(0), Or, VBias});
33073 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
33074 {MVT::v4f32, MVT::Other},
33075 {Sub.getValue(1), Sub});
33076 Results.push_back(Res);
33077 Results.push_back(Res.getValue(1));
33078 } else {
33079 // TODO: Are there any fast-math-flags to propagate here?
33080 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
33081 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
33082 }
33083 return;
33084 }
33085 case ISD::STRICT_FP_ROUND:
33086 case ISD::FP_ROUND: {
33087 bool IsStrict = N->isStrictFPOpcode();
33088 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33089 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33090 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
33091 EVT SrcVT = Src.getValueType();
33092 EVT VT = N->getValueType(0);
33093 SDValue V;
33094 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
33095 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
33096 : DAG.getUNDEF(MVT::v2f32);
33097 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
33098 }
33099 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
33100 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
33101 if (SrcVT.getVectorElementType() != MVT::f32)
33102 return;
33103
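 // With F16C but no AVX512-FP16, lower the f32->f16 rounding to
 // (STRICT_)CVTPS2PH, which produces its result in an integer v8i16 register
 // that is then bitcast back to v8f16.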
33104 if (IsStrict)
33105 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
33106 {Chain, Src, Rnd});
33107 else
33108 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
33109
33110 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
33111 if (IsStrict)
33112 Results.push_back(V.getValue(1));
33113 return;
33114 }
33115 if (!isTypeLegal(Src.getValueType()))
33116 return;
33117 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
33118 if (IsStrict)
33119 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
33120 {Chain, Src});
33121 else
33122 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
33123 Results.push_back(V);
33124 if (IsStrict)
33125 Results.push_back(V.getValue(1));
33126 return;
33127 }
33128 case ISD::FP_EXTEND:
33129 case ISD::STRICT_FP_EXTEND: {
33130 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
33131 // No other ValueType for FP_EXTEND should reach this point.
33132 assert(N->getValueType(0) == MVT::v2f32 &&
33133 "Do not know how to legalize this Node");
33134 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
33135 return;
33136 bool IsStrict = N->isStrictFPOpcode();
33137 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33138 if (Src.getValueType().getVectorElementType() != MVT::f16)
33139 return;
33140 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
33141 : DAG.getUNDEF(MVT::v2f16);
33142 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
33143 if (IsStrict)
33144 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
33145 {N->getOperand(0), V});
33146 else
33147 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
33148 Results.push_back(V);
33149 if (IsStrict)
33150 Results.push_back(V.getValue(1));
33151 return;
33152 }
33153 case ISD::INTRINSIC_W_CHAIN: {
33154 unsigned IntNo = N->getConstantOperandVal(1);
33155 switch (IntNo) {
33156 default : llvm_unreachable("Do not know how to custom type "
33157 "legalize this intrinsic operation!");
33158 case Intrinsic::x86_rdtsc:
33159 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
33160 Results);
33161 case Intrinsic::x86_rdtscp:
33162 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
33163 Results);
33164 case Intrinsic::x86_rdpmc:
33165 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
33166 Results);
33167 return;
33168 case Intrinsic::x86_rdpru:
33169 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
33170 Results);
33171 return;
33172 case Intrinsic::x86_xgetbv:
33173 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
33174 Results);
33175 return;
33176 }
33177 }
33178 case ISD::READCYCLECOUNTER: {
33179 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
33180 }
33181 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
33182 EVT T = N->getValueType(0);
33183 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
33184 bool Regs64bit = T == MVT::i128;
33185 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
33186 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
33187 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
33188 SDValue cpInL, cpInH;
33189 std::tie(cpInL, cpInH) =
33190 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
33191 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
33192 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
33193 cpInH =
33194 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
33195 cpInH, cpInL.getValue(1));
33196 SDValue swapInL, swapInH;
33197 std::tie(swapInL, swapInH) =
33198 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
33199 swapInH =
33200 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
33201 swapInH, cpInH.getValue(1));
33202
33203 // In 64-bit mode we might need the base pointer in RBX, but we can't know
33204 // until later. So we keep the RBX input in a vreg and use a custom
33205 // inserter.
33206 // Since RBX will be a reserved register, the register allocator will not
33207 // ensure that its value is properly saved and restored around this
33208 // live range.
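 // CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) with the memory operand and,
 // on a match, store ECX:EBX (RCX:RBX); the previous memory value comes back in
 // EDX:EAX (RDX:RAX) and ZF reports whether the exchange succeeded, which is
 // what the COND_E SETCC below consumes.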
33209 SDValue Result;
33210 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
33211 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
33212 if (Regs64bit) {
33213 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
33214 swapInH.getValue(1)};
33215 Result =
33216 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
33217 } else {
33218 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
33219 swapInH.getValue(1));
33220 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
33221 swapInL.getValue(1)};
33222 Result =
33223 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
33224 }
33225
33226 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
33227 Regs64bit ? X86::RAX : X86::EAX,
33228 HalfT, Result.getValue(1));
33229 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
33230 Regs64bit ? X86::RDX : X86::EDX,
33231 HalfT, cpOutL.getValue(2));
33232 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
33233
33234 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
33235 MVT::i32, cpOutH.getValue(2));
33236 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
33237 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
33238
33239 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
33240 Results.push_back(Success);
33241 Results.push_back(EFLAGS.getValue(1));
33242 return;
33243 }
33244 case ISD::ATOMIC_LOAD: {
33245 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33246 bool NoImplicitFloatOps =
33247 DAG.getMachineFunction().getFunction().hasFnAttribute(
33248 Attribute::NoImplicitFloat);
33249 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33250 auto *Node = cast<AtomicSDNode>(N);
33251 if (Subtarget.hasSSE1()) {
33252 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
33253 // Then extract the lower 64-bits.
33254 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33255 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
33256 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33257 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33258 MVT::i64, Node->getMemOperand());
33259 if (Subtarget.hasSSE2()) {
33260 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
33261 DAG.getIntPtrConstant(0, dl));
33262 Results.push_back(Res);
33263 Results.push_back(Ld.getValue(1));
33264 return;
33265 }
33266 // We use an alternative sequence for SSE1 that extracts as v2f32 and
33267 // then casts to i64. This avoids a 128-bit stack temporary being
33268 // created by type legalization if we were to cast v4f32->v2i64.
33269 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
33270 DAG.getIntPtrConstant(0, dl));
33271 Res = DAG.getBitcast(MVT::i64, Res);
33272 Results.push_back(Res);
33273 Results.push_back(Ld.getValue(1));
33274 return;
33275 }
33276 if (Subtarget.hasX87()) {
33277 // First load this into an 80-bit X87 register. This will put the whole
33278 // integer into the significand.
33279 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33280 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
33281 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
33282 dl, Tys, Ops, MVT::i64,
33283 Node->getMemOperand());
33284 SDValue Chain = Result.getValue(1);
33285
33286 // Now store the X87 register to a stack temporary and convert to i64.
33287 // This store is not atomic and doesn't need to be.
33288 // FIXME: We don't need a stack temporary if the result of the load
33289 // is already being stored. We could just directly store there.
33290 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33291 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33292 MachinePointerInfo MPI =
33293 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33294 SDValue StoreOps[] = { Chain, Result, StackPtr };
33295 Chain = DAG.getMemIntrinsicNode(
33296 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
33297 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
33298
33299 // Finally load the value back from the stack temporary and return it.
33300 // This load is not atomic and doesn't need to be.
33301 // This load will be further type legalized.
33302 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
33303 Results.push_back(Result);
33304 Results.push_back(Result.getValue(1));
33305 return;
33306 }
33307 }
33308 // TODO: Use MOVLPS when SSE1 is available?
33309 // Delegate to generic TypeLegalization. Situations we can really handle
33310 // should have already been dealt with by AtomicExpandPass.cpp.
33311 break;
33312 }
33313 case ISD::ATOMIC_SWAP:
33324 // Delegate to generic TypeLegalization. Situations we can really handle
33325 // should have already been dealt with by AtomicExpandPass.cpp.
33326 break;
33327
33328 case ISD::BITCAST: {
33329 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33330 EVT DstVT = N->getValueType(0);
33331 EVT SrcVT = N->getOperand(0).getValueType();
33332
33333 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
33334 // we can split using the k-register rather than memory.
33335 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
33336 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
33337 SDValue Lo, Hi;
33338 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33339 Lo = DAG.getBitcast(MVT::i32, Lo);
33340 Hi = DAG.getBitcast(MVT::i32, Hi);
33341 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
33342 Results.push_back(Res);
33343 return;
33344 }
33345
33346 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
33347 // FIXME: Use v4f32 for SSE1?
33348 assert(Subtarget.hasSSE2() && "Requires SSE2");
33349 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
33350 "Unexpected type action!");
33351 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
33352 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
33353 N->getOperand(0));
33354 Res = DAG.getBitcast(WideVT, Res);
33355 Results.push_back(Res);
33356 return;
33357 }
33358
33359 return;
33360 }
33361 case ISD::MGATHER: {
33362 EVT VT = N->getValueType(0);
33363 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
33364 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
33365 auto *Gather = cast<MaskedGatherSDNode>(N);
33366 SDValue Index = Gather->getIndex();
33367 if (Index.getValueType() != MVT::v2i64)
33368 return;
33370 "Unexpected type action!");
33371 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33372 SDValue Mask = Gather->getMask();
33373 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33374 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
33375 Gather->getPassThru(),
33376 DAG.getUNDEF(VT));
33377 if (!Subtarget.hasVLX()) {
33378 // We need to widen the mask, but the instruction will only use 2
33379 // of its elements. So we can use undef.
33380 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
33381 DAG.getUNDEF(MVT::v2i1));
33382 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
33383 }
33384 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
33385 Gather->getBasePtr(), Index, Gather->getScale() };
33386 SDValue Res = DAG.getMemIntrinsicNode(
33387 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
33388 Gather->getMemoryVT(), Gather->getMemOperand());
33389 Results.push_back(Res);
33390 Results.push_back(Res.getValue(1));
33391 return;
33392 }
33393 return;
33394 }
33395 case ISD::LOAD: {
33396 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
33397 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
33398 // cast since type legalization will try to use an i64 load.
33399 MVT VT = N->getSimpleValueType(0);
33400 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
33402 "Unexpected type action!");
33403 if (!ISD::isNON_EXTLoad(N))
33404 return;
33405 auto *Ld = cast<LoadSDNode>(N);
33406 if (Subtarget.hasSSE2()) {
33407 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
33408 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
33409 Ld->getPointerInfo(), Ld->getOriginalAlign(),
33410 Ld->getMemOperand()->getFlags());
33411 SDValue Chain = Res.getValue(1);
33412 MVT VecVT = MVT::getVectorVT(LdVT, 2);
33413 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
33414 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
33415 Res = DAG.getBitcast(WideVT, Res);
33416 Results.push_back(Res);
33417 Results.push_back(Chain);
33418 return;
33419 }
33420 assert(Subtarget.hasSSE1() && "Expected SSE");
33421 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
33422 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
33423 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
33424 MVT::i64, Ld->getMemOperand());
33425 Results.push_back(Res);
33426 Results.push_back(Res.getValue(1));
33427 return;
33428 }
33429 case ISD::ADDRSPACECAST: {
33430 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
33431 Results.push_back(V);
33432 return;
33433 }
33434 case ISD::BITREVERSE: {
33435 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33436 assert(Subtarget.hasXOP() && "Expected XOP");
33437 // We can use VPPERM by copying to a vector register and back. We'll need
33438 // to move the scalar in two i32 pieces.
33439 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
33440 return;
33441 }
33442 case ISD::EXTRACT_VECTOR_ELT: {
33443 // f16 = extract vXf16 %vec, i64 %idx
33444 assert(N->getSimpleValueType(0) == MVT::f16 &&
33445 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
33446 assert(Subtarget.hasFP16() && "Expected FP16");
33447 SDValue VecOp = N->getOperand(0);
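 // Extract the f16 element through the equivalent integer vector type: bitcast
 // to a vXi16 vector, extract the lane as i16, then bitcast the scalar back to
 // f16.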
33448 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
33449 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
33450 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
33451 N->getOperand(1));
33452 Split = DAG.getBitcast(MVT::f16, Split);
33453 Results.push_back(Split);
33454 return;
33455 }
33456 }
33457}
33458
33459const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
33460 switch ((X86ISD::NodeType)Opcode) {
33461 case X86ISD::FIRST_NUMBER: break;
33462#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
33463 NODE_NAME_CASE(BSF)
33464 NODE_NAME_CASE(BSR)
33465 NODE_NAME_CASE(FSHL)
33466 NODE_NAME_CASE(FSHR)
33467 NODE_NAME_CASE(FAND)
33468 NODE_NAME_CASE(FANDN)
33469 NODE_NAME_CASE(FOR)
33470 NODE_NAME_CASE(FXOR)
33471 NODE_NAME_CASE(FILD)
33472 NODE_NAME_CASE(FIST)
33473 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
33474 NODE_NAME_CASE(FLD)
33475 NODE_NAME_CASE(FST)
33476 NODE_NAME_CASE(CALL)
33477 NODE_NAME_CASE(CALL_RVMARKER)
33479 NODE_NAME_CASE(CMP)
33480 NODE_NAME_CASE(FCMP)
33481 NODE_NAME_CASE(STRICT_FCMP)
33482 NODE_NAME_CASE(STRICT_FCMPS)
33484 NODE_NAME_CASE(UCOMI)
33485 NODE_NAME_CASE(CMPM)
33486 NODE_NAME_CASE(CMPMM)
33487 NODE_NAME_CASE(STRICT_CMPM)
33488 NODE_NAME_CASE(CMPMM_SAE)
33489 NODE_NAME_CASE(SETCC)
33490 NODE_NAME_CASE(SETCC_CARRY)
33491 NODE_NAME_CASE(FSETCC)
33492 NODE_NAME_CASE(FSETCCM)
33493 NODE_NAME_CASE(FSETCCM_SAE)
33494 NODE_NAME_CASE(CMOV)
33495 NODE_NAME_CASE(BRCOND)
33496 NODE_NAME_CASE(RET_GLUE)
33497 NODE_NAME_CASE(IRET)
33498 NODE_NAME_CASE(REP_STOS)
33499 NODE_NAME_CASE(REP_MOVS)
33500 NODE_NAME_CASE(GlobalBaseReg)
33502 NODE_NAME_CASE(WrapperRIP)
33503 NODE_NAME_CASE(MOVQ2DQ)
33504 NODE_NAME_CASE(MOVDQ2Q)
33505 NODE_NAME_CASE(MMX_MOVD2W)
33506 NODE_NAME_CASE(MMX_MOVW2D)
33507 NODE_NAME_CASE(PEXTRB)
33508 NODE_NAME_CASE(PEXTRW)
33509 NODE_NAME_CASE(INSERTPS)
33510 NODE_NAME_CASE(PINSRB)
33511 NODE_NAME_CASE(PINSRW)
33512 NODE_NAME_CASE(PSHUFB)
33513 NODE_NAME_CASE(ANDNP)
33514 NODE_NAME_CASE(BLENDI)
33516 NODE_NAME_CASE(HADD)
33517 NODE_NAME_CASE(HSUB)
33518 NODE_NAME_CASE(FHADD)
33519 NODE_NAME_CASE(FHSUB)
33520 NODE_NAME_CASE(CONFLICT)
33521 NODE_NAME_CASE(FMAX)
33522 NODE_NAME_CASE(FMAXS)
33523 NODE_NAME_CASE(FMAX_SAE)
33524 NODE_NAME_CASE(FMAXS_SAE)
33525 NODE_NAME_CASE(FMIN)
33526 NODE_NAME_CASE(FMINS)
33527 NODE_NAME_CASE(FMIN_SAE)
33528 NODE_NAME_CASE(FMINS_SAE)
33529 NODE_NAME_CASE(FMAXC)
33530 NODE_NAME_CASE(FMINC)
33531 NODE_NAME_CASE(FRSQRT)
33532 NODE_NAME_CASE(FRCP)
33533 NODE_NAME_CASE(EXTRQI)
33534 NODE_NAME_CASE(INSERTQI)
33535 NODE_NAME_CASE(TLSADDR)
33536 NODE_NAME_CASE(TLSBASEADDR)
33537 NODE_NAME_CASE(TLSCALL)
33538 NODE_NAME_CASE(TLSDESC)
33539 NODE_NAME_CASE(EH_SJLJ_SETJMP)
33540 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
33541 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
33542 NODE_NAME_CASE(EH_RETURN)
33543 NODE_NAME_CASE(TC_RETURN)
33544 NODE_NAME_CASE(FNSTCW16m)
33545 NODE_NAME_CASE(FLDCW16m)
33546 NODE_NAME_CASE(FNSTENVm)
33547 NODE_NAME_CASE(FLDENVm)
33548 NODE_NAME_CASE(LCMPXCHG_DAG)
33549 NODE_NAME_CASE(LCMPXCHG8_DAG)
33550 NODE_NAME_CASE(LCMPXCHG16_DAG)
33551 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
33552 NODE_NAME_CASE(LADD)
33553 NODE_NAME_CASE(LSUB)
33554 NODE_NAME_CASE(LOR)
33555 NODE_NAME_CASE(LXOR)
33556 NODE_NAME_CASE(LAND)
33557 NODE_NAME_CASE(LBTS)
33558 NODE_NAME_CASE(LBTC)
33559 NODE_NAME_CASE(LBTR)
33560 NODE_NAME_CASE(LBTS_RM)
33561 NODE_NAME_CASE(LBTC_RM)
33562 NODE_NAME_CASE(LBTR_RM)
33563 NODE_NAME_CASE(AADD)
33564 NODE_NAME_CASE(AOR)
33565 NODE_NAME_CASE(AXOR)
33566 NODE_NAME_CASE(AAND)
33567 NODE_NAME_CASE(VZEXT_MOVL)
33568 NODE_NAME_CASE(VZEXT_LOAD)
33569 NODE_NAME_CASE(VEXTRACT_STORE)
33570 NODE_NAME_CASE(VTRUNC)
33571 NODE_NAME_CASE(VTRUNCS)
33572 NODE_NAME_CASE(VTRUNCUS)
33573 NODE_NAME_CASE(VMTRUNC)
33574 NODE_NAME_CASE(VMTRUNCS)
33575 NODE_NAME_CASE(VMTRUNCUS)
33576 NODE_NAME_CASE(VTRUNCSTORES)
33577 NODE_NAME_CASE(VTRUNCSTOREUS)
33578 NODE_NAME_CASE(VMTRUNCSTORES)
33579 NODE_NAME_CASE(VMTRUNCSTOREUS)
33580 NODE_NAME_CASE(VFPEXT)
33581 NODE_NAME_CASE(STRICT_VFPEXT)
33582 NODE_NAME_CASE(VFPEXT_SAE)
33583 NODE_NAME_CASE(VFPEXTS)
33584 NODE_NAME_CASE(VFPEXTS_SAE)
33585 NODE_NAME_CASE(VFPROUND)
33586 NODE_NAME_CASE(STRICT_VFPROUND)
33587 NODE_NAME_CASE(VMFPROUND)
33588 NODE_NAME_CASE(VFPROUND_RND)
33589 NODE_NAME_CASE(VFPROUNDS)
33590 NODE_NAME_CASE(VFPROUNDS_RND)
33591 NODE_NAME_CASE(VSHLDQ)
33592 NODE_NAME_CASE(VSRLDQ)
33593 NODE_NAME_CASE(VSHL)
33594 NODE_NAME_CASE(VSRL)
33595 NODE_NAME_CASE(VSRA)
33596 NODE_NAME_CASE(VSHLI)
33597 NODE_NAME_CASE(VSRLI)
33598 NODE_NAME_CASE(VSRAI)
33599 NODE_NAME_CASE(VSHLV)
33600 NODE_NAME_CASE(VSRLV)
33601 NODE_NAME_CASE(VSRAV)
33602 NODE_NAME_CASE(VROTLI)
33603 NODE_NAME_CASE(VROTRI)
33604 NODE_NAME_CASE(VPPERM)
33605 NODE_NAME_CASE(CMPP)
33606 NODE_NAME_CASE(STRICT_CMPP)
33607 NODE_NAME_CASE(PCMPEQ)
33608 NODE_NAME_CASE(PCMPGT)
33609 NODE_NAME_CASE(PHMINPOS)
33610 NODE_NAME_CASE(ADD)
33611 NODE_NAME_CASE(SUB)
33612 NODE_NAME_CASE(ADC)
33613 NODE_NAME_CASE(SBB)
33614 NODE_NAME_CASE(SMUL)
33615 NODE_NAME_CASE(UMUL)
33616 NODE_NAME_CASE(OR)
33617 NODE_NAME_CASE(XOR)
33618 NODE_NAME_CASE(AND)
33619 NODE_NAME_CASE(BEXTR)
33621 NODE_NAME_CASE(BZHI)
33622 NODE_NAME_CASE(PDEP)
33623 NODE_NAME_CASE(PEXT)
33624 NODE_NAME_CASE(MUL_IMM)
33625 NODE_NAME_CASE(MOVMSK)
33626 NODE_NAME_CASE(PTEST)
33627 NODE_NAME_CASE(TESTP)
33628 NODE_NAME_CASE(KORTEST)
33629 NODE_NAME_CASE(KTEST)
33630 NODE_NAME_CASE(KADD)
33631 NODE_NAME_CASE(KSHIFTL)
33632 NODE_NAME_CASE(KSHIFTR)
33633 NODE_NAME_CASE(PACKSS)
33634 NODE_NAME_CASE(PACKUS)
33635 NODE_NAME_CASE(PALIGNR)
33636 NODE_NAME_CASE(VALIGN)
33637 NODE_NAME_CASE(VSHLD)
33638 NODE_NAME_CASE(VSHRD)
33639 NODE_NAME_CASE(VSHLDV)
33640 NODE_NAME_CASE(VSHRDV)
33641 NODE_NAME_CASE(PSHUFD)
33642 NODE_NAME_CASE(PSHUFHW)
33643 NODE_NAME_CASE(PSHUFLW)
33644 NODE_NAME_CASE(SHUFP)
33645 NODE_NAME_CASE(SHUF128)
33646 NODE_NAME_CASE(MOVLHPS)
33647 NODE_NAME_CASE(MOVHLPS)
33648 NODE_NAME_CASE(MOVDDUP)
33649 NODE_NAME_CASE(MOVSHDUP)
33650 NODE_NAME_CASE(MOVSLDUP)
33651 NODE_NAME_CASE(MOVSD)
33652 NODE_NAME_CASE(MOVSS)
33653 NODE_NAME_CASE(MOVSH)
33654 NODE_NAME_CASE(UNPCKL)
33655 NODE_NAME_CASE(UNPCKH)
33656 NODE_NAME_CASE(VBROADCAST)
33657 NODE_NAME_CASE(VBROADCAST_LOAD)
33658 NODE_NAME_CASE(VBROADCASTM)
33659 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
33660 NODE_NAME_CASE(VPERMILPV)
33661 NODE_NAME_CASE(VPERMILPI)
33662 NODE_NAME_CASE(VPERM2X128)
33663 NODE_NAME_CASE(VPERMV)
33664 NODE_NAME_CASE(VPERMV3)
33665 NODE_NAME_CASE(VPERMI)
33666 NODE_NAME_CASE(VPTERNLOG)
33667 NODE_NAME_CASE(VFIXUPIMM)
33668 NODE_NAME_CASE(VFIXUPIMM_SAE)
33669 NODE_NAME_CASE(VFIXUPIMMS)
33670 NODE_NAME_CASE(VFIXUPIMMS_SAE)
33671 NODE_NAME_CASE(VRANGE)
33672 NODE_NAME_CASE(VRANGE_SAE)
33673 NODE_NAME_CASE(VRANGES)
33674 NODE_NAME_CASE(VRANGES_SAE)
33675 NODE_NAME_CASE(PMULUDQ)
33676 NODE_NAME_CASE(PMULDQ)
33677 NODE_NAME_CASE(PSADBW)
33678 NODE_NAME_CASE(DBPSADBW)
33679 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
33680 NODE_NAME_CASE(VAARG_64)
33681 NODE_NAME_CASE(VAARG_X32)
33682 NODE_NAME_CASE(DYN_ALLOCA)
33683 NODE_NAME_CASE(MFENCE)
33684 NODE_NAME_CASE(SEG_ALLOCA)
33685 NODE_NAME_CASE(PROBED_ALLOCA)
33688 NODE_NAME_CASE(RDPKRU)
33689 NODE_NAME_CASE(WRPKRU)
33690 NODE_NAME_CASE(VPMADDUBSW)
33691 NODE_NAME_CASE(VPMADDWD)
33692 NODE_NAME_CASE(VPSHA)
33693 NODE_NAME_CASE(VPSHL)
33694 NODE_NAME_CASE(VPCOM)
33695 NODE_NAME_CASE(VPCOMU)
33696 NODE_NAME_CASE(VPERMIL2)
33698 NODE_NAME_CASE(STRICT_FMSUB)
33700 NODE_NAME_CASE(STRICT_FNMADD)
33702 NODE_NAME_CASE(STRICT_FNMSUB)
33703 NODE_NAME_CASE(FMADDSUB)
33704 NODE_NAME_CASE(FMSUBADD)
33705 NODE_NAME_CASE(FMADD_RND)
33706 NODE_NAME_CASE(FNMADD_RND)
33707 NODE_NAME_CASE(FMSUB_RND)
33708 NODE_NAME_CASE(FNMSUB_RND)
33709 NODE_NAME_CASE(FMADDSUB_RND)
33710 NODE_NAME_CASE(FMSUBADD_RND)
33711 NODE_NAME_CASE(VFMADDC)
33712 NODE_NAME_CASE(VFMADDC_RND)
33713 NODE_NAME_CASE(VFCMADDC)
33714 NODE_NAME_CASE(VFCMADDC_RND)
33715 NODE_NAME_CASE(VFMULC)
33716 NODE_NAME_CASE(VFMULC_RND)
33717 NODE_NAME_CASE(VFCMULC)
33718 NODE_NAME_CASE(VFCMULC_RND)
33719 NODE_NAME_CASE(VFMULCSH)
33720 NODE_NAME_CASE(VFMULCSH_RND)
33721 NODE_NAME_CASE(VFCMULCSH)
33722 NODE_NAME_CASE(VFCMULCSH_RND)
33723 NODE_NAME_CASE(VFMADDCSH)
33724 NODE_NAME_CASE(VFMADDCSH_RND)
33725 NODE_NAME_CASE(VFCMADDCSH)
33726 NODE_NAME_CASE(VFCMADDCSH_RND)
33727 NODE_NAME_CASE(VPMADD52H)
33728 NODE_NAME_CASE(VPMADD52L)
33729 NODE_NAME_CASE(VRNDSCALE)
33730 NODE_NAME_CASE(STRICT_VRNDSCALE)
33731 NODE_NAME_CASE(VRNDSCALE_SAE)
33732 NODE_NAME_CASE(VRNDSCALES)
33733 NODE_NAME_CASE(VRNDSCALES_SAE)
33734 NODE_NAME_CASE(VREDUCE)
33735 NODE_NAME_CASE(VREDUCE_SAE)
33736 NODE_NAME_CASE(VREDUCES)
33737 NODE_NAME_CASE(VREDUCES_SAE)
33738 NODE_NAME_CASE(VGETMANT)
33739 NODE_NAME_CASE(VGETMANT_SAE)
33740 NODE_NAME_CASE(VGETMANTS)
33741 NODE_NAME_CASE(VGETMANTS_SAE)
33742 NODE_NAME_CASE(PCMPESTR)
33743 NODE_NAME_CASE(PCMPISTR)
33745 NODE_NAME_CASE(COMPRESS)
33747 NODE_NAME_CASE(SELECTS)
33748 NODE_NAME_CASE(ADDSUB)
33749 NODE_NAME_CASE(RCP14)
33750 NODE_NAME_CASE(RCP14S)
33751 NODE_NAME_CASE(RCP28)
33752 NODE_NAME_CASE(RCP28_SAE)
33753 NODE_NAME_CASE(RCP28S)
33754 NODE_NAME_CASE(RCP28S_SAE)
33755 NODE_NAME_CASE(EXP2)
33756 NODE_NAME_CASE(EXP2_SAE)
33757 NODE_NAME_CASE(RSQRT14)
33758 NODE_NAME_CASE(RSQRT14S)
33759 NODE_NAME_CASE(RSQRT28)
33760 NODE_NAME_CASE(RSQRT28_SAE)
33761 NODE_NAME_CASE(RSQRT28S)
33762 NODE_NAME_CASE(RSQRT28S_SAE)
33763 NODE_NAME_CASE(FADD_RND)
33764 NODE_NAME_CASE(FADDS)
33765 NODE_NAME_CASE(FADDS_RND)
33766 NODE_NAME_CASE(FSUB_RND)
33767 NODE_NAME_CASE(FSUBS)
33768 NODE_NAME_CASE(FSUBS_RND)
33769 NODE_NAME_CASE(FMUL_RND)
33770 NODE_NAME_CASE(FMULS)
33771 NODE_NAME_CASE(FMULS_RND)
33772 NODE_NAME_CASE(FDIV_RND)
33773 NODE_NAME_CASE(FDIVS)
33774 NODE_NAME_CASE(FDIVS_RND)
33775 NODE_NAME_CASE(FSQRT_RND)
33776 NODE_NAME_CASE(FSQRTS)
33777 NODE_NAME_CASE(FSQRTS_RND)
33778 NODE_NAME_CASE(FGETEXP)
33779 NODE_NAME_CASE(FGETEXP_SAE)
33780 NODE_NAME_CASE(FGETEXPS)
33781 NODE_NAME_CASE(FGETEXPS_SAE)
33782 NODE_NAME_CASE(SCALEF)
33783 NODE_NAME_CASE(SCALEF_RND)
33784 NODE_NAME_CASE(SCALEFS)
33785 NODE_NAME_CASE(SCALEFS_RND)
33786 NODE_NAME_CASE(MULHRS)
33787 NODE_NAME_CASE(SINT_TO_FP_RND)
33788 NODE_NAME_CASE(UINT_TO_FP_RND)
33789 NODE_NAME_CASE(CVTTP2SI)
33790 NODE_NAME_CASE(CVTTP2UI)
33791 NODE_NAME_CASE(STRICT_CVTTP2SI)
33792 NODE_NAME_CASE(STRICT_CVTTP2UI)
33793 NODE_NAME_CASE(MCVTTP2SI)
33794 NODE_NAME_CASE(MCVTTP2UI)
33795 NODE_NAME_CASE(CVTTP2SI_SAE)
33796 NODE_NAME_CASE(CVTTP2UI_SAE)
33797 NODE_NAME_CASE(CVTTS2SI)
33798 NODE_NAME_CASE(CVTTS2UI)
33799 NODE_NAME_CASE(CVTTS2SI_SAE)
33800 NODE_NAME_CASE(CVTTS2UI_SAE)
33801 NODE_NAME_CASE(CVTSI2P)
33802 NODE_NAME_CASE(CVTUI2P)
33803 NODE_NAME_CASE(STRICT_CVTSI2P)
33804 NODE_NAME_CASE(STRICT_CVTUI2P)
33805 NODE_NAME_CASE(MCVTSI2P)
33806 NODE_NAME_CASE(MCVTUI2P)
33807 NODE_NAME_CASE(VFPCLASS)
33808 NODE_NAME_CASE(VFPCLASSS)
33809 NODE_NAME_CASE(MULTISHIFT)
33810 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
33811 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
33812 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
33813 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
33814 NODE_NAME_CASE(CVTPS2PH)
33815 NODE_NAME_CASE(STRICT_CVTPS2PH)
33816 NODE_NAME_CASE(CVTPS2PH_SAE)
33817 NODE_NAME_CASE(MCVTPS2PH)
33818 NODE_NAME_CASE(MCVTPS2PH_SAE)
33819 NODE_NAME_CASE(CVTPH2PS)
33820 NODE_NAME_CASE(STRICT_CVTPH2PS)
33821 NODE_NAME_CASE(CVTPH2PS_SAE)
33822 NODE_NAME_CASE(CVTP2SI)
33823 NODE_NAME_CASE(CVTP2UI)
33824 NODE_NAME_CASE(MCVTP2SI)
33825 NODE_NAME_CASE(MCVTP2UI)
33826 NODE_NAME_CASE(CVTP2SI_RND)
33827 NODE_NAME_CASE(CVTP2UI_RND)
33828 NODE_NAME_CASE(CVTS2SI)
33829 NODE_NAME_CASE(CVTS2UI)
33830 NODE_NAME_CASE(CVTS2SI_RND)
33831 NODE_NAME_CASE(CVTS2UI_RND)
33832 NODE_NAME_CASE(CVTNE2PS2BF16)
33833 NODE_NAME_CASE(CVTNEPS2BF16)
33834 NODE_NAME_CASE(MCVTNEPS2BF16)
33835 NODE_NAME_CASE(DPBF16PS)
33836 NODE_NAME_CASE(LWPINS)
33837 NODE_NAME_CASE(MGATHER)
33838 NODE_NAME_CASE(MSCATTER)
33839 NODE_NAME_CASE(VPDPBUSD)
33840 NODE_NAME_CASE(VPDPBUSDS)
33841 NODE_NAME_CASE(VPDPWSSD)
33842 NODE_NAME_CASE(VPDPWSSDS)
33843 NODE_NAME_CASE(VPSHUFBITQMB)
33844 NODE_NAME_CASE(GF2P8MULB)
33845 NODE_NAME_CASE(GF2P8AFFINEQB)
33846 NODE_NAME_CASE(GF2P8AFFINEINVQB)
33847 NODE_NAME_CASE(NT_CALL)
33848 NODE_NAME_CASE(NT_BRIND)
33849 NODE_NAME_CASE(UMWAIT)
33850 NODE_NAME_CASE(TPAUSE)
33851 NODE_NAME_CASE(ENQCMD)
33852 NODE_NAME_CASE(ENQCMDS)
33853 NODE_NAME_CASE(VP2INTERSECT)
33854 NODE_NAME_CASE(VPDPBSUD)
33855 NODE_NAME_CASE(VPDPBSUDS)
33856 NODE_NAME_CASE(VPDPBUUD)
33857 NODE_NAME_CASE(VPDPBUUDS)
33858 NODE_NAME_CASE(VPDPBSSD)
33859 NODE_NAME_CASE(VPDPBSSDS)
33860 NODE_NAME_CASE(AESENC128KL)
33861 NODE_NAME_CASE(AESDEC128KL)
33862 NODE_NAME_CASE(AESENC256KL)
33863 NODE_NAME_CASE(AESDEC256KL)
33864 NODE_NAME_CASE(AESENCWIDE128KL)
33865 NODE_NAME_CASE(AESDECWIDE128KL)
33866 NODE_NAME_CASE(AESENCWIDE256KL)
33867 NODE_NAME_CASE(AESDECWIDE256KL)
33868 NODE_NAME_CASE(CMPCCXADD)
33869 NODE_NAME_CASE(TESTUI)
33870 NODE_NAME_CASE(FP80_ADD)
33871 NODE_NAME_CASE(STRICT_FP80_ADD)
33872 }
33873 return nullptr;
33874#undef NODE_NAME_CASE
33875}
33876
33877/// Return true if the addressing mode represented by AM is legal for this
33878/// target, for a load/store of the specified type.
33879bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
33880 const AddrMode &AM, Type *Ty,
33881 unsigned AS,
33882 Instruction *I) const {
33883 // X86 supports extremely general addressing modes.
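 // The canonical form is BaseReg + Scale*IndexReg + Disp32 (optionally relative
 // to a global); the checks below reject displacements that do not fit the code
 // model, PIC-incompatible global bases, and scale factors that cannot be
 // encoded.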
33884 CodeModel::Model M = TM.getCodeModel();
33885
33886 // X86 allows a sign-extended 32-bit immediate field as a displacement.
33887 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
33888 return false;
33889
33890 if (AM.BaseGV) {
33891 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
33892
33893 // If a reference to this global requires an extra load, we can't fold it.
33894 if (isGlobalStubReference(GVFlags))
33895 return false;
33896
33897 // If BaseGV requires a register for the PIC base, we cannot also have a
33898 // BaseReg specified.
33899 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
33900 return false;
33901
33902 // If lower 4G is not available, then we must use rip-relative addressing.
33903 if ((M != CodeModel::Small || isPositionIndependent()) &&
33904 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
33905 return false;
33906 }
33907
33908 switch (AM.Scale) {
33909 case 0:
33910 case 1:
33911 case 2:
33912 case 4:
33913 case 8:
33914 // These scales always work.
33915 break;
33916 case 3:
33917 case 5:
33918 case 9:
33919 // These scales are formed with basereg+scalereg. Only accept if there is
33920 // no basereg yet.
33921 if (AM.HasBaseReg)
33922 return false;
33923 break;
33924 default: // Other stuff never works.
33925 return false;
33926 }
33927
33928 return true;
33929}
33930
33931bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
33932 unsigned Bits = Ty->getScalarSizeInBits();
33933
33934 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
33935 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
33936 if (Subtarget.hasXOP() &&
33937 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
33938 return false;
33939
33940 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
33941 // shifts just as cheap as scalar ones.
33942 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
33943 return false;
33944
33945 // AVX512BW has shifts such as vpsllvw.
33946 if (Subtarget.hasBWI() && Bits == 16)
33947 return false;
33948
33949 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
33950 // fully general vector.
33951 return true;
33952}
33953
33954bool X86TargetLowering::isBinOp(unsigned Opcode) const {
33955 switch (Opcode) {
33956 // These are non-commutative binops.
33957 // TODO: Add more X86ISD opcodes once we have test coverage.
33958 case X86ISD::ANDNP:
33959 case X86ISD::PCMPGT:
33960 case X86ISD::FMAX:
33961 case X86ISD::FMIN:
33962 case X86ISD::FANDN:
33963 case X86ISD::VPSHA:
33964 case X86ISD::VPSHL:
33965 case X86ISD::VSHLV:
33966 case X86ISD::VSRLV:
33967 case X86ISD::VSRAV:
33968 return true;
33969 }
33970
33971 return TargetLoweringBase::isBinOp(Opcode);
33972}
33973
33974bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
33975 switch (Opcode) {
33976 // TODO: Add more X86ISD opcodes once we have test coverage.
33977 case X86ISD::PCMPEQ:
33978 case X86ISD::PMULDQ:
33979 case X86ISD::PMULUDQ:
33980 case X86ISD::FMAXC:
33981 case X86ISD::FMINC:
33982 case X86ISD::FAND:
33983 case X86ISD::FOR:
33984 case X86ISD::FXOR:
33985 return true;
33986 }
33987
33988 return TargetLoweringBase::isCommutativeBinOp(Opcode);
33989}
33990
33991bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
33992 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
33993 return false;
33994 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
33995 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
33996 return NumBits1 > NumBits2;
33997}
33998
33999bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34000 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34001 return false;
34002
34003 if (!isTypeLegal(EVT::getEVT(Ty1)))
34004 return false;
34005
34006 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34007
34008 // Assuming the caller doesn't have a zeroext or signext return parameter,
34009 // truncation all the way down to i1 is valid.
34010 return true;
34011}
34012
34014 return isInt<32>(Imm);
34015}
34016
34018 // Can also use sub to handle negated immediates.
34019 return isInt<32>(Imm);
34020}
34021
34023 return isInt<32>(Imm);
34024}
34025
34026bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34027 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34028 return false;
34029 unsigned NumBits1 = VT1.getSizeInBits();
34030 unsigned NumBits2 = VT2.getSizeInBits();
34031 return NumBits1 > NumBits2;
34032}
34033
34034bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34035 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34036 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34037}
34038
34039bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34040 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34041 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34042}
34043
34044bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34045 EVT VT1 = Val.getValueType();
34046 if (isZExtFree(VT1, VT2))
34047 return true;
34048
34049 if (Val.getOpcode() != ISD::LOAD)
34050 return false;
34051
34052 if (!VT1.isSimple() || !VT1.isInteger() ||
34053 !VT2.isSimple() || !VT2.isInteger())
34054 return false;
34055
34056 switch (VT1.getSimpleVT().SimpleTy) {
34057 default: break;
34058 case MVT::i8:
34059 case MVT::i16:
34060 case MVT::i32:
34061 // X86 has 8, 16, and 32-bit zero-extending loads.
34062 return true;
34063 }
34064
34065 return false;
34066}
34067
34068bool X86TargetLowering::shouldSinkOperands(Instruction *I,
34069 SmallVectorImpl<Use *> &Ops) const {
34070 using namespace llvm::PatternMatch;
34071
34072 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
34073 if (!VTy)
34074 return false;
34075
34076 if (I->getOpcode() == Instruction::Mul &&
34077 VTy->getElementType()->isIntegerTy(64)) {
34078 for (auto &Op : I->operands()) {
34079 // Make sure we are not already sinking this operand
34080 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
34081 continue;
34082
34083 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
34084 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
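 // At the IR level sext_inreg appears as (ashr (shl X, 32), 32) and zext_inreg
 // as (and X, 0xffffffff); sinking those operands next to the multiply lets
 // instruction selection form PMULDQ/PMULUDQ.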
34085 if (Subtarget.hasSSE41() &&
34086 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
34087 m_SpecificInt(32)))) {
34088 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
34089 Ops.push_back(&Op);
34090 } else if (Subtarget.hasSSE2() &&
34091 match(Op.get(),
34092 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
34093 Ops.push_back(&Op);
34094 }
34095 }
34096
34097 return !Ops.empty();
34098 }
34099
34100 // A uniform shift amount in a vector shift or funnel shift may be much
34101 // cheaper than a generic variable vector shift, so make that pattern visible
34102 // to SDAG by sinking the shuffle instruction next to the shift.
34103 int ShiftAmountOpNum = -1;
34104 if (I->isShift())
34105 ShiftAmountOpNum = 1;
34106 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
34107 if (II->getIntrinsicID() == Intrinsic::fshl ||
34108 II->getIntrinsicID() == Intrinsic::fshr)
34109 ShiftAmountOpNum = 2;
34110 }
34111
34112 if (ShiftAmountOpNum == -1)
34113 return false;
34114
34115 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
34116 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
34117 isVectorShiftByScalarCheap(I->getType())) {
34118 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
34119 return true;
34120 }
34121
34122 return false;
34123}
34124
34125bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
34126 if (!Subtarget.is64Bit())
34127 return false;
34128 return TargetLowering::shouldConvertPhiType(From, To);
34129}
34130
34131bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
34132 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
34133 return false;
34134
34135 EVT SrcVT = ExtVal.getOperand(0).getValueType();
34136
34137 // There is no extending load for vXi1.
34138 if (SrcVT.getScalarType() == MVT::i1)
34139 return false;
34140
34141 return true;
34142}
34143
34144bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
34145 EVT VT) const {
34146 if (!Subtarget.hasAnyFMA())
34147 return false;
34148
34149 VT = VT.getScalarType();
34150
34151 if (!VT.isSimple())
34152 return false;
34153
34154 switch (VT.getSimpleVT().SimpleTy) {
34155 case MVT::f16:
34156 return Subtarget.hasFP16();
34157 case MVT::f32:
34158 case MVT::f64:
34159 return true;
34160 default:
34161 break;
34162 }
34163
34164 return false;
34165}
34166
34167bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
34168 // i16 instructions are longer (0x66 prefix) and potentially slower.
34169 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
34170}
34171
34172bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
34173 EVT VT) const {
34174 // TODO: This is too general. There are cases where pre-AVX512 codegen would
34175 // benefit. The transform may also be profitable for scalar code.
34176 if (!Subtarget.hasAVX512())
34177 return false;
34178 if (!Subtarget.hasVLX() && !VT.is512BitVector())
34179 return false;
34180 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
34181 return false;
34182
34183 return true;
34184}
34185
34186/// Targets can use this to indicate that they only support *some*
34187/// VECTOR_SHUFFLE operations, those with specific masks.
34188/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
34189/// are assumed to be legal.
34190bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
34191 if (!VT.isSimple())
34192 return false;
34193
34194 // Not for i1 vectors
34195 if (VT.getSimpleVT().getScalarType() == MVT::i1)
34196 return false;
34197
34198 // Very little shuffling can be done for 64-bit vectors right now.
34199 if (VT.getSimpleVT().getSizeInBits() == 64)
34200 return false;
34201
34202 // We only care that the types being shuffled are legal. The lowering can
34203 // handle any possible shuffle mask that results.
34204 return isTypeLegal(VT.getSimpleVT());
34205}
34206
34207bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
34208 EVT VT) const {
34209 // Don't convert an 'and' into a shuffle that we don't directly support.
34210 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
34211 if (!Subtarget.hasAVX2())
34212 if (VT == MVT::v32i8 || VT == MVT::v16i16)
34213 return false;
34214
34215 // Just delegate to the generic legality, clear masks aren't special.
34216 return isShuffleMaskLegal(Mask, VT);
34217}
34218
34219bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
34220 // If the subtarget is using thunks, we must not generate jump tables.
34221 if (Subtarget.useIndirectThunkBranches())
34222 return false;
34223
34224 // Otherwise, fallback on the generic logic.
34225 return TargetLowering::areJTsAllowed(Fn);
34226}
34227
34228MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
34229 EVT ConditionVT) const {
34230 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
34231 // zero-extensions.
34232 if (ConditionVT.getSizeInBits() < 32)
34233 return MVT::i32;
34234 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
34235 ConditionVT);
34236}
34237
34238//===----------------------------------------------------------------------===//
34239// X86 Scheduler Hooks
34240//===----------------------------------------------------------------------===//
34241
34242 // Returns true if EFLAGS is consumed after this iterator in the rest of the
34243// basic block or any successors of the basic block.
34244static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
34245 MachineBasicBlock *BB) {
34246 // Scan forward through BB for a use/def of EFLAGS.
34247 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
34248 if (mi.readsRegister(X86::EFLAGS, /*TRI=*/nullptr))
34249 return true;
34250 // If we found a def, we can stop searching.
34251 if (mi.definesRegister(X86::EFLAGS, /*TRI=*/nullptr))
34252 return false;
34253 }
34254
34255 // If we hit the end of the block, check whether EFLAGS is live into a
34256 // successor.
34257 for (MachineBasicBlock *Succ : BB->successors())
34258 if (Succ->isLiveIn(X86::EFLAGS))
34259 return true;
34260
34261 return false;
34262}
34263
34264/// Utility function to emit xbegin specifying the start of an RTM region.
34265static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
34266 const TargetInstrInfo *TII) {
34267 const MIMetadata MIMD(MI);
34268
34269 const BasicBlock *BB = MBB->getBasicBlock();
34270 MachineFunction::iterator I = ++MBB->getIterator();
34271
34272 // For the v = xbegin(), we generate
34273 //
34274 // thisMBB:
34275 // xbegin sinkMBB
34276 //
34277 // mainMBB:
34278 // s0 = -1
34279 //
34280 // fallBB:
34281 // eax = # XABORT_DEF
34282 // s1 = eax
34283 //
34284 // sinkMBB:
34285 // v = phi(s0/mainBB, s1/fallBB)
34286
34287 MachineBasicBlock *thisMBB = MBB;
34288 MachineFunction *MF = MBB->getParent();
34289 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
34290 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
34291 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
34292 MF->insert(I, mainMBB);
34293 MF->insert(I, fallMBB);
34294 MF->insert(I, sinkMBB);
34295
34296 if (isEFLAGSLiveAfter(MI, MBB)) {
34297 mainMBB->addLiveIn(X86::EFLAGS);
34298 fallMBB->addLiveIn(X86::EFLAGS);
34299 sinkMBB->addLiveIn(X86::EFLAGS);
34300 }
34301
34302 // Transfer the remainder of BB and its successor edges to sinkMBB.
34303 sinkMBB->splice(sinkMBB->begin(), MBB,
34304 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
34305 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
34306
34307 MachineRegisterInfo &MRI = MF->getRegInfo();
34308 Register DstReg = MI.getOperand(0).getReg();
34309 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
34310 Register mainDstReg = MRI.createVirtualRegister(RC);
34311 Register fallDstReg = MRI.createVirtualRegister(RC);
34312
34313 // thisMBB:
34314 // xbegin fallMBB
34315 // # fallthrough to mainMBB
34316 // # on abort, control transfers to fallMBB
34317 BuildMI(thisMBB, MIMD, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
34318 thisMBB->addSuccessor(mainMBB);
34319 thisMBB->addSuccessor(fallMBB);
34320
34321 // mainMBB:
34322 // mainDstReg := -1
34323 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
34324 BuildMI(mainMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
34325 mainMBB->addSuccessor(sinkMBB);
34326
34327 // fallMBB:
34328 // ; pseudo instruction to model hardware's definition from XABORT
34329 // EAX := XABORT_DEF
34330 // fallDstReg := EAX
34331 BuildMI(fallMBB, MIMD, TII->get(X86::XABORT_DEF));
34332 BuildMI(fallMBB, MIMD, TII->get(TargetOpcode::COPY), fallDstReg)
34333 .addReg(X86::EAX);
34334 fallMBB->addSuccessor(sinkMBB);
34335
34336 // sinkMBB:
34337 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
34338 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
34339 .addReg(mainDstReg).addMBB(mainMBB)
34340 .addReg(fallDstReg).addMBB(fallMBB);
34341
34342 MI.eraseFromParent();
34343 return sinkMBB;
34344}
34345
34346MachineBasicBlock *
34347X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
34348 MachineBasicBlock *MBB) const {
34349 // Emit va_arg instruction on X86-64.
34350
34351 // Operands to this pseudo-instruction:
34352 // 0 ) Output : destination address (reg)
34353 // 1-5) Input : va_list address (addr, i64mem)
34354 // 6 ) ArgSize : Size (in bytes) of vararg type
34355 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
34356 // 8 ) Align : Alignment of type
34357 // 9 ) EFLAGS (implicit-def)
34358
34359 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
34360 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
34361
34362 Register DestReg = MI.getOperand(0).getReg();
34363 MachineOperand &Base = MI.getOperand(1);
34364 MachineOperand &Scale = MI.getOperand(2);
34365 MachineOperand &Index = MI.getOperand(3);
34366 MachineOperand &Disp = MI.getOperand(4);
34367 MachineOperand &Segment = MI.getOperand(5);
34368 unsigned ArgSize = MI.getOperand(6).getImm();
34369 unsigned ArgMode = MI.getOperand(7).getImm();
34370 Align Alignment = Align(MI.getOperand(8).getImm());
34371
34372 MachineFunction *MF = MBB->getParent();
34373
34374 // Memory Reference
34375 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
34376
34377 MachineMemOperand *OldMMO = MI.memoperands().front();
34378
34379 // Clone the MMO into two separate MMOs for loading and storing
34380 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
34381 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
34382 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
34383 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
34384
34385 // Machine Information
34386 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34387 MachineRegisterInfo &MRI = MF->getRegInfo();
34388 const TargetRegisterClass *AddrRegClass =
34389 getRegClassFor(getPointerTy(MF->getDataLayout()));
34390 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
34391 const MIMetadata MIMD(MI);
34392
34393 // struct va_list {
34394 // i32 gp_offset
34395 // i32 fp_offset
34396 // i64 overflow_area (address)
34397 // i64 reg_save_area (address)
34398 // }
34399 // sizeof(va_list) = 24
34400 // alignment(va_list) = 8
34401
34402 unsigned TotalNumIntRegs = 6;
34403 unsigned TotalNumXMMRegs = 8;
34404 bool UseGPOffset = (ArgMode == 1);
34405 bool UseFPOffset = (ArgMode == 2);
34406 unsigned MaxOffset = TotalNumIntRegs * 8 +
34407 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
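 // Per the SysV x86-64 ABI the register save area holds the 6 integer argument
 // registers (8 bytes each, gp_offset 0..48) followed by the 8 vector argument
 // registers (16 bytes each, fp_offset 48..176), so MaxOffset is 48 when pulling
 // a GP argument and 176 when pulling an FP argument.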
34408
34409 /* Align ArgSize to a multiple of 8 */
34410 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
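 // e.g. an ArgSize of 12 rounds up to 16: adding 7 and clearing the low three
 // bits rounds up to the next multiple of 8.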
34411 bool NeedsAlign = (Alignment > 8);
34412
34413 MachineBasicBlock *thisMBB = MBB;
34414 MachineBasicBlock *overflowMBB;
34415 MachineBasicBlock *offsetMBB;
34416 MachineBasicBlock *endMBB;
34417
34418 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
34419 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
34420 unsigned OffsetReg = 0;
34421
34422 if (!UseGPOffset && !UseFPOffset) {
34423 // If we only pull from the overflow region, we don't create a branch.
34424 // We don't need to alter control flow.
34425 OffsetDestReg = 0; // unused
34426 OverflowDestReg = DestReg;
34427
34428 offsetMBB = nullptr;
34429 overflowMBB = thisMBB;
34430 endMBB = thisMBB;
34431 } else {
34432 // First emit code to check if gp_offset (or fp_offset) is below the bound.
34433 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
34434 // If not, pull from overflow_area. (branch to overflowMBB)
34435 //
34436 // thisMBB
34437 // | .
34438 // | .
34439 // offsetMBB overflowMBB
34440 // | .
34441 // | .
34442 // endMBB
34443
34444 // Registers for the PHI in endMBB
34445 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
34446 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
34447
34448 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
34449 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34450 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34451 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
34452
34453 MachineFunction::iterator MBBIter = ++MBB->getIterator();
34454
34455 // Insert the new basic blocks
34456 MF->insert(MBBIter, offsetMBB);
34457 MF->insert(MBBIter, overflowMBB);
34458 MF->insert(MBBIter, endMBB);
34459
34460 // Transfer the remainder of MBB and its successor edges to endMBB.
34461 endMBB->splice(endMBB->begin(), thisMBB,
34462 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
34463 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
34464
34465 // Make offsetMBB and overflowMBB successors of thisMBB
34466 thisMBB->addSuccessor(offsetMBB);
34467 thisMBB->addSuccessor(overflowMBB);
34468
34469 // endMBB is a successor of both offsetMBB and overflowMBB
34470 offsetMBB->addSuccessor(endMBB);
34471 overflowMBB->addSuccessor(endMBB);
34472
34473 // Load the offset value into a register
34474 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34475 BuildMI(thisMBB, MIMD, TII->get(X86::MOV32rm), OffsetReg)
34476 .add(Base)
34477 .add(Scale)
34478 .add(Index)
34479 .addDisp(Disp, UseFPOffset ? 4 : 0)
34480 .add(Segment)
34481 .setMemRefs(LoadOnlyMMO);
34482
34483 // Check if there is enough room left to pull this argument.
34484 BuildMI(thisMBB, MIMD, TII->get(X86::CMP32ri))
34485 .addReg(OffsetReg)
34486 .addImm(MaxOffset + 8 - ArgSizeA8);
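 // The argument overflows the save area when OffsetReg + ArgSizeA8 > MaxOffset;
 // since both the offsets and ArgSizeA8 are multiples of 8, that is equivalent
 // to OffsetReg >= MaxOffset + 8 - ArgSizeA8, which the JAE below tests.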
34487
34488 // Branch to "overflowMBB" if offset >= max
34489 // Fall through to "offsetMBB" otherwise
34490 BuildMI(thisMBB, MIMD, TII->get(X86::JCC_1))
34491 .addMBB(overflowMBB).addImm(X86::COND_AE);
34492 }
34493
34494 // In offsetMBB, emit code to use the reg_save_area.
34495 if (offsetMBB) {
34496 assert(OffsetReg != 0);
34497
34498 // Read the reg_save_area address.
34499 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
34500 BuildMI(
34501 offsetMBB, MIMD,
34502 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34503 RegSaveReg)
34504 .add(Base)
34505 .add(Scale)
34506 .add(Index)
34507 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
34508 .add(Segment)
34509 .setMemRefs(LoadOnlyMMO);
34510
34511 if (Subtarget.isTarget64BitLP64()) {
34512 // Zero-extend the offset
34513 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
34514 BuildMI(offsetMBB, MIMD, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
34515 .addImm(0)
34516 .addReg(OffsetReg)
34517 .addImm(X86::sub_32bit);
34518
34519 // Add the offset to the reg_save_area to get the final address.
34520 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD64rr), OffsetDestReg)
34521 .addReg(OffsetReg64)
34522 .addReg(RegSaveReg);
34523 } else {
34524 // Add the offset to the reg_save_area to get the final address.
34525 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32rr), OffsetDestReg)
34526 .addReg(OffsetReg)
34527 .addReg(RegSaveReg);
34528 }
34529
34530 // Compute the offset for the next argument
34531 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
34532 BuildMI(offsetMBB, MIMD, TII->get(X86::ADD32ri), NextOffsetReg)
34533 .addReg(OffsetReg)
34534 .addImm(UseFPOffset ? 16 : 8);
34535
34536 // Store it back into the va_list.
34537 BuildMI(offsetMBB, MIMD, TII->get(X86::MOV32mr))
34538 .add(Base)
34539 .add(Scale)
34540 .add(Index)
34541 .addDisp(Disp, UseFPOffset ? 4 : 0)
34542 .add(Segment)
34543 .addReg(NextOffsetReg)
34544 .setMemRefs(StoreOnlyMMO);
34545
34546 // Jump to endMBB
34547 BuildMI(offsetMBB, MIMD, TII->get(X86::JMP_1))
34548 .addMBB(endMBB);
34549 }
34550
34551 //
34552 // Emit code to use overflow area
34553 //
34554
34555 // Load the overflow_area address into a register.
34556 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
34557 BuildMI(overflowMBB, MIMD,
34558 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
34559 OverflowAddrReg)
34560 .add(Base)
34561 .add(Scale)
34562 .add(Index)
34563 .addDisp(Disp, 8)
34564 .add(Segment)
34565 .setMemRefs(LoadOnlyMMO);
34566
34567 // If we need to align it, do so. Otherwise, just copy the address
34568 // to OverflowDestReg.
34569 if (NeedsAlign) {
34570 // Align the overflow address
34571 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
34572
34573 // aligned_addr = (addr + (align-1)) & ~(align-1)
34574 BuildMI(
34575 overflowMBB, MIMD,
34576 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34577 TmpReg)
34578 .addReg(OverflowAddrReg)
34579 .addImm(Alignment.value() - 1);
34580
34581 BuildMI(
34582 overflowMBB, MIMD,
34583 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
34584 OverflowDestReg)
34585 .addReg(TmpReg)
34586 .addImm(~(uint64_t)(Alignment.value() - 1));
34587 } else {
34588 BuildMI(overflowMBB, MIMD, TII->get(TargetOpcode::COPY), OverflowDestReg)
34589 .addReg(OverflowAddrReg);
34590 }
34591
34592 // Compute the next overflow address after this argument.
34593 // (the overflow address should be kept 8-byte aligned)
34594 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
34595 BuildMI(
34596 overflowMBB, MIMD,
34597 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
34598 NextAddrReg)
34599 .addReg(OverflowDestReg)
34600 .addImm(ArgSizeA8);
34601
34602 // Store the new overflow address.
34603 BuildMI(overflowMBB, MIMD,
34604 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
34605 .add(Base)
34606 .add(Scale)
34607 .add(Index)
34608 .addDisp(Disp, 8)
34609 .add(Segment)
34610 .addReg(NextAddrReg)
34611 .setMemRefs(StoreOnlyMMO);
34612
34613 // If we branched, emit the PHI to the front of endMBB.
34614 if (offsetMBB) {
34615 BuildMI(*endMBB, endMBB->begin(), MIMD,
34616 TII->get(X86::PHI), DestReg)
34617 .addReg(OffsetDestReg).addMBB(offsetMBB)
34618 .addReg(OverflowDestReg).addMBB(overflowMBB);
34619 }
34620
34621 // Erase the pseudo instruction
34622 MI.eraseFromParent();
34623
34624 return endMBB;
34625}
34626
34627// The EFLAGS operand of SelectItr might be missing a kill marker
34628// because there were multiple uses of EFLAGS, and ISel didn't know
34629// which to mark. Figure out whether SelectItr should have had a
34630// kill marker, and set it if it should. Returns the correct kill
34631// marker value.
34632static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
34633 MachineBasicBlock* BB,
34634 const TargetRegisterInfo* TRI) {
34635 if (isEFLAGSLiveAfter(SelectItr, BB))
34636 return false;
34637
34638 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
34639 // out. SelectMI should have a kill flag on EFLAGS.
34640 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
34641 return true;
34642}
34643
34644// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
34645// together with other CMOV pseudo-opcodes into a single basic-block with
34646// conditional jump around it.
34647static bool isCMOVPseudo(MachineInstr &MI) {
34648 switch (MI.getOpcode()) {
34649 case X86::CMOV_FR16:
34650 case X86::CMOV_FR16X:
34651 case X86::CMOV_FR32:
34652 case X86::CMOV_FR32X:
34653 case X86::CMOV_FR64:
34654 case X86::CMOV_FR64X:
34655 case X86::CMOV_GR8:
34656 case X86::CMOV_GR16:
34657 case X86::CMOV_GR32:
34658 case X86::CMOV_RFP32:
34659 case X86::CMOV_RFP64:
34660 case X86::CMOV_RFP80:
34661 case X86::CMOV_VR64:
34662 case X86::CMOV_VR128:
34663 case X86::CMOV_VR128X:
34664 case X86::CMOV_VR256:
34665 case X86::CMOV_VR256X:
34666 case X86::CMOV_VR512:
34667 case X86::CMOV_VK1:
34668 case X86::CMOV_VK2:
34669 case X86::CMOV_VK4:
34670 case X86::CMOV_VK8:
34671 case X86::CMOV_VK16:
34672 case X86::CMOV_VK32:
34673 case X86::CMOV_VK64:
34674 return true;
34675
34676 default:
34677 return false;
34678 }
34679}
34680
34681// Helper function that inserts PHI nodes into SinkMBB:
34682// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
34683// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
34684// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
34685// the last PHI inserted.
34686static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
34687 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
34688 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
34689 MachineBasicBlock *SinkMBB) {
34690 MachineFunction *MF = TrueMBB->getParent();
34691 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
34692 const MIMetadata MIMD(*MIItBegin);
34693
34694 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
34695 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34696
34697 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
34698
34699 // As we are creating the PHIs, we have to be careful if there is more than
34700 // one. Later CMOVs may reference the results of earlier CMOVs, but later
34701 // PHIs have to reference the individual true/false inputs from earlier PHIs.
34702 // That also means that PHI construction must work forward from earlier to
34703 // later, and that the code must maintain a mapping from earlier PHI's
34704 // destination registers, and the registers that went into the PHI.
34705 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
34706 MachineInstrBuilder MIB;
34707
34708 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
34709 Register DestReg = MIIt->getOperand(0).getReg();
34710 Register Op1Reg = MIIt->getOperand(1).getReg();
34711 Register Op2Reg = MIIt->getOperand(2).getReg();
34712
34713 // If this CMOV we are generating is the opposite condition from
34714 // the jump we generated, then we have to swap the operands for the
34715 // PHI that is going to be generated.
34716 if (MIIt->getOperand(3).getImm() == OppCC)
34717 std::swap(Op1Reg, Op2Reg);
34718
34719 if (RegRewriteTable.contains(Op1Reg))
34720 Op1Reg = RegRewriteTable[Op1Reg].first;
34721
34722 if (RegRewriteTable.contains(Op2Reg))
34723 Op2Reg = RegRewriteTable[Op2Reg].second;
34724
34725 MIB =
34726 BuildMI(*SinkMBB, SinkInsertionPoint, MIMD, TII->get(X86::PHI), DestReg)
34727 .addReg(Op1Reg)
34728 .addMBB(FalseMBB)
34729 .addReg(Op2Reg)
34730 .addMBB(TrueMBB);
34731
34732 // Add this PHI to the rewrite table.
34733 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
34734 }
34735
34736 return MIB;
34737}
34738
34739// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
34740MachineBasicBlock *
34741X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
34742 MachineInstr &SecondCascadedCMOV,
34743 MachineBasicBlock *ThisMBB) const {
34744 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34745 const MIMetadata MIMD(FirstCMOV);
34746
34747 // We lower cascaded CMOVs such as
34748 //
34749 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
34750 //
34751 // to two successive branches.
34752 //
34753 // Without this, we would add a PHI between the two jumps, which ends up
34754 // creating a few copies all around. For instance, for
34755 //
34756 // (sitofp (zext (fcmp une)))
34757 //
34758 // we would generate:
34759 //
34760 // ucomiss %xmm1, %xmm0
34761 // movss <1.0f>, %xmm0
34762 // movaps %xmm0, %xmm1
34763 // jne .LBB5_2
34764 // xorps %xmm1, %xmm1
34765 // .LBB5_2:
34766 // jp .LBB5_4
34767 // movaps %xmm1, %xmm0
34768 // .LBB5_4:
34769 // retq
34770 //
34771 // because this custom-inserter would have generated:
34772 //
34773 // A
34774 // | \
34775 // | B
34776 // | /
34777 // C
34778 // | \
34779 // | D
34780 // | /
34781 // E
34782 //
34783 // A: X = ...; Y = ...
34784 // B: empty
34785 // C: Z = PHI [X, A], [Y, B]
34786 // D: empty
34787 // E: PHI [X, C], [Z, D]
34788 //
34789 // If we lower both CMOVs in a single step, we can instead generate:
34790 //
34791 // A
34792 // | \
34793 // | C
34794 // | /|
34795 // |/ |
34796 // | |
34797 // | D
34798 // | /
34799 // E
34800 //
34801 // A: X = ...; Y = ...
34802 // D: empty
34803 // E: PHI [X, A], [X, C], [Y, D]
34804 //
34805 // Which, in our sitofp/fcmp example, gives us something like:
34806 //
34807 // ucomiss %xmm1, %xmm0
34808 // movss <1.0f>, %xmm0
34809 // jne .LBB5_4
34810 // jp .LBB5_4
34811 // xorps %xmm0, %xmm0
34812 // .LBB5_4:
34813 // retq
34814 //
34815
34816 // We lower cascaded CMOV into two successive branches to the same block.
34817 // EFLAGS is used by both, so mark it as live in the second.
34818 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34819 MachineFunction *F = ThisMBB->getParent();
34820 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34821 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
34822 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34823
34824 MachineFunction::iterator It = ++ThisMBB->getIterator();
34825 F->insert(It, FirstInsertedMBB);
34826 F->insert(It, SecondInsertedMBB);
34827 F->insert(It, SinkMBB);
34828
34829 // For a cascaded CMOV, we lower it to two successive branches to
34830 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
34831 // the FirstInsertedMBB.
34832 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
34833
34834 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34835 // live into the sink and copy blocks.
34836 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34837 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34838 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
34839 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
34840 SinkMBB->addLiveIn(X86::EFLAGS);
34841 }
34842
34843 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
34844 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
34845 std::next(MachineBasicBlock::iterator(FirstCMOV)),
34846 ThisMBB->end());
34847 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
34848
34849 // Fallthrough block for ThisMBB.
34850 ThisMBB->addSuccessor(FirstInsertedMBB);
34851 // The true block target of the first branch is always SinkMBB.
34852 ThisMBB->addSuccessor(SinkMBB);
34853 // Fallthrough block for FirstInsertedMBB.
34854 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
34855 // The true block for the branch of FirstInsertedMBB.
34856 FirstInsertedMBB->addSuccessor(SinkMBB);
34857 // This is fallthrough.
34858 SecondInsertedMBB->addSuccessor(SinkMBB);
34859
34860 // Create the conditional branch instructions.
34861 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
34862 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
34863
34864 X86::CondCode SecondCC =
34865 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
34866 BuildMI(FirstInsertedMBB, MIMD, TII->get(X86::JCC_1))
34867 .addMBB(SinkMBB)
34868 .addImm(SecondCC);
34869
34870 // SinkMBB:
34871 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
34872 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
34873 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
34874 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
34875 MachineInstrBuilder MIB =
34876     BuildMI(*SinkMBB, SinkMBB->begin(), MIMD, TII->get(X86::PHI), DestReg)
34877 .addReg(Op1Reg)
34878 .addMBB(SecondInsertedMBB)
34879 .addReg(Op2Reg)
34880 .addMBB(ThisMBB);
34881
34882 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
34883 // (the True operand of the SELECT_CC/CMOV nodes).
34884 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
34885
34886 // Now remove the CMOVs.
34887 FirstCMOV.eraseFromParent();
34888 SecondCascadedCMOV.eraseFromParent();
34889
34890 return SinkMBB;
34891}
34892
34893MachineBasicBlock *
34894X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
34895 MachineBasicBlock *ThisMBB) const {
34896 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34897 const MIMetadata MIMD(MI);
34898
34899 // To "insert" a SELECT_CC instruction, we actually have to insert the
34900 // diamond control-flow pattern. The incoming instruction knows the
34901 // destination vreg to set, the condition code register to branch on, the
34902 // true/false values to select between and a branch opcode to use.
34903
34904 // ThisMBB:
34905 // ...
34906 // TrueVal = ...
34907 // cmpTY ccX, r1, r2
34908 // bCC copy1MBB
34909 // fallthrough --> FalseMBB
34910
34911 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
34912 // as described above, by inserting a BB, and then making a PHI at the join
34913 // point to select the true and false operands of the CMOV in the PHI.
34914 //
34915 // The code also handles two different cases of multiple CMOV opcodes
34916 // in a row.
34917 //
34918 // Case 1:
34919 // In this case, there are multiple CMOVs in a row, all of which are based
34920 // on the same condition setting (or the exact opposite condition setting).
34921 // In this case we can lower all the CMOVs using a single inserted BB, and
34922 // then make a number of PHIs at the join point to model the CMOVs. The only
34923 // trickiness here, is that in a case like:
34924 //
34925 // t2 = CMOV cond1 t1, f1
34926 // t3 = CMOV cond1 t2, f2
34927 //
34928 // when rewriting this into PHIs, we have to perform some renaming on the
34929 // temps since you cannot have a PHI operand refer to a PHI result earlier
34930 // in the same block. The "simple" but wrong lowering would be:
34931 //
34932 // t2 = PHI t1(BB1), f1(BB2)
34933 // t3 = PHI t2(BB1), f2(BB2)
34934 //
34935 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
34936 // renaming is to note that on the path through BB1, t2 is really just a
34937 // copy of t1, and do that renaming, properly generating:
34938 //
34939 // t2 = PHI t1(BB1), f1(BB2)
34940 // t3 = PHI t1(BB1), f2(BB2)
34941 //
34942 // Case 2:
34943 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
34944 // function - EmitLoweredCascadedSelect.
34945
34946 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
34947 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
34948 MachineInstr *LastCMOV = &MI;
34949 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
34950
34951 // Check for case 1, where there are multiple CMOVs with the same condition
34952 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
34953 // number of jumps the most.
34954
34955 if (isCMOVPseudo(MI)) {
34956 // See if we have a string of CMOVS with the same condition. Skip over
34957 // intervening debug insts.
34958 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
34959 (NextMIIt->getOperand(3).getImm() == CC ||
34960 NextMIIt->getOperand(3).getImm() == OppCC)) {
34961 LastCMOV = &*NextMIIt;
34962 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
34963 }
34964 }
34965
34966 // This checks for case 2, but we only do it if we didn't already find
34967 // case 1, as indicated by LastCMOV == MI.
34968 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
34969 NextMIIt->getOpcode() == MI.getOpcode() &&
34970 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
34971 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
34972 NextMIIt->getOperand(1).isKill()) {
34973 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
34974 }
34975
34976 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
34977 MachineFunction *F = ThisMBB->getParent();
34978 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
34979 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
34980
34981 MachineFunction::iterator It = ++ThisMBB->getIterator();
34982 F->insert(It, FalseMBB);
34983 F->insert(It, SinkMBB);
34984
34985 // Set the call frame size on entry to the new basic blocks.
34986 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
34987 FalseMBB->setCallFrameSize(CallFrameSize);
34988 SinkMBB->setCallFrameSize(CallFrameSize);
34989
34990 // If the EFLAGS register isn't dead in the terminator, then claim that it's
34991 // live into the sink and copy blocks.
34992 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
34993 if (!LastCMOV->killsRegister(X86::EFLAGS, /*TRI=*/nullptr) &&
34994 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
34995 FalseMBB->addLiveIn(X86::EFLAGS);
34996 SinkMBB->addLiveIn(X86::EFLAGS);
34997 }
34998
34999 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35000 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35001                                  MachineBasicBlock::iterator(LastCMOV));
35002 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35003 if (MI.isDebugInstr())
35004 SinkMBB->push_back(MI.removeFromParent());
35005
35006 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35007 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35008 std::next(MachineBasicBlock::iterator(LastCMOV)),
35009 ThisMBB->end());
35010 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35011
35012 // Fallthrough block for ThisMBB.
35013 ThisMBB->addSuccessor(FalseMBB);
35014 // The true block target of the first (or only) branch is always a SinkMBB.
35015 ThisMBB->addSuccessor(SinkMBB);
35016 // Fallthrough block for FalseMBB.
35017 FalseMBB->addSuccessor(SinkMBB);
35018
35019 // Create the conditional branch instruction.
35020 BuildMI(ThisMBB, MIMD, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35021
35022 // SinkMBB:
35023 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35024 // ...
35025 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35026 MachineBasicBlock::iterator MIItEnd =
35027     std::next(MachineBasicBlock::iterator(LastCMOV));
35028 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35029
35030 // Now remove the CMOV(s).
35031 ThisMBB->erase(MIItBegin, MIItEnd);
35032
35033 return SinkMBB;
35034}
35035
35036static unsigned getSUBriOpcode(bool IsLP64) {
35037 if (IsLP64)
35038 return X86::SUB64ri32;
35039 else
35040 return X86::SUB32ri;
35041}
35042
35043MachineBasicBlock *
35044X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35045 MachineBasicBlock *MBB) const {
35046 MachineFunction *MF = MBB->getParent();
35047 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35048 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35049 const MIMetadata MIMD(MI);
35050 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35051
35052 const unsigned ProbeSize = getStackProbeSize(*MF);
35053
35054 MachineRegisterInfo &MRI = MF->getRegInfo();
35055 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35056 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35057 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35058
35059 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35060 MF->insert(MBBIter, testMBB);
35061 MF->insert(MBBIter, blockMBB);
35062 MF->insert(MBBIter, tailMBB);
35063
35064 Register sizeVReg = MI.getOperand(1).getReg();
35065
35066 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35067
35068 Register TmpStackPtr = MRI.createVirtualRegister(
35069 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35070 Register FinalStackPtr = MRI.createVirtualRegister(
35071 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
35072
35073 BuildMI(*MBB, {MI}, MIMD, TII->get(TargetOpcode::COPY), TmpStackPtr)
35074 .addReg(physSPReg);
35075 {
35076 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
35077 BuildMI(*MBB, {MI}, MIMD, TII->get(Opc), FinalStackPtr)
35078 .addReg(TmpStackPtr)
35079 .addReg(sizeVReg);
35080 }
35081
35082 // test rsp size
35083
35084 BuildMI(testMBB, MIMD,
35085 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
35086 .addReg(FinalStackPtr)
35087 .addReg(physSPReg);
35088
35089 BuildMI(testMBB, MIMD, TII->get(X86::JCC_1))
35090 .addMBB(tailMBB)
35091     .addImm(X86::COND_GE);
35092 testMBB->addSuccessor(blockMBB);
35093 testMBB->addSuccessor(tailMBB);
35094
35095 // Touch the block then extend it. This is done on the opposite side of a
35096 // static probe, where we allocate then touch; it avoids the need to probe
35097 // the tail of the dynamic alloca. Possible scenarios are:
35098 //
35099 // + ---- <- ------------ <- ------------- <- ------------ +
35100 // | |
35101 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
35102 // | |
35103 // + <- ----------- <- ------------ <- ----------- <- ------------ +
35104 //
35105 // The property we want to enforce is to never have more than [page alloc] between two probes.
35106
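  // As a sketch (illustrative register names), the blocks created below emit:
  //
  //   testMBB:  cmp FinalStackPtr, SP ; jge tailMBB
  //   blockMBB: xor [SP], 0            ; touch the current page
  //             sub SP, ProbeSize
  //             jmp testMBB
  //   tailMBB:  Result = copy FinalStackPtr   ; replaces the original pseudo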
35107 const unsigned XORMIOpc =
35108 TFI.Uses64BitFramePtr ? X86::XOR64mi32 : X86::XOR32mi;
35109 addRegOffset(BuildMI(blockMBB, MIMD, TII->get(XORMIOpc)), physSPReg, false, 0)
35110 .addImm(0);
35111
35112 BuildMI(blockMBB, MIMD, TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr)),
35113 physSPReg)
35114 .addReg(physSPReg)
35115 .addImm(ProbeSize);
35116
35117 BuildMI(blockMBB, MIMD, TII->get(X86::JMP_1)).addMBB(testMBB);
35118 blockMBB->addSuccessor(testMBB);
35119
35120 // Replace original instruction by the expected stack ptr
35121 BuildMI(tailMBB, MIMD, TII->get(TargetOpcode::COPY),
35122 MI.getOperand(0).getReg())
35123 .addReg(FinalStackPtr);
35124
35125 tailMBB->splice(tailMBB->end(), MBB,
35126 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35127 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
35128 MBB->addSuccessor(testMBB);
35129
35130 // Delete the original pseudo instruction.
35131 MI.eraseFromParent();
35132
35133 // And we're done.
35134 return tailMBB;
35135}
35136
35137MachineBasicBlock *
35138X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
35139 MachineBasicBlock *BB) const {
35140 MachineFunction *MF = BB->getParent();
35141 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35142 const MIMetadata MIMD(MI);
35143 const BasicBlock *LLVM_BB = BB->getBasicBlock();
35144
35145 assert(MF->shouldSplitStack());
35146
35147 const bool Is64Bit = Subtarget.is64Bit();
35148 const bool IsLP64 = Subtarget.isTarget64BitLP64();
35149
35150 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
35151 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
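  // These offsets are assumed to match the split-stack ABI used by libgcc and
  // gold: %fs:0x70 (LP64), %fs:0x40 (x32) and %gs:0x30 (ia32) hold the bound
  // of the current stacklet.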
35152
35153 // BB:
35154 // ... [Till the alloca]
35155 // If stacklet is not large enough, jump to mallocMBB
35156 //
35157 // bumpMBB:
35158 // Allocate by subtracting from RSP
35159 // Jump to continueMBB
35160 //
35161 // mallocMBB:
35162 // Allocate by call to runtime
35163 //
35164 // continueMBB:
35165 // ...
35166 // [rest of original BB]
35167 //
35168
35169 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35170 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35171 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35172
35173 MachineRegisterInfo &MRI = MF->getRegInfo();
35174 const TargetRegisterClass *AddrRegClass =
35175     getRegClassFor(getPointerTy(MF->getDataLayout()));
35176
35177 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35178 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
35179 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
35180 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
35181 sizeVReg = MI.getOperand(1).getReg(),
35182 physSPReg =
35183 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
35184
35185 MachineFunction::iterator MBBIter = ++BB->getIterator();
35186
35187 MF->insert(MBBIter, bumpMBB);
35188 MF->insert(MBBIter, mallocMBB);
35189 MF->insert(MBBIter, continueMBB);
35190
35191 continueMBB->splice(continueMBB->begin(), BB,
35192 std::next(MachineBasicBlock::iterator(MI)), BB->end());
35193 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
35194
35195 // Add code to the main basic block to check if the stack limit has been hit,
35196 // and if so, jump to mallocMBB otherwise to bumpMBB.
35197 BuildMI(BB, MIMD, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
35198 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
35199 .addReg(tmpSPVReg).addReg(sizeVReg);
35200 BuildMI(BB, MIMD, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
35201 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
35202 .addReg(SPLimitVReg);
35203 BuildMI(BB, MIMD, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
35204
35205 // bumpMBB simply decreases the stack pointer, since we know the current
35206 // stacklet has enough space.
35207 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), physSPReg)
35208 .addReg(SPLimitVReg);
35209 BuildMI(bumpMBB, MIMD, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
35210 .addReg(SPLimitVReg);
35211 BuildMI(bumpMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35212
35213 // Calls into a routine in libgcc to allocate more space from the heap.
35214 const uint32_t *RegMask =
35215     Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
35216 if (IsLP64) {
35217 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV64rr), X86::RDI)
35218 .addReg(sizeVReg);
35219 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35220 .addExternalSymbol("__morestack_allocate_stack_space")
35221 .addRegMask(RegMask)
35222 .addReg(X86::RDI, RegState::Implicit)
35223 .addReg(X86::RAX, RegState::ImplicitDefine);
35224 } else if (Is64Bit) {
35225 BuildMI(mallocMBB, MIMD, TII->get(X86::MOV32rr), X86::EDI)
35226 .addReg(sizeVReg);
35227 BuildMI(mallocMBB, MIMD, TII->get(X86::CALL64pcrel32))
35228 .addExternalSymbol("__morestack_allocate_stack_space")
35229 .addRegMask(RegMask)
35230 .addReg(X86::EDI, RegState::Implicit)
35231 .addReg(X86::EAX, RegState::ImplicitDefine);
35232 } else {
35233 BuildMI(mallocMBB, MIMD, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
35234 .addImm(12);
35235 BuildMI(mallocMBB, MIMD, TII->get(X86::PUSH32r)).addReg(sizeVReg);
35236 BuildMI(mallocMBB, MIMD, TII->get(X86::CALLpcrel32))
35237 .addExternalSymbol("__morestack_allocate_stack_space")
35238 .addRegMask(RegMask)
35239 .addReg(X86::EAX, RegState::ImplicitDefine);
35240 }
35241
35242 if (!Is64Bit)
35243 BuildMI(mallocMBB, MIMD, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
35244 .addImm(16);
35245
35246 BuildMI(mallocMBB, MIMD, TII->get(TargetOpcode::COPY), mallocPtrVReg)
35247 .addReg(IsLP64 ? X86::RAX : X86::EAX);
35248 BuildMI(mallocMBB, MIMD, TII->get(X86::JMP_1)).addMBB(continueMBB);
35249
35250 // Set up the CFG correctly.
35251 BB->addSuccessor(bumpMBB);
35252 BB->addSuccessor(mallocMBB);
35253 mallocMBB->addSuccessor(continueMBB);
35254 bumpMBB->addSuccessor(continueMBB);
35255
35256 // Take care of the PHI nodes.
35257 BuildMI(*continueMBB, continueMBB->begin(), MIMD, TII->get(X86::PHI),
35258 MI.getOperand(0).getReg())
35259 .addReg(mallocPtrVReg)
35260 .addMBB(mallocMBB)
35261 .addReg(bumpSPPtrVReg)
35262 .addMBB(bumpMBB);
35263
35264 // Delete the original pseudo instruction.
35265 MI.eraseFromParent();
35266
35267 // And we're done.
35268 return continueMBB;
35269}
35270
35271MachineBasicBlock *
35272X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
35273 MachineBasicBlock *BB) const {
35274 MachineFunction *MF = BB->getParent();
35275 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35276 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
35277 const MIMetadata MIMD(MI);
35278
35279 assert(!isAsynchronousEHPersonality(
35280            classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
35281        "SEH does not use catchret!");
35282
35283 // Only 32-bit EH needs to worry about manually restoring stack pointers.
35284 if (!Subtarget.is32Bit())
35285 return BB;
35286
35287 // C++ EH creates a new target block to hold the restore code, and wires up
35288 // the new block to the return destination with a normal JMP_4.
35289 MachineBasicBlock *RestoreMBB =
35290     MF->CreateMachineBasicBlock(BB->getBasicBlock());
35291 assert(BB->succ_size() == 1);
35292 MF->insert(std::next(BB->getIterator()), RestoreMBB);
35293 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
35294 BB->addSuccessor(RestoreMBB);
35295 MI.getOperand(0).setMBB(RestoreMBB);
35296
35297 // Marking this as an EH pad but not a funclet entry block causes PEI to
35298 // restore stack pointers in the block.
35299 RestoreMBB->setIsEHPad(true);
35300
35301 auto RestoreMBBI = RestoreMBB->begin();
35302 BuildMI(*RestoreMBB, RestoreMBBI, MIMD, TII.get(X86::JMP_4)).addMBB(TargetMBB);
35303 return BB;
35304}
35305
35306MachineBasicBlock *
35307X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
35308 MachineBasicBlock *BB) const {
35309 // So, here we replace TLSADDR with the sequence:
35310 // adjust_stackdown -> TLSADDR -> adjust_stackup.
35311 // We need this because TLSADDR is lowered into calls
35312 // inside MC, therefore without the two markers shrink-wrapping
35313 // may push the prologue/epilogue past them.
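  // In effect the block becomes:
  //   CALLSEQ_START(0, 0, 0) -> TLSADDR ... -> CALLSEQ_END(0, 0)
  // with the TLSADDR itself left in place and expanded to the actual call
  // during MC lowering.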
35314 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
35315 const MIMetadata MIMD(MI);
35316 MachineFunction &MF = *BB->getParent();
35317
35318 // Emit CALLSEQ_START right before the instruction.
35319 BB->getParent()->getFrameInfo().setAdjustsStack(true);
35320 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
35321 MachineInstrBuilder CallseqStart =
35322 BuildMI(MF, MIMD, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
35323 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
35324
35325 // Emit CALLSEQ_END right after the instruction.
35326 // We don't call erase from parent because we want to keep the
35327 // original instruction around.
35328 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
35329 MachineInstrBuilder CallseqEnd =
35330 BuildMI(MF, MIMD, TII.get(AdjStackUp)).addImm(0).addImm(0);
35331 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
35332
35333 return BB;
35334}
35335
35336MachineBasicBlock *
35337X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
35338 MachineBasicBlock *BB) const {
35339 // This is pretty easy. We're taking the value that we received from
35340 // our load from the relocation, sticking it in either RDI (x86-64)
35341 // or EAX and doing an indirect call. The return value will then
35342 // be in the normal return register.
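  // For example, on 64-bit Darwin the expansion below amounts to (a sketch;
  // the exact relocation comes from the operand's target flags):
  //   movq _var@TLVP(%rip), %rdi
  //   callq *(%rdi)          ## result comes back in %rax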
35343 MachineFunction *F = BB->getParent();
35344 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35345 const MIMetadata MIMD(MI);
35346
35347 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
35348 assert(MI.getOperand(3).isGlobal() && "This should be a global");
35349
35350 // Get a register mask for the lowered call.
35351 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
35352 // proper register mask.
35353 const uint32_t *RegMask =
35354 Subtarget.is64Bit() ?
35355     Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
35356     Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
35357 if (Subtarget.is64Bit()) {
35358   MachineInstrBuilder MIB =
35359 BuildMI(*BB, MI, MIMD, TII->get(X86::MOV64rm), X86::RDI)
35360 .addReg(X86::RIP)
35361 .addImm(0)
35362 .addReg(0)
35363 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35364 MI.getOperand(3).getTargetFlags())
35365 .addReg(0);
35366 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL64m));
35367 addDirectMem(MIB, X86::RDI);
35368 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
35369 } else if (!isPositionIndependent()) {
35370   MachineInstrBuilder MIB =
35371       BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35372 .addReg(0)
35373 .addImm(0)
35374 .addReg(0)
35375 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35376 MI.getOperand(3).getTargetFlags())
35377 .addReg(0);
35378 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35379 addDirectMem(MIB, X86::EAX);
35380 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35381 } else {
35382   MachineInstrBuilder MIB =
35383       BuildMI(*BB, MI, MIMD, TII->get(X86::MOV32rm), X86::EAX)
35384 .addReg(TII->getGlobalBaseReg(F))
35385 .addImm(0)
35386 .addReg(0)
35387 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
35388 MI.getOperand(3).getTargetFlags())
35389 .addReg(0);
35390 MIB = BuildMI(*BB, MI, MIMD, TII->get(X86::CALL32m));
35391 addDirectMem(MIB, X86::EAX);
35392 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
35393 }
35394
35395 MI.eraseFromParent(); // The pseudo instruction is gone now.
35396 return BB;
35397}
35398
35399static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
35400 switch (RPOpc) {
35401 case X86::INDIRECT_THUNK_CALL32:
35402 return X86::CALLpcrel32;
35403 case X86::INDIRECT_THUNK_CALL64:
35404 return X86::CALL64pcrel32;
35405 case X86::INDIRECT_THUNK_TCRETURN32:
35406 return X86::TCRETURNdi;
35407 case X86::INDIRECT_THUNK_TCRETURN64:
35408 return X86::TCRETURNdi64;
35409 }
35410 llvm_unreachable("not indirect thunk opcode");
35411}
35412
35413static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
35414 unsigned Reg) {
35415 if (Subtarget.useRetpolineExternalThunk()) {
35416 // When using an external thunk for retpolines, we pick names that match the
35417 // names GCC happens to use as well. This helps simplify the implementation
35418 // of the thunks for kernels where they have no easy ability to create
35419 // aliases and are doing non-trivial configuration of the thunk's body. For
35420 // example, the Linux kernel will do boot-time hot patching of the thunk
35421 // bodies and cannot easily export aliases of these to loaded modules.
35422 //
35423 // Note that at any point in the future, we may need to change the semantics
35424 // of how we implement retpolines and at that time will likely change the
35425 // name of the called thunk. Essentially, there is no hard guarantee that
35426 // LLVM will generate calls to specific thunks; we merely make a best-effort
35427 // attempt to help out kernels and other systems where duplicating the
35428 // thunks is costly.
35429 switch (Reg) {
35430 case X86::EAX:
35431 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35432 return "__x86_indirect_thunk_eax";
35433 case X86::ECX:
35434 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35435 return "__x86_indirect_thunk_ecx";
35436 case X86::EDX:
35437 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35438 return "__x86_indirect_thunk_edx";
35439 case X86::EDI:
35440 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35441 return "__x86_indirect_thunk_edi";
35442 case X86::R11:
35443 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35444 return "__x86_indirect_thunk_r11";
35445 }
35446 llvm_unreachable("unexpected reg for external indirect thunk");
35447 }
35448
35449 if (Subtarget.useRetpolineIndirectCalls() ||
35450 Subtarget.useRetpolineIndirectBranches()) {
35451 // When targeting an internal COMDAT thunk use an LLVM-specific name.
35452 switch (Reg) {
35453 case X86::EAX:
35454 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35455 return "__llvm_retpoline_eax";
35456 case X86::ECX:
35457 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35458 return "__llvm_retpoline_ecx";
35459 case X86::EDX:
35460 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35461 return "__llvm_retpoline_edx";
35462 case X86::EDI:
35463 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
35464 return "__llvm_retpoline_edi";
35465 case X86::R11:
35466 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35467 return "__llvm_retpoline_r11";
35468 }
35469 llvm_unreachable("unexpected reg for retpoline");
35470 }
35471
35472 if (Subtarget.useLVIControlFlowIntegrity()) {
35473 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
35474 return "__llvm_lvi_thunk_r11";
35475 }
35476 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
35477}
35478
35479MachineBasicBlock *
35480X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
35481 MachineBasicBlock *BB) const {
35482 // Copy the virtual register into the R11 physical register and
35483 // call the retpoline thunk.
35484 const MIMetadata MIMD(MI);
35485 const X86InstrInfo *TII = Subtarget.getInstrInfo();
35486 Register CalleeVReg = MI.getOperand(0).getReg();
35487 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
35488
35489 // Find an available scratch register to hold the callee. On 64-bit, we can
35490 // just use R11, but we scan for uses anyway to ensure we don't generate
35491 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
35492 // already a register use operand to the call to hold the callee. If none
35493 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
35494 // register and ESI is the base pointer to realigned stack frames with VLAs.
35495 SmallVector<unsigned, 3> AvailableRegs;
35496 if (Subtarget.is64Bit())
35497 AvailableRegs.push_back(X86::R11);
35498 else
35499 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
35500
35501 // Zero out any registers that are already used.
35502 for (const auto &MO : MI.operands()) {
35503 if (MO.isReg() && MO.isUse())
35504 for (unsigned &Reg : AvailableRegs)
35505 if (Reg == MO.getReg())
35506 Reg = 0;
35507 }
35508
35509 // Choose the first remaining non-zero available register.
35510 unsigned AvailableReg = 0;
35511 for (unsigned MaybeReg : AvailableRegs) {
35512 if (MaybeReg) {
35513 AvailableReg = MaybeReg;
35514 break;
35515 }
35516 }
35517 if (!AvailableReg)
35518 report_fatal_error("calling convention incompatible with retpoline, no "
35519 "available registers");
35520
35521 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
35522
35523 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), AvailableReg)
35524 .addReg(CalleeVReg);
35525 MI.getOperand(0).ChangeToES(Symbol);
35526 MI.setDesc(TII->get(Opc));
35527 MachineInstrBuilder(*BB->getParent(), &MI)
35528     .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
35529 return BB;
35530}
35531
35532/// SetJmp implies future control flow change upon calling the corresponding
35533/// LongJmp.
35534/// Instead of using the 'return' instruction, the long jump fixes the stack and
35535/// performs an indirect branch. To do so it uses the registers that were stored
35536/// in the jump buffer (when calling SetJmp).
35537/// If the shadow stack is enabled, we need to fix it as well, because some
35538/// return addresses will be skipped.
35539/// The function will save the SSP for future fixing in the function
35540/// emitLongJmpShadowStackFix.
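/// (In the jump buffer used here, slot 0 holds the frame pointer, slot 1 the
/// resume address, slot 2 the stack pointer and slot 3 the saved SSP; each
/// slot is pointer-sized.)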
35541/// \sa emitLongJmpShadowStackFix
35542/// \param [in] MI The temporary Machine Instruction for the builtin.
35543/// \param [in] MBB The Machine Basic Block that will be modified.
35544void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
35545 MachineBasicBlock *MBB) const {
35546 const MIMetadata MIMD(MI);
35547 MachineFunction *MF = MBB->getParent();
35548 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35549 MachineRegisterInfo &MRI = MF->getRegInfo();
35550 MachineInstrBuilder MIB;
35551
35552 // Memory Reference.
35553 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35554 MI.memoperands_end());
35555
35556 // Initialize a register with zero.
35557 MVT PVT = getPointerTy(MF->getDataLayout());
35558 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35559 Register ZReg = MRI.createVirtualRegister(PtrRC);
35560 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
35561 BuildMI(*MBB, MI, MIMD, TII->get(XorRROpc))
35562 .addDef(ZReg)
35563 .addReg(ZReg, RegState::Undef)
35564 .addReg(ZReg, RegState::Undef);
35565
35566 // Read the current SSP Register value to the zeroed register.
35567 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35568 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35569 BuildMI(*MBB, MI, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35570
35571 // Write the SSP register value to offset 3 in input memory buffer.
35572 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35573 MIB = BuildMI(*MBB, MI, MIMD, TII->get(PtrStoreOpc));
35574 const int64_t SSPOffset = 3 * PVT.getStoreSize();
35575 const unsigned MemOpndSlot = 1;
35576 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35577 if (i == X86::AddrDisp)
35578 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
35579 else
35580 MIB.add(MI.getOperand(MemOpndSlot + i));
35581 }
35582 MIB.addReg(SSPCopyReg);
35583 MIB.setMemRefs(MMOs);
35584}
35585
35586MachineBasicBlock *
35587X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
35588 MachineBasicBlock *MBB) const {
35589 const MIMetadata MIMD(MI);
35590 MachineFunction *MF = MBB->getParent();
35591 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35592 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35593 MachineRegisterInfo &MRI = MF->getRegInfo();
35594
35595 const BasicBlock *BB = MBB->getBasicBlock();
35596 MachineFunction::iterator I = ++MBB->getIterator();
35597
35598 // Memory Reference
35599 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35600 MI.memoperands_end());
35601
35602 unsigned DstReg;
35603 unsigned MemOpndSlot = 0;
35604
35605 unsigned CurOp = 0;
35606
35607 DstReg = MI.getOperand(CurOp++).getReg();
35608 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35609 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
35610 (void)TRI;
35611 Register mainDstReg = MRI.createVirtualRegister(RC);
35612 Register restoreDstReg = MRI.createVirtualRegister(RC);
35613
35614 MemOpndSlot = CurOp;
35615
35616 MVT PVT = getPointerTy(MF->getDataLayout());
35617 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35618 "Invalid Pointer Size!");
35619
35620 // For v = setjmp(buf), we generate
35621 //
35622 // thisMBB:
35623 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
35624 // SjLjSetup restoreMBB
35625 //
35626 // mainMBB:
35627 // v_main = 0
35628 //
35629 // sinkMBB:
35630 // v = phi(main, restore)
35631 //
35632 // restoreMBB:
35633 // if base pointer being used, load it from frame
35634 // v_restore = 1
35635
35636 MachineBasicBlock *thisMBB = MBB;
35637 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35638 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35639 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
35640 MF->insert(I, mainMBB);
35641 MF->insert(I, sinkMBB);
35642 MF->push_back(restoreMBB);
35643 restoreMBB->setMachineBlockAddressTaken();
35644
35645 MachineInstrBuilder MIB;
35646
35647 // Transfer the remainder of BB and its successor edges to sinkMBB.
35648 sinkMBB->splice(sinkMBB->begin(), MBB,
35649 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35650 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35651
35652 // thisMBB:
35653 unsigned PtrStoreOpc = 0;
35654 unsigned LabelReg = 0;
35655 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35656 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
35657                    !isPositionIndependent();
35658
35659 // Prepare IP either in reg or imm.
35660 if (!UseImmLabel) {
35661 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
35662 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35663 LabelReg = MRI.createVirtualRegister(PtrRC);
35664 if (Subtarget.is64Bit()) {
35665 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA64r), LabelReg)
35666 .addReg(X86::RIP)
35667 .addImm(0)
35668 .addReg(0)
35669 .addMBB(restoreMBB)
35670 .addReg(0);
35671 } else {
35672 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
35673 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::LEA32r), LabelReg)
35674 .addReg(XII->getGlobalBaseReg(MF))
35675 .addImm(0)
35676 .addReg(0)
35677 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
35678 .addReg(0);
35679 }
35680 } else
35681 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
35682 // Store IP
35683 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrStoreOpc));
35684 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35685 if (i == X86::AddrDisp)
35686 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
35687 else
35688 MIB.add(MI.getOperand(MemOpndSlot + i));
35689 }
35690 if (!UseImmLabel)
35691 MIB.addReg(LabelReg);
35692 else
35693 MIB.addMBB(restoreMBB);
35694 MIB.setMemRefs(MMOs);
35695
35696 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35697 emitSetJmpShadowStackFix(MI, thisMBB);
35698 }
35699
35700 // Setup
35701 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(X86::EH_SjLj_Setup))
35702 .addMBB(restoreMBB);
35703
35704 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35705 MIB.addRegMask(RegInfo->getNoPreservedMask());
35706 thisMBB->addSuccessor(mainMBB);
35707 thisMBB->addSuccessor(restoreMBB);
35708
35709 // mainMBB:
35710 // EAX = 0
35711 BuildMI(mainMBB, MIMD, TII->get(X86::MOV32r0), mainDstReg);
35712 mainMBB->addSuccessor(sinkMBB);
35713
35714 // sinkMBB:
35715 BuildMI(*sinkMBB, sinkMBB->begin(), MIMD, TII->get(X86::PHI), DstReg)
35716 .addReg(mainDstReg)
35717 .addMBB(mainMBB)
35718 .addReg(restoreDstReg)
35719 .addMBB(restoreMBB);
35720
35721 // restoreMBB:
35722 if (RegInfo->hasBasePointer(*MF)) {
35723 const bool Uses64BitFramePtr =
35724 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
35725   X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
35726   X86FI->setRestoreBasePointer(MF);
35727 Register FramePtr = RegInfo->getFrameRegister(*MF);
35728 Register BasePtr = RegInfo->getBaseRegister();
35729 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
35730 addRegOffset(BuildMI(restoreMBB, MIMD, TII->get(Opm), BasePtr),
35731 FramePtr, true, X86FI->getRestoreBasePointerOffset())
35732       .setMIFlag(MachineInstr::FrameSetup);
35733 }
35734 BuildMI(restoreMBB, MIMD, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
35735 BuildMI(restoreMBB, MIMD, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35736 restoreMBB->addSuccessor(sinkMBB);
35737
35738 MI.eraseFromParent();
35739 return sinkMBB;
35740}
35741
35742/// Fix the shadow stack using the previously saved SSP pointer.
35743/// \sa emitSetJmpShadowStackFix
35744/// \param [in] MI The temporary Machine Instruction for the builtin.
35745/// \param [in] MBB The Machine Basic Block that will be modified.
35746/// \return The sink MBB that will perform the future indirect branch.
35747MachineBasicBlock *
35748X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
35749 MachineBasicBlock *MBB) const {
35750 const MIMetadata MIMD(MI);
35751 MachineFunction *MF = MBB->getParent();
35752 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35753 MachineRegisterInfo &MRI = MF->getRegInfo();
35754
35755 // Memory Reference
35756 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35757 MI.memoperands_end());
35758
35759 MVT PVT = getPointerTy(MF->getDataLayout());
35760 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
35761
35762 // checkSspMBB:
35763 // xor vreg1, vreg1
35764 // rdssp vreg1
35765 // test vreg1, vreg1
35766 // je sinkMBB # Jump if Shadow Stack is not supported
35767 // fallMBB:
35768 // mov buf+24/12(%rip), vreg2
35769 // sub vreg1, vreg2
35770 // jbe sinkMBB # No need to fix the Shadow Stack
35771 // fixShadowMBB:
35772 // shr 3/2, vreg2
35773 // incssp vreg2 # fix the SSP according to the lower 8 bits
35774 // shr 8, vreg2
35775 // je sinkMBB
35776 // fixShadowLoopPrepareMBB:
35777 // shl vreg2
35778 // mov 128, vreg3
35779 // fixShadowLoopMBB:
35780 // incssp vreg3
35781 // dec vreg2
35782 // jne fixShadowLoopMBB # Iterate until you finish fixing
35783 // # the Shadow Stack
35784 // sinkMBB:
35785
35786 MachineFunction::iterator I = ++MBB->getIterator();
35787 const BasicBlock *BB = MBB->getBasicBlock();
35788
35789 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
35790 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35791 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
35792 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
35793 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
35794 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35795 MF->insert(I, checkSspMBB);
35796 MF->insert(I, fallMBB);
35797 MF->insert(I, fixShadowMBB);
35798 MF->insert(I, fixShadowLoopPrepareMBB);
35799 MF->insert(I, fixShadowLoopMBB);
35800 MF->insert(I, sinkMBB);
35801
35802 // Transfer the remainder of BB and its successor edges to sinkMBB.
35803 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
35804 MBB->end());
35805 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35806
35807 MBB->addSuccessor(checkSspMBB);
35808
35809 // Initialize a register with zero.
35810 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
35811 BuildMI(checkSspMBB, MIMD, TII->get(X86::MOV32r0), ZReg);
35812
35813 if (PVT == MVT::i64) {
35814 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
35815 BuildMI(checkSspMBB, MIMD, TII->get(X86::SUBREG_TO_REG), TmpZReg)
35816 .addImm(0)
35817 .addReg(ZReg)
35818 .addImm(X86::sub_32bit);
35819 ZReg = TmpZReg;
35820 }
35821
35822 // Read the current SSP Register value to the zeroed register.
35823 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
35824 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
35825 BuildMI(checkSspMBB, MIMD, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
35826
35827 // Check whether the result of the SSP register is zero and jump directly
35828 // to the sink.
35829 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
35830 BuildMI(checkSspMBB, MIMD, TII->get(TestRROpc))
35831 .addReg(SSPCopyReg)
35832 .addReg(SSPCopyReg);
35833 BuildMI(checkSspMBB, MIMD, TII->get(X86::JCC_1))
35834 .addMBB(sinkMBB)
35835     .addImm(X86::COND_E);
35836 checkSspMBB->addSuccessor(sinkMBB);
35837 checkSspMBB->addSuccessor(fallMBB);
35838
35839 // Reload the previously saved SSP register value.
35840 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
35841 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35842 const int64_t SPPOffset = 3 * PVT.getStoreSize();
35843 MachineInstrBuilder MIB =
35844     BuildMI(fallMBB, MIMD, TII->get(PtrLoadOpc), PrevSSPReg);
35845 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35846 const MachineOperand &MO = MI.getOperand(i);
35847 if (i == X86::AddrDisp)
35848 MIB.addDisp(MO, SPPOffset);
35849 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35850 // preserve kill flags.
35851 MIB.addReg(MO.getReg());
35852 else
35853 MIB.add(MO);
35854 }
35855 MIB.setMemRefs(MMOs);
35856
35857 // Subtract the current SSP from the previous SSP.
35858 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
35859 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
35860 BuildMI(fallMBB, MIMD, TII->get(SubRROpc), SspSubReg)
35861 .addReg(PrevSSPReg)
35862 .addReg(SSPCopyReg);
35863
35864 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
35865 BuildMI(fallMBB, MIMD, TII->get(X86::JCC_1))
35866 .addMBB(sinkMBB)
35867     .addImm(X86::COND_BE);
35868 fallMBB->addSuccessor(sinkMBB);
35869 fallMBB->addSuccessor(fixShadowMBB);
35870
35871 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
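  // Worked example (x86-64): a byte delta of 0x1200 becomes 0x240 slots after
  // this shift; the incssp below consumes the low 8 bits (0x40 slots), the
  // second shift leaves 0x2, and the loop further down performs 0x2 << 1
  // iterations of "incssp 128" to cover the remaining 0x200 slots.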
35872 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
35873 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
35874 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
35875 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspFirstShrReg)
35876 .addReg(SspSubReg)
35877 .addImm(Offset);
35878
35879 // Increase SSP, looking only at the lower 8 bits of the delta.
35880 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
35881 BuildMI(fixShadowMBB, MIMD, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
35882
35883 // Reset the lower 8 bits.
35884 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
35885 BuildMI(fixShadowMBB, MIMD, TII->get(ShrRIOpc), SspSecondShrReg)
35886 .addReg(SspFirstShrReg)
35887 .addImm(8);
35888
35889 // Jump if the result of the shift is zero.
35890 BuildMI(fixShadowMBB, MIMD, TII->get(X86::JCC_1))
35891 .addMBB(sinkMBB)
35892     .addImm(X86::COND_E);
35893 fixShadowMBB->addSuccessor(sinkMBB);
35894 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
35895
35896 // Do a single shift left.
35897 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64ri : X86::SHL32ri;
35898 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
35899 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(ShlR1Opc), SspAfterShlReg)
35900 .addReg(SspSecondShrReg)
35901 .addImm(1);
35902
35903 // Save the value 128 to a register (will be used next with incssp).
35904 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
35905 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
35906 BuildMI(fixShadowLoopPrepareMBB, MIMD, TII->get(MovRIOpc), Value128InReg)
35907 .addImm(128);
35908 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
35909
35910 // Since incssp only looks at the lower 8 bits, we might need to do several
35911 // iterations of incssp until we finish fixing the shadow stack.
35912 Register DecReg = MRI.createVirtualRegister(PtrRC);
35913 Register CounterReg = MRI.createVirtualRegister(PtrRC);
35914 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::PHI), CounterReg)
35915 .addReg(SspAfterShlReg)
35916 .addMBB(fixShadowLoopPrepareMBB)
35917 .addReg(DecReg)
35918 .addMBB(fixShadowLoopMBB);
35919
35920 // Every iteration we increase the SSP by 128.
35921 BuildMI(fixShadowLoopMBB, MIMD, TII->get(IncsspOpc)).addReg(Value128InReg);
35922
35923 // Every iteration we decrement the counter by 1.
35924 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
35925 BuildMI(fixShadowLoopMBB, MIMD, TII->get(DecROpc), DecReg).addReg(CounterReg);
35926
35927 // Jump if the counter is not zero yet.
35928 BuildMI(fixShadowLoopMBB, MIMD, TII->get(X86::JCC_1))
35929 .addMBB(fixShadowLoopMBB)
35930     .addImm(X86::COND_NE);
35931 fixShadowLoopMBB->addSuccessor(sinkMBB);
35932 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
35933
35934 return sinkMBB;
35935}
35936
35937MachineBasicBlock *
35938X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
35939 MachineBasicBlock *MBB) const {
35940 const MIMetadata MIMD(MI);
35941 MachineFunction *MF = MBB->getParent();
35942 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35943 MachineRegisterInfo &MRI = MF->getRegInfo();
35944
35945 // Memory Reference
35946 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
35947 MI.memoperands_end());
35948
35949 MVT PVT = getPointerTy(MF->getDataLayout());
35950 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
35951 "Invalid Pointer Size!");
35952
35953 const TargetRegisterClass *RC =
35954 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
35955 Register Tmp = MRI.createVirtualRegister(RC);
35956 // Since FP is only updated here but NOT referenced, it's treated as GPR.
35957 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
35958 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
35959 Register SP = RegInfo->getStackRegister();
35960
35961 MachineInstrBuilder MIB;
35962
35963 const int64_t LabelOffset = 1 * PVT.getStoreSize();
35964 const int64_t SPOffset = 2 * PVT.getStoreSize();
35965
35966 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
35967 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
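  // On 64-bit targets the expansion below amounts to (sketch):
  //   movq 0(buf), %rbp ; movq 8(buf), %tmp ; movq 16(buf), %rsp ; jmpq *%tmp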
35968
35969 MachineBasicBlock *thisMBB = MBB;
35970
35971 // When CET and shadow stack is enabled, we need to fix the Shadow Stack.
35972 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
35973 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
35974 }
35975
35976 // Reload FP
35977 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), FP);
35978 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35979 const MachineOperand &MO = MI.getOperand(i);
35980 if (MO.isReg()) // Don't add the whole operand, we don't want to
35981 // preserve kill flags.
35982 MIB.addReg(MO.getReg());
35983 else
35984 MIB.add(MO);
35985 }
35986 MIB.setMemRefs(MMOs);
35987
35988 // Reload IP
35989 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), Tmp);
35990 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
35991 const MachineOperand &MO = MI.getOperand(i);
35992 if (i == X86::AddrDisp)
35993 MIB.addDisp(MO, LabelOffset);
35994 else if (MO.isReg()) // Don't add the whole operand, we don't want to
35995 // preserve kill flags.
35996 MIB.addReg(MO.getReg());
35997 else
35998 MIB.add(MO);
35999 }
36000 MIB.setMemRefs(MMOs);
36001
36002 // Reload SP
36003 MIB = BuildMI(*thisMBB, MI, MIMD, TII->get(PtrLoadOpc), SP);
36004 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36005 if (i == X86::AddrDisp)
36006 MIB.addDisp(MI.getOperand(i), SPOffset);
36007 else
36008 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36009 // the last instruction of the expansion.
36010 }
36011 MIB.setMemRefs(MMOs);
36012
36013 // Jump
36014 BuildMI(*thisMBB, MI, MIMD, TII->get(IJmpOpc)).addReg(Tmp);
36015
36016 MI.eraseFromParent();
36017 return thisMBB;
36018}
36019
36020void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36021 MachineBasicBlock *MBB,
36022 MachineBasicBlock *DispatchBB,
36023 int FI) const {
36024 const MIMetadata MIMD(MI);
36025 MachineFunction *MF = MBB->getParent();
36026 MachineRegisterInfo *MRI = &MF->getRegInfo();
36027 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36028
36029 MVT PVT = getPointerTy(MF->getDataLayout());
36030 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36031
36032 unsigned Op = 0;
36033 unsigned VR = 0;
36034
36035 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36036                    !isPositionIndependent();
36037
36038 if (UseImmLabel) {
36039 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36040 } else {
36041 const TargetRegisterClass *TRC =
36042 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36043 VR = MRI->createVirtualRegister(TRC);
36044 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36045
36046 if (Subtarget.is64Bit())
36047 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA64r), VR)
36048 .addReg(X86::RIP)
36049 .addImm(1)
36050 .addReg(0)
36051 .addMBB(DispatchBB)
36052 .addReg(0);
36053 else
36054 BuildMI(*MBB, MI, MIMD, TII->get(X86::LEA32r), VR)
36055 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36056 .addImm(1)
36057 .addReg(0)
36058 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36059 .addReg(0);
36060 }
36061
36062 MachineInstrBuilder MIB = BuildMI(*MBB, MI, MIMD, TII->get(Op));
36063 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36064 if (UseImmLabel)
36065 MIB.addMBB(DispatchBB);
36066 else
36067 MIB.addReg(VR);
36068}
36069
36070MachineBasicBlock *
36071X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36072 MachineBasicBlock *BB) const {
36073 const MIMetadata MIMD(MI);
36074 MachineFunction *MF = BB->getParent();
36075 MachineRegisterInfo *MRI = &MF->getRegInfo();
36076 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36077 int FI = MF->getFrameInfo().getFunctionContextIndex();
36078
36079 // Get a mapping of the call site numbers to all of the landing pads they're
36080 // associated with.
36081 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
36082 unsigned MaxCSNum = 0;
36083 for (auto &MBB : *MF) {
36084 if (!MBB.isEHPad())
36085 continue;
36086
36087 MCSymbol *Sym = nullptr;
36088 for (const auto &MI : MBB) {
36089 if (MI.isDebugInstr())
36090 continue;
36091
36092 assert(MI.isEHLabel() && "expected EH_LABEL");
36093 Sym = MI.getOperand(0).getMCSymbol();
36094 break;
36095 }
36096
36097 if (!MF->hasCallSiteLandingPad(Sym))
36098 continue;
36099
36100 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
36101 CallSiteNumToLPad[CSI].push_back(&MBB);
36102 MaxCSNum = std::max(MaxCSNum, CSI);
36103 }
36104 }
36105
36106 // Get an ordered list of the machine basic blocks for the jump table.
36107 std::vector<MachineBasicBlock *> LPadList;
36108 SmallPtrSet<MachineBasicBlock *, 64> InvokeBBs;
36109 LPadList.reserve(CallSiteNumToLPad.size());
36110
36111 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
36112 for (auto &LP : CallSiteNumToLPad[CSI]) {
36113 LPadList.push_back(LP);
36114 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
36115 }
36116 }
36117
36118 assert(!LPadList.empty() &&
36119 "No landing pad destinations for the dispatch jump table!");
36120
36121 // Create the MBBs for the dispatch code.
36122
36123 // Shove the dispatch's address into the return slot in the function context.
36124 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
36125 DispatchBB->setIsEHPad(true);
36126
36127 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
36128 BuildMI(TrapBB, MIMD, TII->get(X86::TRAP));
36129 DispatchBB->addSuccessor(TrapBB);
36130
36131 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
36132 DispatchBB->addSuccessor(DispContBB);
36133
36134 // Insert MBBs.
36135 MF->push_back(DispatchBB);
36136 MF->push_back(DispContBB);
36137 MF->push_back(TrapBB);
36138
36139 // Insert code into the entry block that creates and registers the function
36140 // context.
36141 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
36142
36143 // Create the jump table and associated information
36144 unsigned JTE = getJumpTableEncoding();
36145 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
36146 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
36147
36148 const X86RegisterInfo &RI = TII->getRegisterInfo();
36149 // Add a register mask with no preserved registers. This results in all
36150 // registers being marked as clobbered.
36151 if (RI.hasBasePointer(*MF)) {
36152 const bool FPIs64Bit =
36153 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36154 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
36155 MFI->setRestoreBasePointer(MF);
36156
36157 Register FP = RI.getFrameRegister(*MF);
36158 Register BP = RI.getBaseRegister();
36159 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
36160 addRegOffset(BuildMI(DispatchBB, MIMD, TII->get(Op), BP), FP, true,
36161              MFI->getRestoreBasePointerOffset())
36162       .addRegMask(RI.getNoPreservedMask());
36163 } else {
36164 BuildMI(DispatchBB, MIMD, TII->get(X86::NOOP))
36165       .addRegMask(RI.getNoPreservedMask());
36166 }
36167
36168 // IReg is used as an index in a memory operand and therefore can't be SP
36169 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
36170 addFrameReference(BuildMI(DispatchBB, MIMD, TII->get(X86::MOV32rm), IReg), FI,
36171 Subtarget.is64Bit() ? 8 : 4);
36172 BuildMI(DispatchBB, MIMD, TII->get(X86::CMP32ri))
36173 .addReg(IReg)
36174 .addImm(LPadList.size());
36175 BuildMI(DispatchBB, MIMD, TII->get(X86::JCC_1))
36176 .addMBB(TrapBB)
36177     .addImm(X86::COND_AE);
36178
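  // (An out-of-range call-site index therefore branches to TrapBB, which
  // executes a trap instruction.)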
36179 if (Subtarget.is64Bit()) {
36180 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36181 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
36182
36183 // leaq .LJTI0_0(%rip), BReg
36184 BuildMI(DispContBB, MIMD, TII->get(X86::LEA64r), BReg)
36185 .addReg(X86::RIP)
36186 .addImm(1)
36187 .addReg(0)
36188 .addJumpTableIndex(MJTI)
36189 .addReg(0);
36190 // movzx IReg64, IReg
36191 BuildMI(DispContBB, MIMD, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
36192 .addImm(0)
36193 .addReg(IReg)
36194 .addImm(X86::sub_32bit);
36195
36196 switch (JTE) {
36197   case MachineJumpTableInfo::EK_BlockAddress:
36198     // jmpq *(BReg,IReg64,8)
36199 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64m))
36200 .addReg(BReg)
36201 .addImm(8)
36202 .addReg(IReg64)
36203 .addImm(0)
36204 .addReg(0);
36205 break;
36206   case MachineJumpTableInfo::EK_Custom32: {
36207     Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
36208 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
36209 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
36210
36211 // movl (BReg,IReg64,4), OReg
36212 BuildMI(DispContBB, MIMD, TII->get(X86::MOV32rm), OReg)
36213 .addReg(BReg)
36214 .addImm(4)
36215 .addReg(IReg64)
36216 .addImm(0)
36217 .addReg(0);
36218 // movsx OReg64, OReg
36219 BuildMI(DispContBB, MIMD, TII->get(X86::MOVSX64rr32), OReg64)
36220 .addReg(OReg);
36221 // addq BReg, OReg64, TReg
36222 BuildMI(DispContBB, MIMD, TII->get(X86::ADD64rr), TReg)
36223 .addReg(OReg64)
36224 .addReg(BReg);
36225 // jmpq *TReg
36226 BuildMI(DispContBB, MIMD, TII->get(X86::JMP64r)).addReg(TReg);
36227 break;
36228 }
36229 default:
36230 llvm_unreachable("Unexpected jump table encoding");
36231 }
36232 } else {
36233 // jmpl *.LJTI0_0(,IReg,4)
36234 BuildMI(DispContBB, MIMD, TII->get(X86::JMP32m))
36235 .addReg(0)
36236 .addImm(4)
36237 .addReg(IReg)
36238 .addJumpTableIndex(MJTI)
36239 .addReg(0);
36240 }
36241
36242 // Add the jump table entries as successors to the MBB.
36243 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
36244 for (auto &LP : LPadList)
36245 if (SeenMBBs.insert(LP).second)
36246 DispContBB->addSuccessor(LP);
36247
36248 // N.B. the order the invoke BBs are processed in doesn't matter here.
36249 SmallVector<MachineBasicBlock *, 64> MBBLPads;
36250 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
36251 for (MachineBasicBlock *MBB : InvokeBBs) {
36252 // Remove the landing pad successor from the invoke block and replace it
36253 // with the new dispatch block.
36254 // Keep a copy of Successors since it's modified inside the loop.
36255   SmallVector<MachineBasicBlock *, 4> Successors(MBB->succ_rbegin(),
36256                                                  MBB->succ_rend());
36257 // FIXME: Avoid quadratic complexity.
36258 for (auto *MBBS : Successors) {
36259 if (MBBS->isEHPad()) {
36260 MBB->removeSuccessor(MBBS);
36261 MBBLPads.push_back(MBBS);
36262 }
36263 }
36264
36265 MBB->addSuccessor(DispatchBB);
36266
36267 // Find the invoke call and mark all of the callee-saved registers as
36268 // 'implicit defined' so that they're spilled. This prevents later passes
36269 // from moving instructions to before the EH block, where they would never
36270 // be executed.
36271 for (auto &II : reverse(*MBB)) {
36272 if (!II.isCall())
36273 continue;
36274
36275     DenseMap<unsigned, bool> DefRegs;
36276     for (auto &MOp : II.operands())
36277 if (MOp.isReg())
36278 DefRegs[MOp.getReg()] = true;
36279
36280 MachineInstrBuilder MIB(*MF, &II);
36281 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
36282 unsigned Reg = SavedRegs[RegIdx];
36283 if (!DefRegs[Reg])
36284         MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
36285     }
36286
36287 break;
36288 }
36289 }
36290
36291 // Mark all former landing pads as non-landing pads. The dispatch is the only
36292 // landing pad now.
36293 for (auto &LP : MBBLPads)
36294 LP->setIsEHPad(false);
36295
36296 // The instruction is gone now.
36297 MI.eraseFromParent();
36298 return BB;
36299}
36300
36301MachineBasicBlock *
36302X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
36303                                               MachineBasicBlock *BB) const {
36304 MachineFunction *MF = BB->getParent();
36305 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36306 const MIMetadata MIMD(MI);
36307
36308 auto TMMImmToTMMReg = [](unsigned Imm) {
36309 assert (Imm < 8 && "Illegal tmm index");
36310 return X86::TMM0 + Imm;
36311 };
36312 switch (MI.getOpcode()) {
36313 default: llvm_unreachable("Unexpected instr type to insert");
36314 case X86::TLS_addr32:
36315 case X86::TLS_addr64:
36316 case X86::TLS_addrX32:
36317 case X86::TLS_base_addr32:
36318 case X86::TLS_base_addr64:
36319 case X86::TLS_base_addrX32:
36320 case X86::TLS_desc32:
36321 case X86::TLS_desc64:
36322 return EmitLoweredTLSAddr(MI, BB);
36323 case X86::INDIRECT_THUNK_CALL32:
36324 case X86::INDIRECT_THUNK_CALL64:
36325 case X86::INDIRECT_THUNK_TCRETURN32:
36326 case X86::INDIRECT_THUNK_TCRETURN64:
36327 return EmitLoweredIndirectThunk(MI, BB);
36328 case X86::CATCHRET:
36329 return EmitLoweredCatchRet(MI, BB);
36330 case X86::SEG_ALLOCA_32:
36331 case X86::SEG_ALLOCA_64:
36332 return EmitLoweredSegAlloca(MI, BB);
36333 case X86::PROBED_ALLOCA_32:
36334 case X86::PROBED_ALLOCA_64:
36335 return EmitLoweredProbedAlloca(MI, BB);
36336 case X86::TLSCall_32:
36337 case X86::TLSCall_64:
36338 return EmitLoweredTLSCall(MI, BB);
36339 case X86::CMOV_FR16:
36340 case X86::CMOV_FR16X:
36341 case X86::CMOV_FR32:
36342 case X86::CMOV_FR32X:
36343 case X86::CMOV_FR64:
36344 case X86::CMOV_FR64X:
36345 case X86::CMOV_GR8:
36346 case X86::CMOV_GR16:
36347 case X86::CMOV_GR32:
36348 case X86::CMOV_RFP32:
36349 case X86::CMOV_RFP64:
36350 case X86::CMOV_RFP80:
36351 case X86::CMOV_VR64:
36352 case X86::CMOV_VR128:
36353 case X86::CMOV_VR128X:
36354 case X86::CMOV_VR256:
36355 case X86::CMOV_VR256X:
36356 case X86::CMOV_VR512:
36357 case X86::CMOV_VK1:
36358 case X86::CMOV_VK2:
36359 case X86::CMOV_VK4:
36360 case X86::CMOV_VK8:
36361 case X86::CMOV_VK16:
36362 case X86::CMOV_VK32:
36363 case X86::CMOV_VK64:
36364 return EmitLoweredSelect(MI, BB);
36365
36366 case X86::FP80_ADDr:
36367 case X86::FP80_ADDm32: {
36368 // Change the floating point control register to use double extended
36369 // precision when performing the addition.
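    // (Reference note: the x87 control word's precision-control field occupies
    // bits 8-9; 0b00 selects a 24-bit significand, 0b10 a 53-bit significand,
    // and 0b11 the full 64-bit extended significand. The sequence below is:
    // store CW, set bits 8-9, reload CW, perform the add, restore the old CW.)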
36370 int OrigCWFrameIdx =
36371 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36372 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36373 OrigCWFrameIdx);
36374
36375 // Load the old value of the control word...
36376 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36377 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36378 OrigCWFrameIdx);
36379
36380     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
36381 // precision.
36382 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36383 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36384 .addReg(OldCW, RegState::Kill)
36385 .addImm(0x300);
36386
36387 // Extract to 16 bits.
36388 Register NewCW16 =
36389 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36390 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36391 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36392
36393 // Prepare memory for FLDCW.
36394 int NewCWFrameIdx =
36395 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36396 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36397 NewCWFrameIdx)
36398 .addReg(NewCW16, RegState::Kill);
36399
36400 // Reload the modified control word now...
36401 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36402 NewCWFrameIdx);
36403
36404 // Do the addition.
36405 if (MI.getOpcode() == X86::FP80_ADDr) {
36406 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80))
36407 .add(MI.getOperand(0))
36408 .add(MI.getOperand(1))
36409 .add(MI.getOperand(2));
36410 } else {
36411 BuildMI(*BB, MI, MIMD, TII->get(X86::ADD_Fp80m32))
36412 .add(MI.getOperand(0))
36413 .add(MI.getOperand(1))
36414 .add(MI.getOperand(2))
36415 .add(MI.getOperand(3))
36416 .add(MI.getOperand(4))
36417 .add(MI.getOperand(5))
36418 .add(MI.getOperand(6));
36419 }
36420
36421 // Reload the original control word now.
36422 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36423 OrigCWFrameIdx);
36424
36425 MI.eraseFromParent(); // The pseudo instruction is gone now.
36426 return BB;
36427 }
36428
36429 case X86::FP32_TO_INT16_IN_MEM:
36430 case X86::FP32_TO_INT32_IN_MEM:
36431 case X86::FP32_TO_INT64_IN_MEM:
36432 case X86::FP64_TO_INT16_IN_MEM:
36433 case X86::FP64_TO_INT32_IN_MEM:
36434 case X86::FP64_TO_INT64_IN_MEM:
36435 case X86::FP80_TO_INT16_IN_MEM:
36436 case X86::FP80_TO_INT32_IN_MEM:
36437 case X86::FP80_TO_INT64_IN_MEM: {
36438 // Change the floating point control register to use "round towards zero"
36439 // mode when truncating to an integer value.
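    // (Reference note: the rounding-control field of the x87 control word is
    // bits 10-11, and 0b11 selects round-toward-zero, which matches the
    // truncating semantics of a C floating-point-to-integer cast; that is why
    // 0xC00 is ORed in below.)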
36440 int OrigCWFrameIdx =
36441 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36442 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FNSTCW16m)),
36443 OrigCWFrameIdx);
36444
36445 // Load the old value of the control word...
36446 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36447 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOVZX32rm16), OldCW),
36448 OrigCWFrameIdx);
36449
36450     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
36451 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
36452 BuildMI(*BB, MI, MIMD, TII->get(X86::OR32ri), NewCW)
36453 .addReg(OldCW, RegState::Kill).addImm(0xC00);
36454
36455 // Extract to 16 bits.
36456 Register NewCW16 =
36457 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
36458 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), NewCW16)
36459 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
36460
36461 // Prepare memory for FLDCW.
36462 int NewCWFrameIdx =
36463 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
36464 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::MOV16mr)),
36465 NewCWFrameIdx)
36466 .addReg(NewCW16, RegState::Kill);
36467
36468 // Reload the modified control word now...
36469 addFrameReference(BuildMI(*BB, MI, MIMD,
36470 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
36471
36472 // Get the X86 opcode to use.
36473 unsigned Opc;
36474 switch (MI.getOpcode()) {
36475 // clang-format off
36476 default: llvm_unreachable("illegal opcode!");
36477 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
36478 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
36479 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
36480 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
36481 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
36482 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
36483 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
36484 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
36485 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
36486 // clang-format on
36487 }
36488
36489     X86AddressMode AM = getAddressFromInstr(&MI, 0);
36490     addFullAddress(BuildMI(*BB, MI, MIMD, TII->get(Opc)), AM)
36491 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
36492
36493 // Reload the original control word now.
36494 addFrameReference(BuildMI(*BB, MI, MIMD, TII->get(X86::FLDCW16m)),
36495 OrigCWFrameIdx);
36496
36497 MI.eraseFromParent(); // The pseudo instruction is gone now.
36498 return BB;
36499 }
36500
36501 // xbegin
36502 case X86::XBEGIN:
36503 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
36504
36505 case X86::VAARG_64:
36506 case X86::VAARG_X32:
36507 return EmitVAARGWithCustomInserter(MI, BB);
36508
36509 case X86::EH_SjLj_SetJmp32:
36510 case X86::EH_SjLj_SetJmp64:
36511 return emitEHSjLjSetJmp(MI, BB);
36512
36513 case X86::EH_SjLj_LongJmp32:
36514 case X86::EH_SjLj_LongJmp64:
36515 return emitEHSjLjLongJmp(MI, BB);
36516
36517 case X86::Int_eh_sjlj_setup_dispatch:
36518 return EmitSjLjDispatchBlock(MI, BB);
36519
36520 case TargetOpcode::STATEPOINT:
36521 // As an implementation detail, STATEPOINT shares the STACKMAP format at
36522 // this point in the process. We diverge later.
36523 return emitPatchPoint(MI, BB);
36524
36525 case TargetOpcode::STACKMAP:
36526 case TargetOpcode::PATCHPOINT:
36527 return emitPatchPoint(MI, BB);
36528
36529 case TargetOpcode::PATCHABLE_EVENT_CALL:
36530 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
36531 return BB;
36532
36533 case X86::LCMPXCHG8B: {
36534 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36535     // In addition to the four E[ABCD] registers implied by the encoding,
36536     // CMPXCHG8B requires a memory operand. If the current architecture is
36537     // i686 and the current function needs a base pointer
36538     // - which is ESI for i686 - the register allocator would not be able to
36539     // allocate registers for an address of the form X(%reg, %reg, Y):
36540     // there would never be enough unreserved registers during regalloc
36541     // (without the need for a base pointer the only option would be
36542     // X(%edi, %esi, Y)). We give the register allocator a hand by
36543     // precomputing the address in a new vreg using LEA.
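    // Illustrative sketch only (hypothetical registers, not taken from real
    // output):
    //   lock cmpxchg8b 16(%esi,%edi,4)   ; implicit EAX/EBX/ECX/EDX + 2-reg addr
    // becomes
    //   leal 16(%esi,%edi,4), %vreg
    //   lock cmpxchg8b (%vreg)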
36544
36545 // If it is not i686 or there is no base pointer - nothing to do here.
36546 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
36547 return BB;
36548
36549     // Even though this code does not necessarily need the base pointer to
36550     // be ESI, we check for that. The reason: if this assert fails, some
36551     // changes have happened in the compiler's base pointer handling, which
36552     // most probably have to be addressed somehow here.
36553 assert(TRI->getBaseRegister() == X86::ESI &&
36554 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
36555 "base pointer in mind");
36556
36557     MachineRegisterInfo &MRI = MF->getRegInfo();
36558     MVT SPTy = getPointerTy(MF->getDataLayout());
36559 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
36560 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
36561
36563 // Regalloc does not need any help when the memory operand of CMPXCHG8B
36564 // does not use index register.
36565 if (AM.IndexReg == X86::NoRegister)
36566 return BB;
36567
36568 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
36569 // four operand definitions that are E[ABCD] registers. We skip them and
36570 // then insert the LEA.
36571 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
36572 while (RMBBI != BB->rend() &&
36573 (RMBBI->definesRegister(X86::EAX, /*TRI=*/nullptr) ||
36574 RMBBI->definesRegister(X86::EBX, /*TRI=*/nullptr) ||
36575 RMBBI->definesRegister(X86::ECX, /*TRI=*/nullptr) ||
36576 RMBBI->definesRegister(X86::EDX, /*TRI=*/nullptr))) {
36577 ++RMBBI;
36578 }
36579     MachineBasicBlock::iterator MBBI(RMBBI.getReverse());
36580     addFullAddress(
36581         BuildMI(*BB, *MBBI, MIMD, TII->get(X86::LEA32r), computedAddrVReg), AM);
36582
36583 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
36584
36585 return BB;
36586 }
36587 case X86::LCMPXCHG16B_NO_RBX: {
36588 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36589 Register BasePtr = TRI->getBaseRegister();
36590 if (TRI->hasBasePointer(*MF) &&
36591 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
36592 if (!BB->isLiveIn(BasePtr))
36593 BB->addLiveIn(BasePtr);
36594 // Save RBX into a virtual register.
36595 Register SaveRBX =
36596 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36597 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36598 .addReg(X86::RBX);
36599 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36600       MachineInstrBuilder MIB =
36601           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
36602 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36603 MIB.add(MI.getOperand(Idx));
36604 MIB.add(MI.getOperand(X86::AddrNumOperands));
36605 MIB.addReg(SaveRBX);
36606 } else {
36607 // Simple case, just copy the virtual register to RBX.
36608 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::RBX)
36609 .add(MI.getOperand(X86::AddrNumOperands));
36610       MachineInstrBuilder MIB =
36611           BuildMI(*BB, MI, MIMD, TII->get(X86::LCMPXCHG16B));
36612 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
36613 MIB.add(MI.getOperand(Idx));
36614 }
36615 MI.eraseFromParent();
36616 return BB;
36617 }
36618 case X86::MWAITX: {
36619 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36620 Register BasePtr = TRI->getBaseRegister();
36621 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
36622     // If there is no need to save the base pointer, we generate MWAITXrrr;
36623     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
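    // (MWAITX consumes the same EAX/ECX inputs as MWAIT and additionally takes
    // an optional TSC-based timeout in EBX, which is why EBX must be populated
    // here even though it may double as the base pointer.)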
36624 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
36625 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36626 .addReg(MI.getOperand(0).getReg());
36627 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36628 .addReg(MI.getOperand(1).getReg());
36629 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EBX)
36630 .addReg(MI.getOperand(2).getReg());
36631 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITXrrr));
36632 MI.eraseFromParent();
36633 } else {
36634 if (!BB->isLiveIn(BasePtr)) {
36635 BB->addLiveIn(BasePtr);
36636 }
36637 // Parameters can be copied into ECX and EAX but not EBX yet.
36638 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::ECX)
36639 .addReg(MI.getOperand(0).getReg());
36640 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), X86::EAX)
36641 .addReg(MI.getOperand(1).getReg());
36642 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
36643 // Save RBX into a virtual register.
36644 Register SaveRBX =
36645 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36646 BuildMI(*BB, MI, MIMD, TII->get(TargetOpcode::COPY), SaveRBX)
36647 .addReg(X86::RBX);
36648 // Generate mwaitx pseudo.
36649 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
36650 BuildMI(*BB, MI, MIMD, TII->get(X86::MWAITX_SAVE_RBX))
36651 .addDef(Dst) // Destination tied in with SaveRBX.
36652 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
36653 .addUse(SaveRBX); // Save of base pointer.
36654 MI.eraseFromParent();
36655 }
36656 return BB;
36657 }
36658 case TargetOpcode::PREALLOCATED_SETUP: {
36659 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
36660 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36661 MFI->setHasPreallocatedCall(true);
36662 int64_t PreallocatedId = MI.getOperand(0).getImm();
36663 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
36664 assert(StackAdjustment != 0 && "0 stack adjustment");
36665 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
36666 << StackAdjustment << "\n");
36667 BuildMI(*BB, MI, MIMD, TII->get(X86::SUB32ri), X86::ESP)
36668 .addReg(X86::ESP)
36669 .addImm(StackAdjustment);
36670 MI.eraseFromParent();
36671 return BB;
36672 }
36673 case TargetOpcode::PREALLOCATED_ARG: {
36674 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
36675 int64_t PreallocatedId = MI.getOperand(1).getImm();
36676 int64_t ArgIdx = MI.getOperand(2).getImm();
36677 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
36678 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
36679 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
36680 << ", arg offset " << ArgOffset << "\n");
36681 // stack pointer + offset
36682 addRegOffset(BuildMI(*BB, MI, MIMD, TII->get(X86::LEA32r),
36683 MI.getOperand(0).getReg()),
36684 X86::ESP, false, ArgOffset);
36685 MI.eraseFromParent();
36686 return BB;
36687 }
36688 case X86::PTDPBSSD:
36689 case X86::PTDPBSUD:
36690 case X86::PTDPBUSD:
36691 case X86::PTDPBUUD:
36692 case X86::PTDPBF16PS:
36693 case X86::PTDPFP16PS: {
36694 unsigned Opc;
36695 switch (MI.getOpcode()) {
36696 // clang-format off
36697 default: llvm_unreachable("illegal opcode!");
36698 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
36699 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
36700 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
36701 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
36702 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
36703 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
36704 // clang-format on
36705 }
36706
36707 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36708 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36709 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36710 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36711 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36712
36713 MI.eraseFromParent(); // The pseudo is gone now.
36714 return BB;
36715 }
36716 case X86::PTILEZERO: {
36717 unsigned Imm = MI.getOperand(0).getImm();
36718 BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
36719 MI.eraseFromParent(); // The pseudo is gone now.
36720 return BB;
36721 }
36722 case X86::PTILELOADD:
36723 case X86::PTILELOADDT1:
36724 case X86::PTILESTORED: {
36725 unsigned Opc;
36726 switch (MI.getOpcode()) {
36727 default: llvm_unreachable("illegal opcode!");
36728#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? OPC##_EVEX : OPC)
36729 case X86::PTILELOADD:
36730 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
36731 break;
36732 case X86::PTILELOADDT1:
36733 Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1);
36734 break;
36735 case X86::PTILESTORED:
36736 Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED);
36737 break;
36738#undef GET_EGPR_IF_ENABLED
36739 }
36740
36741 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36742 unsigned CurOp = 0;
36743 if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX)
36744       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36745                  RegState::Define);
36746
36747 MIB.add(MI.getOperand(CurOp++)); // base
36748 MIB.add(MI.getOperand(CurOp++)); // scale
36749 MIB.add(MI.getOperand(CurOp++)); // index -- stride
36750 MIB.add(MI.getOperand(CurOp++)); // displacement
36751 MIB.add(MI.getOperand(CurOp++)); // segment
36752
36753 if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX)
36754       MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
36755                  RegState::Undef);
36756
36757 MI.eraseFromParent(); // The pseudo is gone now.
36758 return BB;
36759 }
36760 case X86::PTCMMIMFP16PS:
36761 case X86::PTCMMRLFP16PS: {
36762 const MIMetadata MIMD(MI);
36763 unsigned Opc;
36764 switch (MI.getOpcode()) {
36765 // clang-format off
36766 default: llvm_unreachable("Unexpected instruction!");
36767 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
36768 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
36769 // clang-format on
36770 }
36771 MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc));
36772 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
36773 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
36774 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
36775 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
36776 MI.eraseFromParent(); // The pseudo is gone now.
36777 return BB;
36778 }
36779 }
36780}
36781
36782//===----------------------------------------------------------------------===//
36783// X86 Optimization Hooks
36784//===----------------------------------------------------------------------===//
36785
36786bool
36787 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
36788                                                 const APInt &DemandedBits,
36789 const APInt &DemandedElts,
36790 TargetLoweringOpt &TLO) const {
36791 EVT VT = Op.getValueType();
36792 unsigned Opcode = Op.getOpcode();
36793 unsigned EltSize = VT.getScalarSizeInBits();
36794
36795 if (VT.isVector()) {
36796 // If the constant is only all signbits in the active bits, then we should
36797     // extend it to the entire constant to allow it to act as a boolean constant
36798 // vector.
36799 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
36800 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
36801 return false;
36802 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
36803 if (!DemandedElts[i] || V.getOperand(i).isUndef())
36804 continue;
36805 const APInt &Val = V.getConstantOperandAPInt(i);
36806 if (Val.getBitWidth() > Val.getNumSignBits() &&
36807 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
36808 return true;
36809 }
36810 return false;
36811 };
36812 // For vectors - if we have a constant, then try to sign extend.
36813 // TODO: Handle AND cases.
36814 unsigned ActiveBits = DemandedBits.getActiveBits();
36815 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
36816 (Opcode == ISD::OR || Opcode == ISD::XOR || Opcode == X86ISD::ANDNP) &&
36817 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
36818 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
36819       EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
36820                                    VT.getVectorNumElements());
36821       SDValue NewC =
36822           TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
36823                           Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
36824 SDValue NewOp =
36825 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
36826 return TLO.CombineTo(Op, NewOp);
36827 }
36828 return false;
36829 }
36830
36831 // Only optimize Ands to prevent shrinking a constant that could be
36832 // matched by movzx.
36833 if (Opcode != ISD::AND)
36834 return false;
36835
36836 // Make sure the RHS really is a constant.
36837 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
36838 if (!C)
36839 return false;
36840
36841 const APInt &Mask = C->getAPIntValue();
36842
36843 // Clear all non-demanded bits initially.
36844 APInt ShrunkMask = Mask & DemandedBits;
36845
36846 // Find the width of the shrunk mask.
36847 unsigned Width = ShrunkMask.getActiveBits();
36848
36849 // If the mask is all 0s there's nothing to do here.
36850 if (Width == 0)
36851 return false;
36852
36853 // Find the next power of 2 width, rounding up to a byte.
36854 Width = llvm::bit_ceil(std::max(Width, 8U));
36855 // Truncate the width to size to handle illegal types.
36856 Width = std::min(Width, EltSize);
36857
36858 // Calculate a possible zero extend mask for this constant.
36859 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
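  // Worked example (illustrative): for 'x & 0x7f' with only the low 4 bits
  // demanded, ShrunkMask is 0xf, Width rounds up to 8 and ZeroExtendMask is
  // 0xff; widening the constant to 0xff lets the AND later be matched as a
  // MOVZX.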
36860
36861 // If we aren't changing the mask, just return true to keep it and prevent
36862 // the caller from optimizing.
36863 if (ZeroExtendMask == Mask)
36864 return true;
36865
36866 // Make sure the new mask can be represented by a combination of mask bits
36867 // and non-demanded bits.
36868 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
36869 return false;
36870
36871 // Replace the constant with the zero extend mask.
36872 SDLoc DL(Op);
36873 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
36874 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
36875 return TLO.CombineTo(Op, NewOp);
36876}
36877
36878 static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS,
36879                                       KnownBits &Known,
36880 const APInt &DemandedElts,
36881 const SelectionDAG &DAG, unsigned Depth) {
36882 KnownBits Known2;
36883 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
36884 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
36885 Known = DAG.computeKnownBits(RHS, DemandedSrcElts, Depth + 1);
36886 Known2 = DAG.computeKnownBits(LHS, DemandedSrcElts, Depth + 1);
36887 Known = KnownBits::abdu(Known, Known2).zext(16);
36888 // Known = (((D0 + D1) + (D2 + D3)) + ((D4 + D5) + (D6 + D7)))
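  // Each 64-bit PSADBW result lane is the sum of eight absolute byte
  // differences, so it is at most 8 * 255 = 2040; the three add steps below
  // model that pairwise reduction tree (8 -> 4 -> 2 -> 1 partial sums).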
36889 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36890 Known, Known);
36891 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36892 Known, Known);
36893 Known = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/true, /*NUW=*/true,
36894 Known, Known);
36895 Known = Known.zext(64);
36896}
36897
36898 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
36899                                                       KnownBits &Known,
36900 const APInt &DemandedElts,
36901 const SelectionDAG &DAG,
36902 unsigned Depth) const {
36903 unsigned BitWidth = Known.getBitWidth();
36904 unsigned NumElts = DemandedElts.getBitWidth();
36905 unsigned Opc = Op.getOpcode();
36906 EVT VT = Op.getValueType();
36907 assert((Opc >= ISD::BUILTIN_OP_END ||
36908 Opc == ISD::INTRINSIC_WO_CHAIN ||
36909 Opc == ISD::INTRINSIC_W_CHAIN ||
36910 Opc == ISD::INTRINSIC_VOID) &&
36911 "Should use MaskedValueIsZero if you don't know whether Op"
36912 " is a target node!");
36913
36914 Known.resetAll();
36915 switch (Opc) {
36916 default: break;
36917 case X86ISD::MUL_IMM: {
36918 KnownBits Known2;
36919 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
36920 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36921 Known = KnownBits::mul(Known, Known2);
36922 break;
36923 }
36924 case X86ISD::SETCC:
36925 Known.Zero.setBitsFrom(1);
36926 break;
36927 case X86ISD::MOVMSK: {
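    // MOVMSK packs one sign bit per source vector element into the low bits of
    // the scalar result, so every bit above that range is known zero.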
36928 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
36929 Known.Zero.setBitsFrom(NumLoBits);
36930 break;
36931 }
36932 case X86ISD::PEXTRB:
36933 case X86ISD::PEXTRW: {
36934 SDValue Src = Op.getOperand(0);
36935 EVT SrcVT = Src.getValueType();
36936 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
36937 Op.getConstantOperandVal(1));
36938 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
36939 Known = Known.anyextOrTrunc(BitWidth);
36940 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
36941 break;
36942 }
36943 case X86ISD::VSRAI:
36944 case X86ISD::VSHLI:
36945 case X86ISD::VSRLI: {
36946 unsigned ShAmt = Op.getConstantOperandVal(1);
36947 if (ShAmt >= VT.getScalarSizeInBits()) {
36948 // Out of range logical bit shifts are guaranteed to be zero.
36949 // Out of range arithmetic bit shifts splat the sign bit.
36950 if (Opc != X86ISD::VSRAI) {
36951 Known.setAllZero();
36952 break;
36953 }
36954
36955 ShAmt = VT.getScalarSizeInBits() - 1;
36956 }
36957
36958 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
36959 if (Opc == X86ISD::VSHLI) {
36960 Known.Zero <<= ShAmt;
36961 Known.One <<= ShAmt;
36962 // Low bits are known zero.
36963 Known.Zero.setLowBits(ShAmt);
36964 } else if (Opc == X86ISD::VSRLI) {
36965 Known.Zero.lshrInPlace(ShAmt);
36966 Known.One.lshrInPlace(ShAmt);
36967 // High bits are known zero.
36968 Known.Zero.setHighBits(ShAmt);
36969 } else {
36970 Known.Zero.ashrInPlace(ShAmt);
36971 Known.One.ashrInPlace(ShAmt);
36972 }
36973 break;
36974 }
36975 case X86ISD::PACKUS: {
36976 // PACKUS is just a truncation if the upper half is zero.
36977 APInt DemandedLHS, DemandedRHS;
36978 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
36979
36980 Known.One = APInt::getAllOnes(BitWidth * 2);
36981 Known.Zero = APInt::getAllOnes(BitWidth * 2);
36982
36983 KnownBits Known2;
36984 if (!!DemandedLHS) {
36985 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
36986 Known = Known.intersectWith(Known2);
36987 }
36988 if (!!DemandedRHS) {
36989 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
36990 Known = Known.intersectWith(Known2);
36991 }
36992
36993 if (Known.countMinLeadingZeros() < BitWidth)
36994 Known.resetAll();
36995 Known = Known.trunc(BitWidth);
36996 break;
36997 }
36998 case X86ISD::PSHUFB: {
36999 SDValue Src = Op.getOperand(0);
37000 SDValue Idx = Op.getOperand(1);
37001
37002 // If the index vector is never negative (MSB is zero), then all elements
37003 // come from the source vector. This is useful for cases where
37004 // PSHUFB is being used as a LUT (ctpop etc.) - the target shuffle handling
37005 // below will handle the more common constant shuffle mask case.
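    // (PSHUFB zeroes any destination byte whose control byte has its MSB set;
    // if every index is known non-negative no element can be zeroed, so the
    // known bits are simply those common to all elements of Src.)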
37006 KnownBits KnownIdx = DAG.computeKnownBits(Idx, DemandedElts, Depth + 1);
37007 if (KnownIdx.isNonNegative())
37008 Known = DAG.computeKnownBits(Src, Depth + 1);
37009 break;
37010 }
37011 case X86ISD::VBROADCAST: {
37012 SDValue Src = Op.getOperand(0);
37013 if (!Src.getSimpleValueType().isVector()) {
37014 Known = DAG.computeKnownBits(Src, Depth + 1);
37015 return;
37016 }
37017 break;
37018 }
37019 case X86ISD::AND: {
37020 if (Op.getResNo() == 0) {
37021 KnownBits Known2;
37022 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37023 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37024 Known &= Known2;
37025 }
37026 break;
37027 }
37028 case X86ISD::ANDNP: {
37029 KnownBits Known2;
37030 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37031 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37032
37033 // ANDNP = (~X & Y);
37034 Known.One &= Known2.Zero;
37035 Known.Zero |= Known2.One;
37036 break;
37037 }
37038 case X86ISD::FOR: {
37039 KnownBits Known2;
37040 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37041 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37042
37043 Known |= Known2;
37044 break;
37045 }
37046 case X86ISD::PSADBW: {
37047 SDValue LHS = Op.getOperand(0);
37048 SDValue RHS = Op.getOperand(1);
37049 assert(VT.getScalarType() == MVT::i64 &&
37050 LHS.getValueType() == RHS.getValueType() &&
37051 LHS.getValueType().getScalarType() == MVT::i8 &&
37052 "Unexpected PSADBW types");
37053 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37054 break;
37055 }
37056 case X86ISD::PCMPGT:
37057 case X86ISD::PCMPEQ: {
37058 KnownBits KnownLhs =
37059 DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37060 KnownBits KnownRhs =
37061 DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37062 std::optional<bool> Res = Opc == X86ISD::PCMPEQ
37063 ? KnownBits::eq(KnownLhs, KnownRhs)
37064 : KnownBits::sgt(KnownLhs, KnownRhs);
37065 if (Res) {
37066 if (*Res)
37067 Known.setAllOnes();
37068 else
37069 Known.setAllZero();
37070 }
37071 break;
37072 }
37073 case X86ISD::PMULUDQ: {
37074 KnownBits Known2;
37075 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37076 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37077
37078 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37079 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37080 Known = KnownBits::mul(Known, Known2);
37081 break;
37082 }
37083 case X86ISD::CMOV: {
37084 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37085 // If we don't know any bits, early out.
37086 if (Known.isUnknown())
37087 break;
37088 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37089
37090 // Only known if known in both the LHS and RHS.
37091 Known = Known.intersectWith(Known2);
37092 break;
37093 }
37094 case X86ISD::BEXTR:
37095 case X86ISD::BEXTRI: {
37096 SDValue Op0 = Op.getOperand(0);
37097 SDValue Op1 = Op.getOperand(1);
37098
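    // BEXTR's control operand encodes the starting bit position in bits [7:0]
    // and the length of the extracted field in bits [15:8].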
37099 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37100 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37101 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37102
37103 // If the length is 0, the result is 0.
37104 if (Length == 0) {
37105 Known.setAllZero();
37106 break;
37107 }
37108
37109 if ((Shift + Length) <= BitWidth) {
37110 Known = DAG.computeKnownBits(Op0, Depth + 1);
37111 Known = Known.extractBits(Length, Shift);
37112 Known = Known.zextOrTrunc(BitWidth);
37113 }
37114 }
37115 break;
37116 }
37117 case X86ISD::PDEP: {
37118 KnownBits Known2;
37119 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37120 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37121 // Zeros are retained from the mask operand. But not ones.
37122 Known.One.clearAllBits();
37123 // The result will have at least as many trailing zeros as the non-mask
37124 // operand since bits can only map to the same or higher bit position.
37125 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37126 break;
37127 }
37128 case X86ISD::PEXT: {
37129 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37130 // The result has as many leading zeros as the number of zeroes in the mask.
37131 unsigned Count = Known.Zero.popcount();
37132 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37133 Known.One.clearAllBits();
37134 break;
37135 }
37136 case X86ISD::VTRUNC:
37137 case X86ISD::VTRUNCS:
37138 case X86ISD::VTRUNCUS:
37139 case X86ISD::CVTSI2P:
37140 case X86ISD::CVTUI2P:
37141 case X86ISD::CVTP2SI:
37142 case X86ISD::CVTP2UI:
37143 case X86ISD::MCVTP2SI:
37144 case X86ISD::MCVTP2UI:
37145 case X86ISD::CVTTP2SI:
37146 case X86ISD::CVTTP2UI:
37147 case X86ISD::MCVTTP2SI:
37148 case X86ISD::MCVTTP2UI:
37149 case X86ISD::MCVTSI2P:
37150 case X86ISD::MCVTUI2P:
37151 case X86ISD::VFPROUND:
37152 case X86ISD::VMFPROUND:
37153 case X86ISD::CVTPS2PH:
37154 case X86ISD::MCVTPS2PH: {
37155 // Truncations/Conversions - upper elements are known zero.
37156 EVT SrcVT = Op.getOperand(0).getValueType();
37157 if (SrcVT.isVector()) {
37158 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37159 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37160 Known.setAllZero();
37161 }
37162 break;
37163 }
37164   case X86ISD::STRICT_CVTTP2SI:
37165   case X86ISD::STRICT_CVTTP2UI:
37166   case X86ISD::STRICT_CVTSI2P:
37167   case X86ISD::STRICT_CVTUI2P:
37168   case X86ISD::STRICT_VFPROUND:
37169   case X86ISD::STRICT_CVTPS2PH: {
37170     // Strict Conversions - upper elements are known zero.
37171 EVT SrcVT = Op.getOperand(1).getValueType();
37172 if (SrcVT.isVector()) {
37173 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37174 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
37175 Known.setAllZero();
37176 }
37177 break;
37178 }
37179 case X86ISD::MOVQ2DQ: {
37180 // Move from MMX to XMM. Upper half of XMM should be 0.
37181 if (DemandedElts.countr_zero() >= (NumElts / 2))
37182 Known.setAllZero();
37183 break;
37184 }
37185   case X86ISD::VBROADCAST_LOAD: {
37186     APInt UndefElts;
37187 SmallVector<APInt, 16> EltBits;
37188 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37189 /*AllowWholeUndefs*/ false,
37190 /*AllowPartialUndefs*/ false)) {
37191 Known.Zero.setAllBits();
37192 Known.One.setAllBits();
37193 for (unsigned I = 0; I != NumElts; ++I) {
37194 if (!DemandedElts[I])
37195 continue;
37196 if (UndefElts[I]) {
37197 Known.resetAll();
37198 break;
37199 }
37200 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
37201 Known = Known.intersectWith(Known2);
37202 }
37203 return;
37204 }
37205 break;
37206 }
37207   case ISD::INTRINSIC_WO_CHAIN: {
37208     switch (Op->getConstantOperandVal(0)) {
37209 case Intrinsic::x86_sse2_psad_bw:
37210 case Intrinsic::x86_avx2_psad_bw:
37211 case Intrinsic::x86_avx512_psad_bw_512: {
37212 SDValue LHS = Op.getOperand(1);
37213 SDValue RHS = Op.getOperand(2);
37214 assert(VT.getScalarType() == MVT::i64 &&
37215 LHS.getValueType() == RHS.getValueType() &&
37216 LHS.getValueType().getScalarType() == MVT::i8 &&
37217 "Unexpected PSADBW types");
37218 computeKnownBitsForPSADBW(LHS, RHS, Known, DemandedElts, DAG, Depth);
37219 break;
37220 }
37221 }
37222 break;
37223 }
37224 }
37225
37226 // Handle target shuffles.
37227 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37228 if (isTargetShuffle(Opc)) {
37229     SmallVector<int, 64> Mask;
37230     SmallVector<SDValue, 2> Ops;
37231     if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37232 unsigned NumOps = Ops.size();
37233 unsigned NumElts = VT.getVectorNumElements();
37234 if (Mask.size() == NumElts) {
37235 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37236 Known.Zero.setAllBits(); Known.One.setAllBits();
37237 for (unsigned i = 0; i != NumElts; ++i) {
37238 if (!DemandedElts[i])
37239 continue;
37240 int M = Mask[i];
37241 if (M == SM_SentinelUndef) {
37242 // For UNDEF elements, we don't know anything about the common state
37243 // of the shuffle result.
37244 Known.resetAll();
37245 break;
37246 }
37247 if (M == SM_SentinelZero) {
37248 Known.One.clearAllBits();
37249 continue;
37250 }
37251 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37252 "Shuffle index out of range");
37253
37254 unsigned OpIdx = (unsigned)M / NumElts;
37255 unsigned EltIdx = (unsigned)M % NumElts;
37256 if (Ops[OpIdx].getValueType() != VT) {
37257 // TODO - handle target shuffle ops with different value types.
37258 Known.resetAll();
37259 break;
37260 }
37261 DemandedOps[OpIdx].setBit(EltIdx);
37262 }
37263 // Known bits are the values that are shared by every demanded element.
37264 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
37265 if (!DemandedOps[i])
37266 continue;
37267 KnownBits Known2 =
37268 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
37269 Known = Known.intersectWith(Known2);
37270 }
37271 }
37272 }
37273 }
37274}
37275
37276 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
37277     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
37278 unsigned Depth) const {
37279 EVT VT = Op.getValueType();
37280 unsigned VTBits = VT.getScalarSizeInBits();
37281 unsigned Opcode = Op.getOpcode();
37282 switch (Opcode) {
37283   case X86ISD::SETCC_CARRY:
37284     // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
37285 return VTBits;
37286
37287 case X86ISD::VTRUNC: {
37288 SDValue Src = Op.getOperand(0);
37289 MVT SrcVT = Src.getSimpleValueType();
37290 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
37291 assert(VTBits < NumSrcBits && "Illegal truncation input type");
37292 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
37293 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
37294 if (Tmp > (NumSrcBits - VTBits))
37295 return Tmp - (NumSrcBits - VTBits);
37296 return 1;
37297 }
37298
37299 case X86ISD::PACKSS: {
37300 // PACKSS is just a truncation if the sign bits extend to the packed size.
37301 APInt DemandedLHS, DemandedRHS;
37302 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
37303 DemandedRHS);
37304
37305 // Helper to detect PACKSSDW(BITCAST(PACKSSDW(X)),BITCAST(PACKSSDW(Y)))
37306 // patterns often used to compact vXi64 allsignbit patterns.
37307 auto NumSignBitsPACKSS = [&](SDValue V, const APInt &Elts) -> unsigned {
37308       SDValue BC = peekThroughBitcasts(V);
37309       if (BC.getOpcode() == X86ISD::PACKSS &&
37310 BC.getScalarValueSizeInBits() == 16 &&
37311 V.getScalarValueSizeInBits() == 32) {
37312         SDValue BC0 = peekThroughBitcasts(BC.getOperand(0));
37313         SDValue BC1 = peekThroughBitcasts(BC.getOperand(1));
37314         if (BC0.getScalarValueSizeInBits() == 64 &&
37315 BC1.getScalarValueSizeInBits() == 64 &&
37316 DAG.ComputeNumSignBits(BC0, Depth + 1) == 64 &&
37317 DAG.ComputeNumSignBits(BC1, Depth + 1) == 64)
37318 return 32;
37319 }
37320 return DAG.ComputeNumSignBits(V, Elts, Depth + 1);
37321 };
37322
37323 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
37324 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
37325 if (!!DemandedLHS)
37326 Tmp0 = NumSignBitsPACKSS(Op.getOperand(0), DemandedLHS);
37327 if (!!DemandedRHS)
37328 Tmp1 = NumSignBitsPACKSS(Op.getOperand(1), DemandedRHS);
37329 unsigned Tmp = std::min(Tmp0, Tmp1);
37330 if (Tmp > (SrcBits - VTBits))
37331 return Tmp - (SrcBits - VTBits);
37332 return 1;
37333 }
37334
37335 case X86ISD::VBROADCAST: {
37336 SDValue Src = Op.getOperand(0);
37337 if (!Src.getSimpleValueType().isVector())
37338 return DAG.ComputeNumSignBits(Src, Depth + 1);
37339 break;
37340 }
37341
37342 case X86ISD::VSHLI: {
37343 SDValue Src = Op.getOperand(0);
37344 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
37345 if (ShiftVal.uge(VTBits))
37346 return VTBits; // Shifted all bits out --> zero.
37347 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37348 if (ShiftVal.uge(Tmp))
37349 return 1; // Shifted all sign bits out --> unknown.
37350 return Tmp - ShiftVal.getZExtValue();
37351 }
37352
37353 case X86ISD::VSRAI: {
37354 SDValue Src = Op.getOperand(0);
37355 APInt ShiftVal = Op.getConstantOperandAPInt(1);
37356 if (ShiftVal.uge(VTBits - 1))
37357 return VTBits; // Sign splat.
37358 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
37359 ShiftVal += Tmp;
37360 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
37361 }
37362
37363 case X86ISD::FSETCC:
37364 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
37365 if (VT == MVT::f32 || VT == MVT::f64 ||
37366 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
37367 return VTBits;
37368 break;
37369
37370 case X86ISD::PCMPGT:
37371 case X86ISD::PCMPEQ:
37372 case X86ISD::CMPP:
37373 case X86ISD::VPCOM:
37374 case X86ISD::VPCOMU:
37375 // Vector compares return zero/all-bits result values.
37376 return VTBits;
37377
37378 case X86ISD::ANDNP: {
37379 unsigned Tmp0 =
37380 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
37381 if (Tmp0 == 1) return 1; // Early out.
37382 unsigned Tmp1 =
37383 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
37384 return std::min(Tmp0, Tmp1);
37385 }
37386
37387 case X86ISD::CMOV: {
37388 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
37389 if (Tmp0 == 1) return 1; // Early out.
37390 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
37391 return std::min(Tmp0, Tmp1);
37392 }
37393 }
37394
37395 // Handle target shuffles.
37396 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
37397 if (isTargetShuffle(Opcode)) {
37398     SmallVector<int, 64> Mask;
37399     SmallVector<SDValue, 2> Ops;
37400     if (getTargetShuffleMask(Op, true, Ops, Mask)) {
37401 unsigned NumOps = Ops.size();
37402 unsigned NumElts = VT.getVectorNumElements();
37403 if (Mask.size() == NumElts) {
37404 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
37405 for (unsigned i = 0; i != NumElts; ++i) {
37406 if (!DemandedElts[i])
37407 continue;
37408 int M = Mask[i];
37409 if (M == SM_SentinelUndef) {
37410 // For UNDEF elements, we don't know anything about the common state
37411 // of the shuffle result.
37412 return 1;
37413 } else if (M == SM_SentinelZero) {
37414 // Zero = all sign bits.
37415 continue;
37416 }
37417 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
37418 "Shuffle index out of range");
37419
37420 unsigned OpIdx = (unsigned)M / NumElts;
37421 unsigned EltIdx = (unsigned)M % NumElts;
37422 if (Ops[OpIdx].getValueType() != VT) {
37423 // TODO - handle target shuffle ops with different value types.
37424 return 1;
37425 }
37426 DemandedOps[OpIdx].setBit(EltIdx);
37427 }
37428 unsigned Tmp0 = VTBits;
37429 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
37430 if (!DemandedOps[i])
37431 continue;
37432 unsigned Tmp1 =
37433 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
37434 Tmp0 = std::min(Tmp0, Tmp1);
37435 }
37436 return Tmp0;
37437 }
37438 }
37439 }
37440
37441 // Fallback case.
37442 return 1;
37443}
37444
37445 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
37446   if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
37447 return N->getOperand(0);
37448 return N;
37449}
37450
37451// Helper to look for a normal load that can be narrowed into a vzload with the
37452// specified VT and memory VT. Returns SDValue() on failure.
37453 static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
37454                                   SelectionDAG &DAG) {
37455 // Can't if the load is volatile or atomic.
37456 if (!LN->isSimple())
37457 return SDValue();
37458
37459 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
37460 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
37461 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
37462 LN->getPointerInfo(), LN->getOriginalAlign(),
37463 LN->getMemOperand()->getFlags());
37464}
37465
37466// Attempt to match a combined shuffle mask against supported unary shuffle
37467// instructions.
37468// TODO: Investigate sharing more of this with shuffle lowering.
37469static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37470 bool AllowFloatDomain, bool AllowIntDomain,
37471 SDValue V1, const SelectionDAG &DAG,
37472 const X86Subtarget &Subtarget, unsigned &Shuffle,
37473 MVT &SrcVT, MVT &DstVT) {
37474 unsigned NumMaskElts = Mask.size();
37475 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
37476
37477 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
37478 if (Mask[0] == 0 &&
37479 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
37480 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
37481         (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37482          isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
37483 Shuffle = X86ISD::VZEXT_MOVL;
37484 if (MaskEltSize == 16)
37485 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37486 else
37487 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37488 return true;
37489 }
37490 }
37491
37492   // Match against an ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction.
37493 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
37494 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
37495 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
37496 unsigned MaxScale = 64 / MaskEltSize;
37497 bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize &&
37498 DAG.ComputeNumSignBits(V1) == MaskEltSize;
37499 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
37500 bool MatchAny = true;
37501 bool MatchZero = true;
37502 bool MatchSign = UseSign;
37503 unsigned NumDstElts = NumMaskElts / Scale;
37504 for (unsigned i = 0;
37505 i != NumDstElts && (MatchAny || MatchSign || MatchZero); ++i) {
37506 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
37507 MatchAny = MatchSign = MatchZero = false;
37508 break;
37509 }
37510 unsigned Pos = (i * Scale) + 1;
37511 unsigned Len = Scale - 1;
37512 MatchAny &= isUndefInRange(Mask, Pos, Len);
37513 MatchZero &= isUndefOrZeroInRange(Mask, Pos, Len);
37514 MatchSign &= isUndefOrEqualInRange(Mask, (int)i, Pos, Len);
37515 }
37516 if (MatchAny || MatchSign || MatchZero) {
37517 assert((MatchSign || MatchZero) &&
37518 "Failed to match sext/zext but matched aext?");
37519 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
37520 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType()
37521 : MVT::getIntegerVT(MaskEltSize);
37522 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
37523
37524 Shuffle = unsigned(
37525 MatchAny ? ISD::ANY_EXTEND
37526 : (MatchSign ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND));
37527 if (SrcVT.getVectorNumElements() != NumDstElts)
37528 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
37529
37530 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
37531 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
37532 return true;
37533 }
37534 }
37535 }
37536
37537 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
37538 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
37539 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
37540 isUndefOrEqual(Mask[0], 0) &&
37541 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
37542 Shuffle = X86ISD::VZEXT_MOVL;
37543 if (MaskEltSize == 16)
37544 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
37545 else
37546 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
37547 return true;
37548 }
37549
37550 // Check if we have SSE3 which will let us use MOVDDUP etc. The
37551   // instructions are no slower than UNPCKLPD but have the option to
37552 // fold the input operand into even an unaligned memory load.
37553 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
37554 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
37555 Shuffle = X86ISD::MOVDDUP;
37556 SrcVT = DstVT = MVT::v2f64;
37557 return true;
37558 }
37559 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37560 Shuffle = X86ISD::MOVSLDUP;
37561 SrcVT = DstVT = MVT::v4f32;
37562 return true;
37563 }
37564 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
37565 Shuffle = X86ISD::MOVSHDUP;
37566 SrcVT = DstVT = MVT::v4f32;
37567 return true;
37568 }
37569 }
37570
37571 if (MaskVT.is256BitVector() && AllowFloatDomain) {
37572 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
37573 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
37574 Shuffle = X86ISD::MOVDDUP;
37575 SrcVT = DstVT = MVT::v4f64;
37576 return true;
37577 }
37578 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37579 V1)) {
37580 Shuffle = X86ISD::MOVSLDUP;
37581 SrcVT = DstVT = MVT::v8f32;
37582 return true;
37583 }
37584 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
37585 V1)) {
37586 Shuffle = X86ISD::MOVSHDUP;
37587 SrcVT = DstVT = MVT::v8f32;
37588 return true;
37589 }
37590 }
37591
37592 if (MaskVT.is512BitVector() && AllowFloatDomain) {
37593 assert(Subtarget.hasAVX512() &&
37594 "AVX512 required for 512-bit vector shuffles");
37595 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
37596 V1)) {
37597 Shuffle = X86ISD::MOVDDUP;
37598 SrcVT = DstVT = MVT::v8f64;
37599 return true;
37600 }
37601     if (isTargetShuffleEquivalent(
37602             MaskVT, Mask,
37603 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
37604 Shuffle = X86ISD::MOVSLDUP;
37605 SrcVT = DstVT = MVT::v16f32;
37606 return true;
37607 }
37608     if (isTargetShuffleEquivalent(
37609             MaskVT, Mask,
37610 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
37611 Shuffle = X86ISD::MOVSHDUP;
37612 SrcVT = DstVT = MVT::v16f32;
37613 return true;
37614 }
37615 }
37616
37617 return false;
37618}
37619
37620// Attempt to match a combined shuffle mask against supported unary immediate
37621// permute instructions.
37622// TODO: Investigate sharing more of this with shuffle lowering.
37623 static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
37624                                      const APInt &Zeroable,
37625 bool AllowFloatDomain, bool AllowIntDomain,
37626 const SelectionDAG &DAG,
37627 const X86Subtarget &Subtarget,
37628 unsigned &Shuffle, MVT &ShuffleVT,
37629 unsigned &PermuteImm) {
37630 unsigned NumMaskElts = Mask.size();
37631 unsigned InputSizeInBits = MaskVT.getSizeInBits();
37632 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
37633 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
37634 bool ContainsZeros = isAnyZero(Mask);
37635
37636   // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
37637 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
37638 // Check for lane crossing permutes.
37639 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
37640 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
37641 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
37642 Shuffle = X86ISD::VPERMI;
37643 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
37644 PermuteImm = getV4X86ShuffleImm(Mask);
37645 return true;
37646 }
37647 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
37648 SmallVector<int, 4> RepeatedMask;
37649 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
37650 Shuffle = X86ISD::VPERMI;
37651 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
37652 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
37653 return true;
37654 }
37655 }
37656 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
37657 // VPERMILPD can permute with a non-repeating shuffle.
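      // VPERMILPD uses one immediate bit per f64 element, each selecting the
      // low or high element within that element's own 128-bit lane, so the
      // selection may differ from lane to lane (unlike lane-repeated masks).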
37658 Shuffle = X86ISD::VPERMILPI;
37659 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
37660 PermuteImm = 0;
37661 for (int i = 0, e = Mask.size(); i != e; ++i) {
37662 int M = Mask[i];
37663 if (M == SM_SentinelUndef)
37664 continue;
37665 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
37666 PermuteImm |= (M & 1) << i;
37667 }
37668 return true;
37669 }
37670 }
37671
37672   // We check for both a shuffle match and a shift match. Loop twice so we can
37673   // choose which to try to match first, depending on target preference.
37674 for (unsigned Order = 0; Order < 2; ++Order) {
37675 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
37676 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
37677 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
37678 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
37679 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
37680 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
37681 SmallVector<int, 4> RepeatedMask;
37682 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37683 // Narrow the repeated mask to create 32-bit element permutes.
37684 SmallVector<int, 4> WordMask = RepeatedMask;
37685 if (MaskScalarSizeInBits == 64)
37686 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
37687
37688 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
37689 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
37690 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
37691 PermuteImm = getV4X86ShuffleImm(WordMask);
37692 return true;
37693 }
37694 }
37695
37696 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
37697 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
37698 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37699 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37700 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37701 SmallVector<int, 4> RepeatedMask;
37702 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
37703 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
37704 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
37705
37706 // PSHUFLW: permute lower 4 elements only.
37707 if (isUndefOrInRange(LoMask, 0, 4) &&
37708 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
37709 Shuffle = X86ISD::PSHUFLW;
37710 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37711 PermuteImm = getV4X86ShuffleImm(LoMask);
37712 return true;
37713 }
37714
37715 // PSHUFHW: permute upper 4 elements only.
37716 if (isUndefOrInRange(HiMask, 4, 8) &&
37717 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
37718 // Offset the HiMask so that we can create the shuffle immediate.
37719 int OffsetHiMask[4];
37720 for (int i = 0; i != 4; ++i)
37721 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
37722
37723 Shuffle = X86ISD::PSHUFHW;
37724 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
37725 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
37726 return true;
37727 }
37728 }
37729 }
37730 } else {
37731 // Attempt to match against bit rotates.
37732 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
37733 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
37734 Subtarget.hasAVX512())) {
37735 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
37736 Subtarget, Mask);
37737 if (0 < RotateAmt) {
37738 Shuffle = X86ISD::VROTLI;
37739 PermuteImm = (unsigned)RotateAmt;
37740 return true;
37741 }
37742 }
37743 }
37744 // Attempt to match against byte/bit shifts.
37745 if (AllowIntDomain &&
37746 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37747 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37748 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37749 int ShiftAmt =
37750 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
37751 Zeroable, Subtarget);
37752 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
37753 32 <= ShuffleVT.getScalarSizeInBits())) {
37754 // Byte shifts can be slower so only match them on second attempt.
37755 if (Order == 0 &&
37756 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
37757 continue;
37758
37759 PermuteImm = (unsigned)ShiftAmt;
37760 return true;
37761 }
37762
37763 }
37764 }
37765
37766 return false;
37767}
37768
37769// Attempt to match a combined unary shuffle mask against supported binary
37770// shuffle instructions.
37771// TODO: Investigate sharing more of this with shuffle lowering.
37772static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
37773 bool AllowFloatDomain, bool AllowIntDomain,
37774 SDValue &V1, SDValue &V2, const SDLoc &DL,
37775 SelectionDAG &DAG, const X86Subtarget &Subtarget,
37776 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
37777 bool IsUnary) {
37778 unsigned NumMaskElts = Mask.size();
37779 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37780 unsigned SizeInBits = MaskVT.getSizeInBits();
37781
37782 if (MaskVT.is128BitVector()) {
37783 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
37784 AllowFloatDomain) {
37785 V2 = V1;
37786 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
37787 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
37788 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37789 return true;
37790 }
37791 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
37792 AllowFloatDomain) {
37793 V2 = V1;
37794 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
37795 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
37796 return true;
37797 }
37798 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
37799 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
37800 std::swap(V1, V2);
37801 Shuffle = X86ISD::MOVSD;
37802 SrcVT = DstVT = MVT::v2f64;
37803 return true;
37804 }
37805 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
37806 (AllowFloatDomain || !Subtarget.hasSSE41())) {
37807 Shuffle = X86ISD::MOVSS;
37808 SrcVT = DstVT = MVT::v4f32;
37809 return true;
37810 }
37811 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
37812 DAG) &&
37813 Subtarget.hasFP16()) {
37814 Shuffle = X86ISD::MOVSH;
37815 SrcVT = DstVT = MVT::v8f16;
37816 return true;
37817 }
37818 }
37819
37820 // Attempt to match against either an unary or binary PACKSS/PACKUS shuffle.
37821 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
37822 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
37823 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
37824 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
37825 Subtarget)) {
37826 DstVT = MaskVT;
37827 return true;
37828 }
37829 }
37830 // TODO: Can we handle this inside matchShuffleWithPACK?
37831 if (MaskVT == MVT::v4i32 && Subtarget.hasSSE2() &&
37832 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2, 4, 6}, DAG) &&
37833 V1.getScalarValueSizeInBits() == 64 &&
37834 V2.getScalarValueSizeInBits() == 64) {
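    // (PACKUS saturates each input to the unsigned range of the narrower
    // element type, so it only behaves as a plain truncation when the
    // discarded upper bits are already known zero; hence the leading-zero
    // checks below.)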
37835     // Use (SSE41) PACKUSDW if the leading zero bits reach down to the lowest 16 bits.
37836 unsigned MinLZV1 = DAG.computeKnownBits(V1).countMinLeadingZeros();
37837 unsigned MinLZV2 = DAG.computeKnownBits(V2).countMinLeadingZeros();
37838 if (Subtarget.hasSSE41() && MinLZV1 >= 48 && MinLZV2 >= 48) {
37839 SrcVT = MVT::v4i32;
37840 DstVT = MVT::v8i16;
37841 Shuffle = X86ISD::PACKUS;
37842 return true;
37843 }
37844     // Use PACKUSWB if the leading zero bits reach down to the lowest 8 bits.
37845 if (MinLZV1 >= 56 && MinLZV2 >= 56) {
37846 SrcVT = MVT::v8i16;
37847 DstVT = MVT::v16i8;
37848 Shuffle = X86ISD::PACKUS;
37849 return true;
37850 }
37851     // Use PACKSSDW if the sign bits extend down to the lowest 16 bits.
37852 if (DAG.ComputeNumSignBits(V1) > 48 && DAG.ComputeNumSignBits(V2) > 48) {
37853 SrcVT = MVT::v4i32;
37854 DstVT = MVT::v8i16;
37855 Shuffle = X86ISD::PACKSS;
37856 return true;
37857 }
37858 }
37859
37860 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
37861 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
37862 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
37863 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
37864 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37865 (MaskVT.is512BitVector() && Subtarget.hasAVX512() &&
37866 (32 <= EltSizeInBits || Subtarget.hasBWI()))) {
37867 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
37868 Subtarget)) {
37869 SrcVT = DstVT = MaskVT;
37870 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
37871 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
37872 return true;
37873 }
37874 }
37875
37876 // Attempt to match against an OR if we're performing a blend shuffle and the
37877 // non-blended source element is zero in each case.
37878 // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits.
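// e.g. for Mask = {0, 5, 2, 7} the blend can become a plain OR provided
// elements {1, 3} of V1 and elements {0, 2} of V2 are known to be zero,
// since (V1 | V2) then yields { V1[0], V2[1], V1[2], V2[3] }.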
37879 if (SizeInBits == V1.getValueSizeInBits() &&
37880 SizeInBits == V2.getValueSizeInBits() &&
37881 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
37882 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
37883 bool IsBlend = true;
37884 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
37885 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
37886 unsigned Scale1 = NumV1Elts / NumMaskElts;
37887 unsigned Scale2 = NumV2Elts / NumMaskElts;
37888 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
37889 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
37890 for (unsigned i = 0; i != NumMaskElts; ++i) {
37891 int M = Mask[i];
37892 if (M == SM_SentinelUndef)
37893 continue;
37894 if (M == SM_SentinelZero) {
37895 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37896 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37897 continue;
37898 }
37899 if (M == (int)i) {
37900 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
37901 continue;
37902 }
37903 if (M == (int)(i + NumMaskElts)) {
37904 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
37905 continue;
37906 }
37907 IsBlend = false;
37908 break;
37909 }
37910 if (IsBlend) {
37911 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
37912 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
37913 Shuffle = ISD::OR;
37914 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37915 return true;
37916 }
37917 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
37918 // FIXME: handle mismatched sizes?
37919 // TODO: investigate if `ISD::OR` handling in
37920 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
37921 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
37922 unsigned NumElts = V.getValueType().getVectorNumElements();
37923 KnownBits Known(NumElts);
37924 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
37925 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
37926 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
37927 if (PeepholeKnown.isZero())
37928 Known.Zero.setBit(EltIdx);
37929 if (PeepholeKnown.isAllOnes())
37930 Known.One.setBit(EltIdx);
37931 }
37932 return Known;
37933 };
37934
37935 KnownBits V1Known = computeKnownBitsElementWise(V1);
37936 KnownBits V2Known = computeKnownBitsElementWise(V2);
37937
37938 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
37939 int M = Mask[i];
37940 if (M == SM_SentinelUndef)
37941 continue;
37942 if (M == SM_SentinelZero) {
37943 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
37944 continue;
37945 }
37946 if (M == (int)i) {
37947 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
37948 continue;
37949 }
37950 if (M == (int)(i + NumMaskElts)) {
37951 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
37952 continue;
37953 }
37954 llvm_unreachable("will not get here.");
37955 }
37956 if (IsBlend) {
37957 Shuffle = ISD::OR;
37958 SrcVT = DstVT = MaskVT.changeTypeToInteger();
37959 return true;
37960 }
37961 }
37962 }
37963 }
37964
37965 return false;
37966}
37967
37968static bool matchBinaryPermuteShuffle(
37969 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
37970 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
37971 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
37972 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
37973 unsigned NumMaskElts = Mask.size();
37974 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
37975
37976 // Attempt to match against VALIGND/VALIGNQ rotate.
37977 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
37978 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
37979 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
37980 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
37981 if (!isAnyZero(Mask)) {
37982 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
37983 if (0 < Rotation) {
37984 Shuffle = X86ISD::VALIGN;
37985 if (EltSizeInBits == 64)
37986 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
37987 else
37988 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
37989 PermuteImm = Rotation;
37990 return true;
37991 }
37992 }
37993 }
37994
37995 // Attempt to match against PALIGNR byte rotate.
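// (PALIGNR concatenates the two sources within each 128-bit lane and extracts
// a byte-shifted window, so any byte rotation of the source pair can match.)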
37996 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
37997 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
37998 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
37999 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38000 if (0 < ByteRotation) {
38001 Shuffle = X86ISD::PALIGNR;
38002 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38003 PermuteImm = ByteRotation;
38004 return true;
38005 }
38006 }
38007
38008 // Attempt to combine to X86ISD::BLENDI.
38009 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38010 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38011 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38012 uint64_t BlendMask = 0;
38013 bool ForceV1Zero = false, ForceV2Zero = false;
38014 SmallVector<int, 8> TargetMask(Mask);
38015 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
38016 ForceV2Zero, BlendMask)) {
38017 if (MaskVT == MVT::v16i16) {
38018 // We can only use v16i16 PBLENDW if the lanes are repeated.
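// (The 256-bit VPBLENDW applies the same 8-bit immediate to both 128-bit
// lanes, so the blend pattern must repeat per lane.)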
38019 SmallVector<int, 8> RepeatedMask;
38020 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38021 RepeatedMask)) {
38022 assert(RepeatedMask.size() == 8 &&
38023 "Repeated mask size doesn't match!");
38024 PermuteImm = 0;
38025 for (int i = 0; i < 8; ++i)
38026 if (RepeatedMask[i] >= 8)
38027 PermuteImm |= 1 << i;
38028 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38029 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38030 Shuffle = X86ISD::BLENDI;
38031 ShuffleVT = MaskVT;
38032 return true;
38033 }
38034 } else {
38035 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38036 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38037 PermuteImm = (unsigned)BlendMask;
38038 Shuffle = X86ISD::BLENDI;
38039 ShuffleVT = MaskVT;
38040 return true;
38041 }
38042 }
38043 }
38044
38045 // Attempt to combine to INSERTPS, but only if it has elements that need to
38046 // be set to zero.
38047 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38048 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38049 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38050 Shuffle = X86ISD::INSERTPS;
38051 ShuffleVT = MVT::v4f32;
38052 return true;
38053 }
38054
38055 // Attempt to combine to SHUFPD.
38056 if (AllowFloatDomain && EltSizeInBits == 64 &&
38057 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38058 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38059 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38060 bool ForceV1Zero = false, ForceV2Zero = false;
38061 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38062 PermuteImm, Mask, Zeroable)) {
38063 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38064 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38065 Shuffle = X86ISD::SHUFP;
38066 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38067 return true;
38068 }
38069 }
38070
38071 // Attempt to combine to SHUFPS.
38072 if (AllowFloatDomain && EltSizeInBits == 32 &&
38073 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38074 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38075 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38076 SmallVector<int, 4> RepeatedMask;
38077 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38078 // Match each half of the repeated mask, to determine if it's just
38079 // referencing one of the vectors, is zeroable or is entirely undef.
38080 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38081 int M0 = RepeatedMask[Offset];
38082 int M1 = RepeatedMask[Offset + 1];
38083
38084 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38085 return DAG.getUNDEF(MaskVT);
38086 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38087 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38088 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38089 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38090 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38091 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38092 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38093 return V1;
38094 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38095 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38096 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38097 return V2;
38098 }
38099
38100 return SDValue();
38101 };
38102
38103 int ShufMask[4] = {-1, -1, -1, -1};
38104 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38105 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38106
38107 if (Lo && Hi) {
38108 V1 = Lo;
38109 V2 = Hi;
38110 Shuffle = X86ISD::SHUFP;
38111 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38112 PermuteImm = getV4X86ShuffleImm(ShufMask);
38113 return true;
38114 }
38115 }
38116 }
38117
38118 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38119 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38120 MaskVT.is128BitVector() &&
38121 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38122 Shuffle = X86ISD::INSERTPS;
38123 ShuffleVT = MVT::v4f32;
38124 return true;
38125 }
38126
38127 return false;
38128}
38129
38130static SDValue combineX86ShuffleChainWithExtract(
38131 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38132 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38133 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38134 const X86Subtarget &Subtarget);
38135
38136/// Combine an arbitrary chain of shuffles into a single instruction if
38137/// possible.
38138///
38139/// This is the leaf of the recursive combine below. When we have found some
38140/// chain of single-use x86 shuffle instructions and accumulated the combined
38141/// shuffle mask represented by them, this will try to pattern match that mask
38142/// into either a single instruction if there is a special purpose instruction
38143/// for this operation, or into a PSHUFB instruction which is a fully general
38144/// instruction but should only be used to replace chains over a certain depth.
38145static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38146 ArrayRef<int> BaseMask, int Depth,
38147 bool HasVariableMask,
38148 bool AllowVariableCrossLaneMask,
38149 bool AllowVariablePerLaneMask,
38150 SelectionDAG &DAG,
38151 const X86Subtarget &Subtarget) {
38152 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38153 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38154 "Unexpected number of shuffle inputs!");
38155
38156 SDLoc DL(Root);
38157 MVT RootVT = Root.getSimpleValueType();
38158 unsigned RootSizeInBits = RootVT.getSizeInBits();
38159 unsigned NumRootElts = RootVT.getVectorNumElements();
38160
38161 // Canonicalize shuffle input op to the requested type.
38162 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38163 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38164 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38165 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38166 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38167 return DAG.getBitcast(VT, Op);
38168 };
38169
38170 // Find the inputs that enter the chain. Note that multiple uses are OK
38171 // here, we're not going to remove the operands we find.
38172 bool UnaryShuffle = (Inputs.size() == 1);
38173 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38174 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38175 : peekThroughBitcasts(Inputs[1]));
38176
38177 MVT VT1 = V1.getSimpleValueType();
38178 MVT VT2 = V2.getSimpleValueType();
38179 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38180 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38181
38182 SDValue Res;
38183
38184 unsigned NumBaseMaskElts = BaseMask.size();
38185 if (NumBaseMaskElts == 1) {
38186 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38187 return CanonicalizeShuffleInput(RootVT, V1);
38188 }
38189
38190 bool OptForSize = DAG.shouldOptForSize();
38191 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38192 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38193 (RootVT.isFloatingPoint() && Depth >= 1) ||
38194 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38195
38196 // Don't combine if we are an AVX512/EVEX target and the mask element size
38197 // is different from the root element size - this would prevent writemasks
38198 // from being reused.
38199 bool IsMaskedShuffle = false;
38200 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38201 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38202 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38203 IsMaskedShuffle = true;
38204 }
38205 }
38206
38207 // If we are shuffling a splat (and not introducing zeros) then we can just
38208 // use it directly. This works for smaller elements as well, as they already
38209 // repeat across each mask element.
38210 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38211 V1.getValueSizeInBits() >= RootSizeInBits &&
38212 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38213 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38214 return CanonicalizeShuffleInput(RootVT, V1);
38215 }
38216
38217 SmallVector<int, 64> Mask(BaseMask);
38218
38219 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38220 // etc. can be simplified.
38221 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38222 SmallVector<int> ScaledMask, IdentityMask;
38223 unsigned NumElts = VT1.getVectorNumElements();
38224 if (Mask.size() <= NumElts &&
38225 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38226 for (unsigned i = 0; i != NumElts; ++i)
38227 IdentityMask.push_back(i);
38228 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38229 V2))
38230 return CanonicalizeShuffleInput(RootVT, V1);
38231 }
38232 }
38233
38234 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38235 if (RootVT.is512BitVector() &&
38236 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38237 // If the upper subvectors are zeroable, then an extract+insert is more
38238 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
38239 // to zero the upper subvectors.
38240 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38241 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38242 return SDValue(); // Nothing to do!
38243 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38244 "Unexpected lane shuffle");
38245 Res = CanonicalizeShuffleInput(RootVT, V1);
38246 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38247 bool UseZero = isAnyZero(Mask);
38248 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38249 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38250 }
38251
38252 // Narrow shuffle mask to v4x128.
38253 SmallVector<int, 4> ScaledMask;
38254 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38255 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38256
38257 // Try to lower to vshuf64x2/vshuf32x4.
38258 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38259 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38260 SelectionDAG &DAG) {
38261 int PermMask[4] = {-1, -1, -1, -1};
38262 // Ensure elements came from the same Op.
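// SHUF128 builds result lanes [0,1] from the first source and lanes [2,3]
// from the second, so each pair of result lanes must come from a single op.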
38263 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38264 for (int i = 0; i < 4; ++i) {
38265 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38266 if (ScaledMask[i] < 0)
38267 continue;
38268
38269 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38270 unsigned OpIndex = i / 2;
38271 if (Ops[OpIndex].isUndef())
38272 Ops[OpIndex] = Op;
38273 else if (Ops[OpIndex] != Op)
38274 return SDValue();
38275
38276 PermMask[i] = ScaledMask[i] % 4;
38277 }
38278
38279 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38280 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38281 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38282 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
38283 };
38284
38285 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
38286 // doesn't work because our mask is for 128 bits and we don't have an MVT
38287 // to match that.
38288 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
38289 isUndefOrInRange(ScaledMask[1], 0, 2) &&
38290 isUndefOrInRange(ScaledMask[2], 2, 4) &&
38291 isUndefOrInRange(ScaledMask[3], 2, 4) &&
38292 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
38293 ScaledMask[0] == (ScaledMask[2] % 2)) &&
38294 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
38295 ScaledMask[1] == (ScaledMask[3] % 2));
38296
38297 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
38298 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38299 return SDValue(); // Nothing to do!
38300 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
38301 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
38302 return DAG.getBitcast(RootVT, V);
38303 }
38304 }
38305
38306 // Handle 128-bit lane shuffles of 256-bit vectors.
38307 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
38308 // If the upper half is zeroable, then an extract+insert is more optimal
38309 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
38310 // zero the upper half.
38311 if (isUndefOrZero(Mask[1])) {
38312 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38313 return SDValue(); // Nothing to do!
38314 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
38315 Res = CanonicalizeShuffleInput(RootVT, V1);
38316 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
38317 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
38318 256);
38319 }
38320
38321 // If we're inserting the low subvector, an insert-subvector 'concat'
38322 // pattern is quicker than VPERM2X128.
38323 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
38324 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
38325 !Subtarget.hasAVX2()) {
38326 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38327 return SDValue(); // Nothing to do!
38328 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
38329 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
38330 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
38331 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
38332 }
38333
38334 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
38335 return SDValue(); // Nothing to do!
38336
38337 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
38338 // we need to use the zeroing feature.
38339 // Prefer blends for sequential shuffles unless we are optimizing for size.
38340 if (UnaryShuffle &&
38341 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
38342 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
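// VPERM2X128 imm8: bits[1:0] choose the source 128-bit half for the lower
// result half, bits[5:4] for the upper half; bit 3 / bit 7 zero that half
// instead (hence the 0x8 for zeroable mask elements).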
38343 unsigned PermMask = 0;
38344 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
38345 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
38346 return DAG.getNode(
38347 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
38348 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
38349 }
38350
38351 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
38352 return SDValue(); // Nothing to do!
38353
38354 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
38355 if (!UnaryShuffle && !IsMaskedShuffle) {
38356 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
38357 "Unexpected shuffle sentinel value");
38358 // Prefer blends to X86ISD::VPERM2X128.
38359 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
38360 unsigned PermMask = 0;
38361 PermMask |= ((Mask[0] & 3) << 0);
38362 PermMask |= ((Mask[1] & 3) << 4);
38363 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
38364 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
38365 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
38366 CanonicalizeShuffleInput(RootVT, LHS),
38367 CanonicalizeShuffleInput(RootVT, RHS),
38368 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38369 }
38370 }
38371 }
38372
38373 // For masks that have been widened to 128-bit elements or more,
38374 // narrow back down to 64-bit elements.
38375 if (BaseMaskEltSizeInBits > 64) {
38376 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
38377 int MaskScale = BaseMaskEltSizeInBits / 64;
38378 SmallVector<int, 64> ScaledMask;
38379 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38380 Mask = std::move(ScaledMask);
38381 }
38382
38383 // For masked shuffles, we're trying to match the root width for better
38384 // writemask folding; attempt to scale the mask.
38385 // TODO - variable shuffles might need this to be widened again.
38386 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
38387 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
38388 int MaskScale = NumRootElts / Mask.size();
38389 SmallVector<int, 64> ScaledMask;
38390 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
38391 Mask = std::move(ScaledMask);
38392 }
38393
38394 unsigned NumMaskElts = Mask.size();
38395 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
38396 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38397
38398 // Determine the effective mask value type.
38399 FloatDomain &= (32 <= MaskEltSizeInBits);
38400 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
38401 : MVT::getIntegerVT(MaskEltSizeInBits);
38402 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
38403
38404 // Only allow legal mask types.
38405 if (!TLI.isTypeLegal(MaskVT))
38406 return SDValue();
38407
38408 // Attempt to match the mask against known shuffle patterns.
38409 MVT ShuffleSrcVT, ShuffleVT;
38410 unsigned Shuffle, PermuteImm;
38411
38412 // Which shuffle domains are permitted?
38413 // Permit domain crossing at higher combine depths.
38414 // TODO: Should we indicate which domain is preferred if both are allowed?
38415 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
38416 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
38417 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
38418
38419 // Determine zeroable mask elements.
38420 APInt KnownUndef, KnownZero;
38421 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
38422 APInt Zeroable = KnownUndef | KnownZero;
38423
38424 if (UnaryShuffle) {
38425 // Attempt to match against broadcast-from-vector.
38426 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
38427 if ((Subtarget.hasAVX2() ||
38428 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
38429 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
38430 if (isUndefOrEqual(Mask, 0)) {
38431 if (V1.getValueType() == MaskVT &&
38432 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38433 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
38434 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38435 return SDValue(); // Nothing to do!
38436 Res = V1.getOperand(0);
38437 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38438 return DAG.getBitcast(RootVT, Res);
38439 }
38440 if (Subtarget.hasAVX2()) {
38441 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
38442 return SDValue(); // Nothing to do!
38443 Res = CanonicalizeShuffleInput(MaskVT, V1);
38444 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
38445 return DAG.getBitcast(RootVT, Res);
38446 }
38447 }
38448 }
38449
38450 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
38451 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
38452 (!IsMaskedShuffle ||
38453 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38454 if (Depth == 0 && Root.getOpcode() == Shuffle)
38455 return SDValue(); // Nothing to do!
38456 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38457 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
38458 return DAG.getBitcast(RootVT, Res);
38459 }
38460
38461 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38462 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
38463 PermuteImm) &&
38464 (!IsMaskedShuffle ||
38465 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38466 if (Depth == 0 && Root.getOpcode() == Shuffle)
38467 return SDValue(); // Nothing to do!
38468 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
38469 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
38470 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38471 return DAG.getBitcast(RootVT, Res);
38472 }
38473 }
38474
38475 // Attempt to combine to INSERTPS, but only if the inserted element has come
38476 // from a scalar.
38477 // TODO: Handle other insertions here as well?
38478 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
38479 Subtarget.hasSSE41() &&
38480 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
38481 if (MaskEltSizeInBits == 32) {
38482 SDValue SrcV1 = V1, SrcV2 = V2;
38483 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
38484 DAG) &&
38485 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
38486 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38487 return SDValue(); // Nothing to do!
38488 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38489 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
38490 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
38491 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38492 return DAG.getBitcast(RootVT, Res);
38493 }
38494 }
38495 if (MaskEltSizeInBits == 64 &&
38496 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
38497 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38498 V2.getScalarValueSizeInBits() <= 32) {
38499 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
38500 return SDValue(); // Nothing to do!
38501 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
38502 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
38503 CanonicalizeShuffleInput(MVT::v4f32, V1),
38504 CanonicalizeShuffleInput(MVT::v4f32, V2),
38505 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38506 return DAG.getBitcast(RootVT, Res);
38507 }
38508 }
38509
38510 SDValue NewV1 = V1; // Save operands in case early exit happens.
38511 SDValue NewV2 = V2;
38512 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
38513 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
38514 ShuffleVT, UnaryShuffle) &&
38515 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38516 if (Depth == 0 && Root.getOpcode() == Shuffle)
38517 return SDValue(); // Nothing to do!
38518 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
38519 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
38520 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
38521 return DAG.getBitcast(RootVT, Res);
38522 }
38523
38524 NewV1 = V1; // Save operands in case early exit happens.
38525 NewV2 = V2;
38526 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
38527 AllowIntDomain, NewV1, NewV2, DL, DAG,
38528 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
38529 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
38530 if (Depth == 0 && Root.getOpcode() == Shuffle)
38531 return SDValue(); // Nothing to do!
38532 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
38533 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
38534 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
38535 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
38536 return DAG.getBitcast(RootVT, Res);
38537 }
38538
38539 // Typically from here on, we need an integer version of MaskVT.
38540 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
38541 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
38542
38543 // Annoyingly, SSE4A instructions don't map into the above match helpers.
38544 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
38545 uint64_t BitLen, BitIdx;
38546 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
38547 Zeroable)) {
38548 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
38549 return SDValue(); // Nothing to do!
38550 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38551 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
38552 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38553 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38554 return DAG.getBitcast(RootVT, Res);
38555 }
38556
38557 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
38558 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
38559 return SDValue(); // Nothing to do!
38560 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
38561 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
38562 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
38563 DAG.getTargetConstant(BitLen, DL, MVT::i8),
38564 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
38565 return DAG.getBitcast(RootVT, Res);
38566 }
38567 }
38568
38569 // Match shuffle against TRUNCATE patterns.
38570 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
38571 // Match against a VTRUNC instruction, accounting for src/dst sizes.
38572 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
38573 Subtarget)) {
38574 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
38575 ShuffleSrcVT.getVectorNumElements();
38576 unsigned Opc =
38577 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
38578 if (Depth == 0 && Root.getOpcode() == Opc)
38579 return SDValue(); // Nothing to do!
38580 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38581 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
38582 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
38583 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
38584 return DAG.getBitcast(RootVT, Res);
38585 }
38586
38587 // Do we need a more general binary truncation pattern?
38588 if (RootSizeInBits < 512 &&
38589 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
38590 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
38591 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
38592 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
38593 // Bail if this was already a truncation or PACK node.
38594 // We sometimes fail to match PACK if we demand known undef elements.
38595 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
38596 Root.getOpcode() == X86ISD::PACKSS ||
38597 Root.getOpcode() == X86ISD::PACKUS))
38598 return SDValue(); // Nothing to do!
38599 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38600 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
38601 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
38602 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
38603 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
38604 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
38605 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
38606 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
38607 return DAG.getBitcast(RootVT, Res);
38608 }
38609 }
38610
38611 // Don't try to re-form single instruction chains under any circumstances now
38612 // that we've done encoding canonicalization for them.
38613 if (Depth < 1)
38614 return SDValue();
38615
38616 // Depth threshold above which we can efficiently use variable mask shuffles.
38617 int VariableCrossLaneShuffleDepth =
38618 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
38619 int VariablePerLaneShuffleDepth =
38620 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
38621 AllowVariableCrossLaneMask &=
38622 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
38623 AllowVariablePerLaneMask &=
38624 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
38625 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
38626 // higher depth before combining them.
38627 bool AllowBWIVPERMV3 =
38628 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
38629
38630 bool MaskContainsZeros = isAnyZero(Mask);
38631
38632 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
38633 // If we have a single input lane-crossing shuffle then lower to VPERMV.
38634 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
38635 if (Subtarget.hasAVX2() &&
38636 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
38637 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
38638 Res = CanonicalizeShuffleInput(MaskVT, V1);
38639 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
38640 return DAG.getBitcast(RootVT, Res);
38641 }
38642 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
38643 if ((Subtarget.hasAVX512() &&
38644 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38645 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38646 (Subtarget.hasBWI() &&
38647 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38648 (Subtarget.hasVBMI() &&
38649 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
38650 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38651 V2 = DAG.getUNDEF(MaskVT);
38652 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38653 return DAG.getBitcast(RootVT, Res);
38654 }
38655 }
38656
38657 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
38658 // vector as the second source (non-VLX will pad to 512-bit shuffles).
38659 if (UnaryShuffle && AllowVariableCrossLaneMask &&
38660 ((Subtarget.hasAVX512() &&
38661 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38662 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38663 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
38664 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
38665 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38666 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38667 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38668 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38669 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
38670 for (unsigned i = 0; i != NumMaskElts; ++i)
38671 if (Mask[i] == SM_SentinelZero)
38672 Mask[i] = NumMaskElts + i;
38673 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38674 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
38675 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38676 return DAG.getBitcast(RootVT, Res);
38677 }
38678
38679 // If that failed and either input is extracted then try to combine as a
38680 // shuffle with the larger type.
38681 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38682 Inputs, Root, BaseMask, Depth, HasVariableMask,
38683 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
38684 Subtarget))
38685 return WideShuffle;
38686
38687 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
38688 // (non-VLX will pad to 512-bit shuffles).
38689 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
38690 ((Subtarget.hasAVX512() &&
38691 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
38692 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
38693 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
38694 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
38695 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38696 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
38697 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38698 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
38699 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38700 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38701 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38702 return DAG.getBitcast(RootVT, Res);
38703 }
38704 return SDValue();
38705 }
38706
38707 // See if we can combine a single input shuffle with zeros to a bit-mask,
38708 // which is much simpler than any shuffle.
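// e.g. Mask = {0, Zero, 2, Zero} is just V1 & {-1, 0, -1, 0}.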
38709 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
38710 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
38711 TLI.isTypeLegal(MaskVT)) {
38712 APInt Zero = APInt::getZero(MaskEltSizeInBits);
38713 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
38714 APInt UndefElts(NumMaskElts, 0);
38715 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
38716 for (unsigned i = 0; i != NumMaskElts; ++i) {
38717 int M = Mask[i];
38718 if (M == SM_SentinelUndef) {
38719 UndefElts.setBit(i);
38720 continue;
38721 }
38722 if (M == SM_SentinelZero)
38723 continue;
38724 EltBits[i] = AllOnes;
38725 }
38726 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
38727 Res = CanonicalizeShuffleInput(MaskVT, V1);
38728 unsigned AndOpcode =
38729 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
38730 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
38731 return DAG.getBitcast(RootVT, Res);
38732 }
38733
38734 // If we have a single input shuffle with different shuffle patterns in the
38735 // 128-bit lanes, use the variable mask form of VPERMILPS.
38736 // TODO: Combine other mask types at higher depths.
38737 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38738 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
38739 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
38740 SmallVector<SDValue, 16> VPermIdx;
38741 for (int M : Mask) {
38742 SDValue Idx =
38743 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
38744 VPermIdx.push_back(Idx);
38745 }
38746 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
38747 Res = CanonicalizeShuffleInput(MaskVT, V1);
38748 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
38749 return DAG.getBitcast(RootVT, Res);
38750 }
38751
38752 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
38753 // to VPERMIL2PD/VPERMIL2PS.
38754 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
38755 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
38756 MaskVT == MVT::v8f32)) {
38757 // VPERMIL2 Operation.
38758 // Bits[3] - Match Bit.
38759 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
38760 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
38761 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
38762 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
38763 SmallVector<int, 8> VPerm2Idx;
38764 unsigned M2ZImm = 0;
38765 for (int M : Mask) {
38766 if (M == SM_SentinelUndef) {
38767 VPerm2Idx.push_back(-1);
38768 continue;
38769 }
38770 if (M == SM_SentinelZero) {
38771 M2ZImm = 2;
38772 VPerm2Idx.push_back(8);
38773 continue;
38774 }
38775 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
38776 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
38777 VPerm2Idx.push_back(Index);
38778 }
38779 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38780 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38781 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
38782 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
38783 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
38784 return DAG.getBitcast(RootVT, Res);
38785 }
38786
38787 // If we have 3 or more shuffle instructions or a chain involving a variable
38788 // mask, we can replace them with a single PSHUFB instruction profitably.
38789 // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
38790 // instructions, but in practice PSHUFB tends to be *very* fast so we're
38791 // more aggressive.
38792 if (UnaryShuffle && AllowVariablePerLaneMask &&
38793 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38794 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
38795 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
38796 SmallVector<SDValue, 16> PSHUFBMask;
38797 int NumBytes = RootVT.getSizeInBits() / 8;
38798 int Ratio = NumBytes / NumMaskElts;
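// Expand each mask element into Ratio consecutive byte indices (e.g. a v4i32
// element M becomes bytes 4*M .. 4*M+3); a mask byte of 0x80 zeroes the
// corresponding destination byte.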
38799 for (int i = 0; i < NumBytes; ++i) {
38800 int M = Mask[i / Ratio];
38801 if (M == SM_SentinelUndef) {
38802 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
38803 continue;
38804 }
38805 if (M == SM_SentinelZero) {
38806 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38807 continue;
38808 }
38809 M = Ratio * M + i % Ratio;
38810 assert((M / 16) == (i / 16) && "Lane crossing detected");
38811 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38812 }
38813 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
38814 Res = CanonicalizeShuffleInput(ByteVT, V1);
38815 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
38816 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
38817 return DAG.getBitcast(RootVT, Res);
38818 }
38819
38820 // With XOP, if we have a 128-bit binary input shuffle we can always combine
38821 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
38822 // slower than PSHUFB on targets that support both.
38823 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
38824 Subtarget.hasXOP()) {
38825 // VPPERM Mask Operation
38826 // Bits[4:0] - Byte Index (0 - 31)
38827 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
38828 SmallVector<SDValue, 16> VPPERMMask;
38829 int NumBytes = 16;
38830 int Ratio = NumBytes / NumMaskElts;
38831 for (int i = 0; i < NumBytes; ++i) {
38832 int M = Mask[i / Ratio];
38833 if (M == SM_SentinelUndef) {
38834 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
38835 continue;
38836 }
38837 if (M == SM_SentinelZero) {
38838 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
38839 continue;
38840 }
38841 M = Ratio * M + i % Ratio;
38842 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
38843 }
38844 MVT ByteVT = MVT::v16i8;
38845 V1 = CanonicalizeShuffleInput(ByteVT, V1);
38846 V2 = CanonicalizeShuffleInput(ByteVT, V2);
38847 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
38848 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
38849 return DAG.getBitcast(RootVT, Res);
38850 }
38851
38852 // If that failed and either input is extracted then try to combine as a
38853 // shuffle with the larger type.
38854 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
38855 Inputs, Root, BaseMask, Depth, HasVariableMask,
38856 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
38857 return WideShuffle;
38858
38859 // If we have a dual input shuffle then lower to VPERMV3,
38860 // (non-VLX will pad to 512-bit shuffles)
38861 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
38862 ((Subtarget.hasAVX512() &&
38863 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
38864 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
38865 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
38866 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
38867 MaskVT == MVT::v16i32)) ||
38868 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
38869 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
38870 MaskVT == MVT::v32i16)) ||
38871 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
38872 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
38873 MaskVT == MVT::v64i8)))) {
38874 V1 = CanonicalizeShuffleInput(MaskVT, V1);
38875 V2 = CanonicalizeShuffleInput(MaskVT, V2);
38876 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
38877 return DAG.getBitcast(RootVT, Res);
38878 }
38879
38880 // Failed to find any combines.
38881 return SDValue();
38882}
38883
38884// Combine an arbitrary chain of shuffles + extract_subvectors into a single
38885// instruction if possible.
38886//
38887// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
38888// type size to attempt to combine:
38889// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
38890// -->
38891// extract_subvector(shuffle(x,y,m2),0)
38892static SDValue combineX86ShuffleChainWithExtract(
38893 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38894 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38895 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38896 const X86Subtarget &Subtarget) {
38897 unsigned NumMaskElts = BaseMask.size();
38898 unsigned NumInputs = Inputs.size();
38899 if (NumInputs == 0)
38900 return SDValue();
38901
38902 EVT RootVT = Root.getValueType();
38903 unsigned RootSizeInBits = RootVT.getSizeInBits();
38904 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
38905 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
38906
38907 // Peek through extract_subvector to find widest legal vector.
38908 // TODO: Handle ISD::TRUNCATE
38909 unsigned WideSizeInBits = RootSizeInBits;
38910 for (unsigned I = 0; I != NumInputs; ++I) {
38911 SDValue Input = peekThroughBitcasts(Inputs[I]);
38912 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
38913 Input = peekThroughBitcasts(Input.getOperand(0));
38914 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
38915 WideSizeInBits < Input.getValueSizeInBits())
38916 WideSizeInBits = Input.getValueSizeInBits();
38917 }
38918
38919 // Bail if we fail to find a source larger than the existing root.
38920 unsigned Scale = WideSizeInBits / RootSizeInBits;
38921 if (WideSizeInBits <= RootSizeInBits ||
38922 (WideSizeInBits % RootSizeInBits) != 0)
38923 return SDValue();
38924
38925 // Create new mask for larger type.
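// e.g. with Scale = 2 and a 4-element mask, index 5 (element 1 of input 1)
// becomes index 9, as each input now occupies Scale * NumMaskElts slots.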
38926 SmallVector<int, 64> WideMask(BaseMask);
38927 for (int &M : WideMask) {
38928 if (M < 0)
38929 continue;
38930 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
38931 }
38932 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
38933
38934 // Attempt to peek through inputs and adjust mask when we extract from an
38935 // upper subvector.
38936 int AdjustedMasks = 0;
38937 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
38938 for (unsigned I = 0; I != NumInputs; ++I) {
38939 SDValue &Input = WideInputs[I];
38940 Input = peekThroughBitcasts(Input);
38941 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38942 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
38943 uint64_t Idx = Input.getConstantOperandVal(1);
38944 if (Idx != 0) {
38945 ++AdjustedMasks;
38946 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
38947 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
38948
38949 int lo = I * WideMask.size();
38950 int hi = (I + 1) * WideMask.size();
38951 for (int &M : WideMask)
38952 if (lo <= M && M < hi)
38953 M += Idx;
38954 }
38955 Input = peekThroughBitcasts(Input.getOperand(0));
38956 }
38957 }
38958
38959 // Remove unused/repeated shuffle source ops.
38960 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
38961 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
38962
38963 // Bail if we're always extracting from the lowest subvectors (in which case
38964 // combineX86ShuffleChain should match this at the current width), or if the
38965 // shuffle still references too many inputs.
38966 if (AdjustedMasks == 0 || WideInputs.size() > 2)
38967 return SDValue();
38968
38969 // Minor canonicalization of the accumulated shuffle mask to make it easier
38970 // to match below. All this does is detect masks with sequential pairs of
38971 // elements, and shrink them to the half-width mask. It does this in a loop
38972 // so it will reduce the size of the mask to the minimal width mask which
38973 // performs an equivalent shuffle.
38974 while (WideMask.size() > 1) {
38975 SmallVector<int, 64> WidenedMask;
38976 if (!canWidenShuffleElements(WideMask, WidenedMask))
38977 break;
38978 WideMask = std::move(WidenedMask);
38979 }
38980
38981 // Canonicalization of binary shuffle masks to improve pattern matching by
38982 // commuting the inputs.
38983 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
38984 ShuffleVectorSDNode::commuteMask(WideMask);
38985 std::swap(WideInputs[0], WideInputs[1]);
38986 }
38987
38988 // Increase depth for every upper subvector we've peeked through.
38989 Depth += AdjustedMasks;
38990
38991 // Attempt to combine wider chain.
38992 // TODO: Can we use a better Root?
38993 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
38994 WideInputs.back().getValueSizeInBits()
38995 ? WideInputs.front()
38996 : WideInputs.back();
38997 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
38998 "WideRootSize mismatch");
38999
39000 if (SDValue WideShuffle =
39001 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39002 HasVariableMask, AllowVariableCrossLaneMask,
39003 AllowVariablePerLaneMask, DAG, Subtarget)) {
39004 WideShuffle =
39005 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39006 return DAG.getBitcast(RootVT, WideShuffle);
39007 }
39008
39009 return SDValue();
39010}
39011
39012// Canonicalize the combined shuffle mask chain with horizontal ops.
39013// NOTE: This may update the Ops and Mask.
39014static SDValue canonicalizeShuffleMaskWithHorizOp(
39015 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39016 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39017 const X86Subtarget &Subtarget) {
39018 if (Mask.empty() || Ops.empty())
39019 return SDValue();
39020
39021 SmallVector<SDValue> BC;
39022 for (SDValue Op : Ops)
39023 BC.push_back(Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op);
39024
39025 // All ops must be the same horizop + type.
39026 SDValue BC0 = BC[0];
39027 EVT VT0 = BC0.getValueType();
39028 unsigned Opcode0 = BC0.getOpcode();
39029 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39030 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39031 }))
39032 return SDValue();
39033
39034 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39035 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39036 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39037 if (!isHoriz && !isPack)
39038 return SDValue();
39039
39040 // Do all ops have a single use?
39041 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39042 return Op.hasOneUse() &&
39043 peekThroughBitcasts(Op).hasOneUse();
39044 });
39045
39046 int NumElts = VT0.getVectorNumElements();
39047 int NumLanes = VT0.getSizeInBits() / 128;
39048 int NumEltsPerLane = NumElts / NumLanes;
39049 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39050 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39051 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39052
39053 if (NumEltsPerLane >= 4 &&
39054 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39055 SmallVector<int> LaneMask, ScaledMask;
39056 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39057 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39058 // See if we can remove the shuffle by reordering the HOP chain so that
39059 // the HOP args are pre-shuffled.
39060 // TODO: Generalize to any sized/depth chain.
39061 // TODO: Add support for PACKSS/PACKUS.
39062 if (isHoriz) {
39063 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39064 auto GetHOpSrc = [&](int M) {
39065 if (M == SM_SentinelUndef)
39066 return DAG.getUNDEF(VT0);
39067 if (M == SM_SentinelZero)
39068 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39069 SDValue Src0 = BC[M / 4];
39070 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39071 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39072 return Src1.getOperand(M % 2);
39073 return SDValue();
39074 };
39075 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39076 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39077 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39078 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39079 if (M0 && M1 && M2 && M3) {
39080 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39081 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39082 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39083 }
39084 }
39085 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39086 if (Ops.size() >= 2) {
39087 SDValue LHS, RHS;
39088 auto GetHOpSrc = [&](int M, int &OutM) {
39089 // TODO: Support SM_SentinelZero
39090 if (M < 0)
39091 return M == SM_SentinelUndef;
39092 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39093 if (!LHS || LHS == Src) {
39094 LHS = Src;
39095 OutM = (M % 2);
39096 return true;
39097 }
39098 if (!RHS || RHS == Src) {
39099 RHS = Src;
39100 OutM = (M % 2) + 2;
39101 return true;
39102 }
39103 return false;
39104 };
39105 int PostMask[4] = {-1, -1, -1, -1};
39106 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39107 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39108 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39109 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39110 LHS = DAG.getBitcast(SrcVT, LHS);
39111 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39112 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39113 // Use SHUFPS for the permute so this will work on SSE2 targets;
39114 // shuffle combining and domain handling will simplify this later on.
39115 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39116 Res = DAG.getBitcast(ShuffleVT, Res);
39117 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39118 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39119 }
39120 }
39121 }
39122 }
39123
39124 if (2 < Ops.size())
39125 return SDValue();
39126
39127 SDValue BC1 = BC[BC.size() - 1];
39128 if (Mask.size() == VT0.getVectorNumElements()) {
39129 // Canonicalize binary shuffles of horizontal ops that use the
39130 // same sources to a unary shuffle.
39131 // TODO: Try to perform this fold even if the shuffle remains.
39132 if (Ops.size() == 2) {
39133 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39134 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39135 };
39136 // Commute if all BC0's ops are contained in BC1.
39137 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39138 ContainsOps(BC1, BC0.getOperand(1))) {
39139 ShuffleVectorSDNode::commuteMask(Mask);
39140 std::swap(Ops[0], Ops[1]);
39141 std::swap(BC0, BC1);
39142 }
39143
39144 // If BC1 can be represented by BC0, then convert to unary shuffle.
39145 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39146 ContainsOps(BC0, BC1.getOperand(1))) {
39147 for (int &M : Mask) {
39148 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39149 continue;
39150 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39151 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39152 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39153 M += NumHalfEltsPerLane;
39154 }
39155 }
39156 }
39157
39158 // Canonicalize unary horizontal ops to only refer to lower halves.
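// e.g. per 128-bit lane, HADD(X,X) = { x0+x1, x2+x3, x0+x1, x2+x3 }, so any
// index into the upper half of a lane can be remapped to the lower half.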
39159 for (int i = 0; i != NumElts; ++i) {
39160 int &M = Mask[i];
39161 if (isUndefOrZero(M))
39162 continue;
39163 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39164 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39165 M -= NumHalfEltsPerLane;
39166 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39167 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39168 M -= NumHalfEltsPerLane;
39169 }
39170 }
39171
39172 // Combine a binary shuffle of 2 similar 'Horizontal' instructions into a
39173 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39174 // represents the LHS/RHS inputs for the lower/upper halves.
39175 SmallVector<int, 16> TargetMask128, WideMask128;
39176 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39177 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39178 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39179 bool SingleOp = (Ops.size() == 1);
39180 if (isPack || OneUseOps ||
39181 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39182 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39183 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39184 Lo = Lo.getOperand(WideMask128[0] & 1);
39185 Hi = Hi.getOperand(WideMask128[1] & 1);
39186 if (SingleOp) {
39187 SDValue Undef = DAG.getUNDEF(SrcVT);
39188 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39189 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39190 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39191 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39192 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39193 }
39194 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39195 }
39196 }
39197
39198 // If we are post-shuffling a 256-bit hop and not requiring the upper
39199 // elements, then try to narrow to a 128-bit hop directly.
39200 SmallVector<int, 16> WideMask64;
39201 if (Ops.size() == 1 && NumLanes == 2 &&
39202 scaleShuffleElements(Mask, 4, WideMask64) &&
39203 isUndefInRange(WideMask64, 2, 2)) {
39204 int M0 = WideMask64[0];
39205 int M1 = WideMask64[1];
39206 if (isInRange(M0, 0, 4) && isInRange(M1, 0, 4)) {
39207 MVT HalfVT = VT0.getSimpleVT().getHalfNumVectorElementsVT();
39208 unsigned Idx0 = (M0 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39209 unsigned Idx1 = (M1 & 2) ? (SrcVT.getVectorNumElements() / 2) : 0;
39210 SDValue V0 = extract128BitVector(BC[0].getOperand(M0 & 1), Idx0, DAG, DL);
39211 SDValue V1 = extract128BitVector(BC[0].getOperand(M1 & 1), Idx1, DAG, DL);
39212 SDValue Res = DAG.getNode(Opcode0, DL, HalfVT, V0, V1);
39213 return widenSubVector(Res, false, Subtarget, DAG, DL, 256);
39214 }
39215 }
39216
39217 return SDValue();
39218}
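// Illustrative sketch of the unary-hop canonicalization above (not part of
// the lowering code): when a horizontal op uses the same vector for both
// operands, the lower and upper halves of each 128-bit lane of its result are
// identical, so any mask element referencing the upper half of a lane can be
// moved down by half a lane. The unused helper below mirrors that rule for a
// single mask element, using the ArrayRef/SmallVector-style plain ints of the
// shuffle combiner.
[[maybe_unused]] static int canonicalizeUnaryHopMaskEltSketch(int M,
                                                              int NumEltsPerLane) {
  if (M < 0)
    return M; // Undef/zero sentinels are left untouched.
  int NumHalfEltsPerLane = NumEltsPerLane / 2;
  if ((M % NumEltsPerLane) >= NumHalfEltsPerLane)
    M -= NumHalfEltsPerLane;
  return M;
}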
39219
39220// Attempt to constant fold all of the constant source ops.
39221// Returns true if the entire shuffle is folded to a constant.
39222// TODO: Extend this to merge multiple constant Ops and update the mask.
39223 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39224 ArrayRef<int> Mask, SDValue Root,
39225 bool HasVariableMask,
39226 SelectionDAG &DAG,
39227 const X86Subtarget &Subtarget) {
39228 MVT VT = Root.getSimpleValueType();
39229
39230 unsigned SizeInBits = VT.getSizeInBits();
39231 unsigned NumMaskElts = Mask.size();
39232 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39233 unsigned NumOps = Ops.size();
39234
39235 // Extract constant bits from each source op.
39236 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39237 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39238 for (unsigned I = 0; I != NumOps; ++I)
39239 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
39240 RawBitsOps[I],
39241 /*AllowWholeUndefs*/ true,
39242 /*AllowPartialUndefs*/ true))
39243 return SDValue();
39244
39245 // If we're optimizing for size, only fold if at least one of the constants is
39246 // only used once or the combined shuffle has included a variable mask
39247 // shuffle; this is to avoid constant pool bloat.
39248 bool IsOptimizingSize = DAG.shouldOptForSize();
39249 if (IsOptimizingSize && !HasVariableMask &&
39250 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
39251 return SDValue();
39252
39253 // Shuffle the constant bits according to the mask.
39254 SDLoc DL(Root);
39255 APInt UndefElts(NumMaskElts, 0);
39256 APInt ZeroElts(NumMaskElts, 0);
39257 APInt ConstantElts(NumMaskElts, 0);
39258 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39259 APInt::getZero(MaskSizeInBits));
39260 for (unsigned i = 0; i != NumMaskElts; ++i) {
39261 int M = Mask[i];
39262 if (M == SM_SentinelUndef) {
39263 UndefElts.setBit(i);
39264 continue;
39265 } else if (M == SM_SentinelZero) {
39266 ZeroElts.setBit(i);
39267 continue;
39268 }
39269 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39270
39271 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39272 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39273
39274 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39275 if (SrcUndefElts[SrcMaskIdx]) {
39276 UndefElts.setBit(i);
39277 continue;
39278 }
39279
39280 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39281 APInt &Bits = SrcEltBits[SrcMaskIdx];
39282 if (!Bits) {
39283 ZeroElts.setBit(i);
39284 continue;
39285 }
39286
39287 ConstantElts.setBit(i);
39288 ConstantBitData[i] = Bits;
39289 }
39290 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39291
39292 // Attempt to create a zero vector.
39293 if ((UndefElts | ZeroElts).isAllOnes())
39294 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39295
39296 // Create the constant data.
39297 MVT MaskSVT;
39298 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39299 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39300 else
39301 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39302
39303 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39304 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39305 return SDValue();
39306
39307 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39308 return DAG.getBitcast(VT, CstOp);
39309}
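// Illustrative sketch of the constant folding performed above, reduced to two
// equal-width integer sources and the negative undef/zero sentinels used by
// the shuffle combiner (not part of the lowering code): each mask element
// either keeps its sentinel or selects the matching element of one source.
[[maybe_unused]] static SmallVector<uint64_t, 16>
foldShuffleOfConstantsSketch(ArrayRef<uint64_t> LHS, ArrayRef<uint64_t> RHS,
                             ArrayRef<int> Mask) {
  int NumElts = (int)LHS.size();
  SmallVector<uint64_t, 16> Folded;
  for (int M : Mask) {
    if (M < 0) {
      Folded.push_back(0); // Undef and zero lanes both fold to zero here.
      continue;
    }
    Folded.push_back(M < NumElts ? LHS[M] : RHS[M - NumElts]);
  }
  return Folded;
}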
39310
39311namespace llvm {
39312 namespace X86 {
39313 enum {
39314 MaxShuffleCombineDepth = 8
39315 };
39316 } // namespace X86
39317} // namespace llvm
39318
39319/// Fully generic combining of x86 shuffle instructions.
39320///
39321/// This should be the last combine run over the x86 shuffle instructions. Once
39322/// they have been fully optimized, this will recursively consider all chains
39323/// of single-use shuffle instructions, build a generic model of the cumulative
39324/// shuffle operation, and check for simpler instructions which implement this
39325/// operation. We use this primarily for two purposes:
39326///
39327/// 1) Collapse generic shuffles to specialized single instructions when
39328/// equivalent. In most cases, this is just an encoding size win, but
39329/// sometimes we will collapse multiple generic shuffles into a single
39330/// special-purpose shuffle.
39331/// 2) Look for sequences of shuffle instructions with 3 or more total
39332/// instructions, and replace them with the slightly more expensive SSSE3
39333/// PSHUFB instruction if available. We do this as the last combining step
39334/// to ensure we avoid using PSHUFB if we can implement the shuffle with
39335/// a suitable short sequence of other instructions. The PSHUFB will either
39336/// use a register or have to read from memory and so is slightly (but only
39337/// slightly) more expensive than the other shuffle instructions.
39338///
39339/// Because this is inherently a quadratic operation (for each shuffle in
39340/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
39341/// This should never be an issue in practice as the shuffle lowering doesn't
39342/// produce sequences of more than 8 instructions.
39343///
39344/// FIXME: We will currently miss some cases where the redundant shuffling
39345/// would simplify under the threshold for PSHUFB formation because of
39346/// combine-ordering. To fix this, we should do the redundant instruction
39347/// combining in this recursive walk.
39348 static SDValue combineX86ShufflesRecursively(
39349 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
39350 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
39351 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
39352 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39353 const X86Subtarget &Subtarget) {
39354 assert(!RootMask.empty() &&
39355 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
39356 "Illegal shuffle root mask");
39357 MVT RootVT = Root.getSimpleValueType();
39358 assert(RootVT.isVector() && "Shuffles operate on vector types!");
39359 unsigned RootSizeInBits = RootVT.getSizeInBits();
39360
39361 // Bound the depth of our recursive combine because this is ultimately
39362 // quadratic in nature.
39363 if (Depth >= MaxDepth)
39364 return SDValue();
39365
39366 // Directly rip through bitcasts to find the underlying operand.
39367 SDValue Op = SrcOps[SrcOpIndex];
39368 Op = peekThroughBitcasts(Op);
39369
39370 EVT VT = Op.getValueType();
39371 if (!VT.isVector() || !VT.isSimple())
39372 return SDValue(); // Bail if we hit a non-simple non-vector.
39373
39374 // FIXME: Just bail on f16 for now.
39375 if (VT.getVectorElementType() == MVT::f16)
39376 return SDValue();
39377
39378 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
39379 "Can only combine shuffles upto size of the root op.");
39380
39381 // Create a demanded elts mask from the referenced elements of Op.
39382 APInt OpDemandedElts = APInt::getZero(RootMask.size());
39383 for (int M : RootMask) {
39384 int BaseIdx = RootMask.size() * SrcOpIndex;
39385 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
39386 OpDemandedElts.setBit(M - BaseIdx);
39387 }
39388 if (RootSizeInBits != VT.getSizeInBits()) {
39389 // Op is smaller than Root - extract the demanded elts for the subvector.
39390 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
39391 unsigned NumOpMaskElts = RootMask.size() / Scale;
39392 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
39393 assert(OpDemandedElts
39394 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
39395 .isZero() &&
39396 "Out of range elements referenced in root mask");
39397 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
39398 }
39399 OpDemandedElts =
39400 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
39401
39402 // Extract target shuffle mask and resolve sentinels and inputs.
39403 SmallVector<int, 64> OpMask;
39404 SmallVector<SDValue, 2> OpInputs;
39405 APInt OpUndef, OpZero;
39406 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
39407 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
39408 OpZero, DAG, Depth, false)) {
39409 // Shuffle inputs must not be larger than the shuffle result.
39410 // TODO: Relax this for single input faux shuffles (e.g. trunc).
39411 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
39412 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
39413 }))
39414 return SDValue();
39415 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39416 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39417 !isNullConstant(Op.getOperand(1))) {
39418 SDValue SrcVec = Op.getOperand(0);
39419 int ExtractIdx = Op.getConstantOperandVal(1);
39420 unsigned NumElts = VT.getVectorNumElements();
39421 OpInputs.assign({SrcVec});
39422 OpMask.assign(NumElts, SM_SentinelUndef);
39423 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
39424 OpZero = OpUndef = APInt::getZero(NumElts);
39425 } else {
39426 return SDValue();
39427 }
39428
39429 // If the shuffle result was smaller than the root, we need to adjust the
39430 // mask indices and pad the mask with undefs.
39431 if (RootSizeInBits > VT.getSizeInBits()) {
39432 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
39433 unsigned OpMaskSize = OpMask.size();
39434 if (OpInputs.size() > 1) {
39435 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
39436 for (int &M : OpMask) {
39437 if (M < 0)
39438 continue;
39439 int EltIdx = M % OpMaskSize;
39440 int OpIdx = M / OpMaskSize;
39441 M = (PaddedMaskSize * OpIdx) + EltIdx;
39442 }
39443 }
39444 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
39445 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
39446 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
39447 }
39448
39449 SmallVector<int, 64> Mask;
39450 SmallVector<SDValue, 16> Ops;
39451
39452 // We don't need to merge masks if the root is empty.
39453 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
39454 if (EmptyRoot) {
39455 // Only resolve zeros if it will remove an input, otherwise we might end
39456 // up in an infinite loop.
39457 bool ResolveKnownZeros = true;
39458 if (!OpZero.isZero()) {
39459 APInt UsedInputs = APInt::getZero(OpInputs.size());
39460 for (int i = 0, e = OpMask.size(); i != e; ++i) {
39461 int M = OpMask[i];
39462 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
39463 continue;
39464 UsedInputs.setBit(M / OpMask.size());
39465 if (UsedInputs.isAllOnes()) {
39466 ResolveKnownZeros = false;
39467 break;
39468 }
39469 }
39470 }
39471 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
39472 ResolveKnownZeros);
39473
39474 Mask = OpMask;
39475 Ops.append(OpInputs.begin(), OpInputs.end());
39476 } else {
39477 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
39478
39479 // Add the inputs to the Ops list, avoiding duplicates.
39480 Ops.append(SrcOps.begin(), SrcOps.end());
39481
39482 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
39483 // Attempt to find an existing match.
39484 SDValue InputBC = peekThroughBitcasts(Input);
39485 for (int i = 0, e = Ops.size(); i < e; ++i)
39486 if (InputBC == peekThroughBitcasts(Ops[i]))
39487 return i;
39488 // Match failed - should we replace an existing Op?
39489 if (InsertionPoint >= 0) {
39490 Ops[InsertionPoint] = Input;
39491 return InsertionPoint;
39492 }
39493 // Add to the end of the Ops list.
39494 Ops.push_back(Input);
39495 return Ops.size() - 1;
39496 };
39497
39498 SmallVector<int, 2> OpInputIdx;
39499 for (SDValue OpInput : OpInputs)
39500 OpInputIdx.push_back(
39501 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
39502
39503 assert(((RootMask.size() > OpMask.size() &&
39504 RootMask.size() % OpMask.size() == 0) ||
39505 (OpMask.size() > RootMask.size() &&
39506 OpMask.size() % RootMask.size() == 0) ||
39507 OpMask.size() == RootMask.size()) &&
39508 "The smaller number of elements must divide the larger.");
39509
39510 // This function can be performance-critical, so we rely on the power-of-2
39511 // knowledge that we have about the mask sizes to replace div/rem ops with
39512 // bit-masks and shifts.
39513 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
39514 "Non-power-of-2 shuffle mask sizes");
39515 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
39516 "Non-power-of-2 shuffle mask sizes");
39517 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
39518 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
39519
39520 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
39521 unsigned RootRatio =
39522 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
39523 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
39524 assert((RootRatio == 1 || OpRatio == 1) &&
39525 "Must not have a ratio for both incoming and op masks!");
39526
39527 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
39528 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
39529 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
39530 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
39531 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
39532
39533 Mask.resize(MaskWidth, SM_SentinelUndef);
39534
39535 // Merge this shuffle operation's mask into our accumulated mask. Note that
39536 // this shuffle's mask will be the first applied to the input, followed by
39537 // the root mask to get us all the way to the root value arrangement. The
39538 // reason for this order is that we are recursing up the operation chain.
39539 for (unsigned i = 0; i < MaskWidth; ++i) {
39540 unsigned RootIdx = i >> RootRatioLog2;
39541 if (RootMask[RootIdx] < 0) {
39542 // This is a zero or undef lane, we're done.
39543 Mask[i] = RootMask[RootIdx];
39544 continue;
39545 }
39546
39547 unsigned RootMaskedIdx =
39548 RootRatio == 1
39549 ? RootMask[RootIdx]
39550 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
39551
39552 // Just insert the scaled root mask value if it references an input other
39553 // than the SrcOp we're currently inserting.
39554 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
39555 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
39556 Mask[i] = RootMaskedIdx;
39557 continue;
39558 }
39559
39560 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
39561 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
39562 if (OpMask[OpIdx] < 0) {
39563 // The incoming lanes are zero or undef, it doesn't matter which ones we
39564 // are using.
39565 Mask[i] = OpMask[OpIdx];
39566 continue;
39567 }
39568
39569 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
39570 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
39571 : (OpMask[OpIdx] << OpRatioLog2) +
39572 (RootMaskedIdx & (OpRatio - 1));
39573
39574 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
39575 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
39576 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
39577 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
39578
39579 Mask[i] = OpMaskedIdx;
39580 }
39581 }
39582
39583 // Peek through vector widenings and set out of bounds mask indices to undef.
39584 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
39585 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
39586 SDValue &Op = Ops[I];
39587 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
39588 isNullConstant(Op.getOperand(2))) {
39589 Op = Op.getOperand(1);
39590 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
39591 int Lo = I * Mask.size();
39592 int Hi = (I + 1) * Mask.size();
39593 int NewHi = Lo + (Mask.size() / Scale);
39594 for (int &M : Mask) {
39595 if (Lo <= M && NewHi <= M && M < Hi)
39596 M = SM_SentinelUndef;
39597 }
39598 }
39599 }
39600
39601 // Peek through any free extract_subvector nodes back to root size.
39602 for (SDValue &Op : Ops)
39603 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39604 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
39605 isNullConstant(Op.getOperand(1)))
39606 Op = Op.getOperand(0);
39607
39608 // Remove unused/repeated shuffle source ops.
39609 resolveTargetShuffleInputsAndMask(Ops, Mask);
39610
39611 // Handle the all undef/zero/ones cases early.
39612 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
39613 return DAG.getUNDEF(RootVT);
39614 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
39615 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
39616 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
39618 return getOnesVector(RootVT, DAG, SDLoc(Root));
39619
39620 assert(!Ops.empty() && "Shuffle with no inputs detected");
39621 HasVariableMask |= IsOpVariableMask;
39622
39623 // Update the list of shuffle nodes that have been combined so far.
39624 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
39625 SrcNodes.end());
39626 CombinedNodes.push_back(Op.getNode());
39627
39628 // See if we can recurse into each shuffle source op (if it's a target
39629 // shuffle). The source op should only be generally combined if it either has
39630 // a single use (i.e. current Op) or all its users have already been combined;
39631 // if not, then we can still combine but should prevent generation of variable
39632 // shuffles to avoid constant pool bloat.
39633 // Don't recurse if we already have more source ops than we can combine in
39634 // the remaining recursion depth.
39635 if (Ops.size() < (MaxDepth - Depth)) {
39636 for (int i = 0, e = Ops.size(); i < e; ++i) {
39637 // For empty roots, we need to resolve zeroable elements before combining
39638 // them with other shuffles.
39639 SmallVector<int, 64> ResolvedMask = Mask;
39640 if (EmptyRoot)
39641 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
39642 bool AllowCrossLaneVar = false;
39643 bool AllowPerLaneVar = false;
39644 if (Ops[i].getNode()->hasOneUse() ||
39645 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
39646 AllowCrossLaneVar = AllowVariableCrossLaneMask;
39647 AllowPerLaneVar = AllowVariablePerLaneMask;
39648 }
39649 if (SDValue Res = combineX86ShufflesRecursively(
39650 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
39651 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
39652 Subtarget))
39653 return Res;
39654 }
39655 }
39656
39657 // Attempt to constant fold all of the constant source ops.
39658 if (SDValue Cst = combineX86ShufflesConstants(
39659 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
39660 return Cst;
39661
39662 // If constant fold failed and we only have constants - then we have
39663 // multiple uses by a single non-variable shuffle - just bail.
39664 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
39665 APInt UndefElts;
39666 SmallVector<APInt> RawBits;
39667 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39668 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
39669 RawBits,
39670 /*AllowWholeUndefs*/ true,
39671 /*AllowPartialUndefs*/ true);
39672 })) {
39673 return SDValue();
39674 }
39675
39676 // Canonicalize the combined shuffle mask chain with horizontal ops.
39677 // NOTE: This will update the Ops and Mask.
39678 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
39679 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
39680 return DAG.getBitcast(RootVT, HOp);
39681
39682 // Try to refine our inputs given our knowledge of target shuffle mask.
39683 for (auto I : enumerate(Ops)) {
39684 int OpIdx = I.index();
39685 SDValue &Op = I.value();
39686
39687 // What range of shuffle mask element values results in picking from Op?
39688 int Lo = OpIdx * Mask.size();
39689 int Hi = Lo + Mask.size();
39690
39691 // Which elements of Op do we demand, given the mask's granularity?
39692 APInt OpDemandedElts(Mask.size(), 0);
39693 for (int MaskElt : Mask) {
39694 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
39695 int OpEltIdx = MaskElt - Lo;
39696 OpDemandedElts.setBit(OpEltIdx);
39697 }
39698 }
39699
39700 // Is the shuffle result smaller than the root?
39701 if (Op.getValueSizeInBits() < RootSizeInBits) {
39702 // We padded the mask with undefs. But we now need to undo that.
39703 unsigned NumExpectedVectorElts = Mask.size();
39704 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
39705 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
39706 assert(!OpDemandedElts.extractBits(
39707 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
39708 "Demanding the virtual undef widening padding?");
39709 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
39710 }
39711
39712 // The Op itself may be of different VT, so we need to scale the mask.
39713 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
39714 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
39715
39716 // Can this operand be simplified any further, given its demanded elements?
39717 if (SDValue NewOp =
39718 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
39719 Op, OpScaledDemandedElts, DAG))
39720 Op = NewOp;
39721 }
39722 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
39723
39724 // Widen any subvector shuffle inputs we've collected.
39725 // TODO: Remove this to avoid generating temporary nodes, we should only
39726 // widen once combineX86ShuffleChain has found a match.
39727 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
39728 return Op.getValueSizeInBits() < RootSizeInBits;
39729 })) {
39730 for (SDValue &Op : Ops)
39731 if (Op.getValueSizeInBits() < RootSizeInBits)
39732 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
39733 RootSizeInBits);
39734 // Reresolve - we might have repeated subvector sources.
39735 resolveTargetShuffleInputsAndMask(Ops, Mask);
39736 }
39737
39738 // We can only combine unary and binary shuffle mask cases.
39739 if (Ops.size() <= 2) {
39740 // Minor canonicalization of the accumulated shuffle mask to make it easier
39741 // to match below. All this does is detect masks with sequential pairs of
39742 // elements, and shrink them to the half-width mask. It does this in a loop
39743 // so it will reduce the size of the mask to the minimal width mask which
39744 // performs an equivalent shuffle.
39745 while (Mask.size() > 1) {
39746 SmallVector<int, 64> WidenedMask;
39747 if (!canWidenShuffleElements(Mask, WidenedMask))
39748 break;
39749 Mask = std::move(WidenedMask);
39750 }
39751
39752 // Canonicalization of binary shuffle masks to improve pattern matching by
39753 // commuting the inputs.
39754 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
39755 ShuffleVectorSDNode::commuteMask(Mask);
39756 std::swap(Ops[0], Ops[1]);
39757 }
39758
39759 // Try to combine into a single shuffle instruction.
39760 if (SDValue Shuffle = combineX86ShuffleChain(
39761 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39762 AllowVariablePerLaneMask, DAG, Subtarget))
39763 return Shuffle;
39764
39765 // If all the operands come from the same larger vector, fallthrough and try
39766 // to use combineX86ShuffleChainWithExtract.
39767 SDValue LHS = peekThroughBitcasts(Ops.front());
39768 SDValue RHS = peekThroughBitcasts(Ops.back());
39769 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
39770 (RootSizeInBits / Mask.size()) != 64 ||
39771 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39772 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
39773 LHS.getOperand(0) != RHS.getOperand(0))
39774 return SDValue();
39775 }
39776
39777 // If that failed and any input is extracted then try to combine as a
39778 // shuffle with the larger type.
39779 return combineX86ShuffleChainWithExtract(
39780 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
39781 AllowVariablePerLaneMask, DAG, Subtarget);
39782}
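// Illustrative sketch of the mask composition at the heart of the recursive
// combine above, reduced to two unary masks of equal width with no ratio
// scaling (negative values are the usual undef/zero sentinels). This is not
// part of the lowering code; it only shows that the op's shuffle is applied to
// the input first and the root mask second, i.e. Composed[i] = OpMask[RootMask[i]].
[[maybe_unused]] static SmallVector<int, 16>
composeUnaryShuffleMasksSketch(ArrayRef<int> OpMask, ArrayRef<int> RootMask) {
  SmallVector<int, 16> Composed;
  for (int M : RootMask)
    Composed.push_back(M < 0 ? M : OpMask[M]);
  return Composed;
}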
39783
39784/// Helper entry wrapper to combineX86ShufflesRecursively.
39785 static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
39786 const X86Subtarget &Subtarget) {
39787 return combineX86ShufflesRecursively(
39788 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
39789 /*HasVarMask*/ false,
39790 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
39791 Subtarget);
39792}
39793
39794/// Get the PSHUF-style mask from PSHUF node.
39795///
39796 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
39797/// PSHUF-style masks that can be reused with such instructions.
39798 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
39799 MVT VT = N.getSimpleValueType();
39800 SmallVector<int, 4> Mask;
39801 SmallVector<SDValue, 2> Ops;
39802 bool HaveMask = getTargetShuffleMask(N, false, Ops, Mask);
39803 (void)HaveMask;
39804 assert(HaveMask);
39805
39806 // If we have more than 128-bits, only the low 128-bits of shuffle mask
39807 // matter. Check that the upper masks are repeats and remove them.
39808 if (VT.getSizeInBits() > 128) {
39809 int LaneElts = 128 / VT.getScalarSizeInBits();
39810#ifndef NDEBUG
39811 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
39812 for (int j = 0; j < LaneElts; ++j)
39813 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
39814 "Mask doesn't repeat in high 128-bit lanes!");
39815#endif
39816 Mask.resize(LaneElts);
39817 }
39818
39819 switch (N.getOpcode()) {
39820 case X86ISD::PSHUFD:
39821 return Mask;
39822 case X86ISD::PSHUFLW:
39823 Mask.resize(4);
39824 return Mask;
39825 case X86ISD::PSHUFHW:
39826 Mask.erase(Mask.begin(), Mask.begin() + 4);
39827 for (int &M : Mask)
39828 M -= 4;
39829 return Mask;
39830 default:
39831 llvm_unreachable("No valid shuffle instruction found!");
39832 }
39833}
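// Illustrative sketch (not part of the lowering code): the v4 masks returned
// above correspond to a PSHUFD/PSHUFLW/PSHUFHW immediate by packing each 2-bit
// element index into the imm8, lowest element first, which is what
// getV4X86ShuffleImm8ForMask builds elsewhere in this file. Undef lanes are
// simplified to index 0 here rather than being chosen from their neighbours.
[[maybe_unused]] static unsigned packV4ShuffleImmSketch(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Expected a v4 shuffle mask");
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i)
    Imm |= (Mask[i] < 0 ? 0u : (unsigned)(Mask[i] & 0x3)) << (i * 2);
  return Imm;
}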
39834
39835/// Search for a combinable shuffle across a chain ending in pshufd.
39836///
39837/// We walk up the chain and look for a combinable shuffle, skipping over
39838/// shuffles that we could hoist this shuffle's transformation past without
39839/// altering anything.
39840 static SDValue combineRedundantDwordShuffle(SDValue N,
39841 MutableArrayRef<int> Mask,
39842 const SDLoc &DL,
39843 SelectionDAG &DAG) {
39844 assert(N.getOpcode() == X86ISD::PSHUFD &&
39845 "Called with something other than an x86 128-bit half shuffle!");
39846
39847 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
39848 // of the shuffles in the chain so that we can form a fresh chain to replace
39849 // this one.
39850 SmallVector<SDValue, 8> Chain;
39851 SDValue V = N.getOperand(0);
39852 for (; V.hasOneUse(); V = V.getOperand(0)) {
39853 switch (V.getOpcode()) {
39854 default:
39855 return SDValue(); // Nothing combined!
39856
39857 case ISD::BITCAST:
39858 // Skip bitcasts as we always know the type for the target specific
39859 // instructions.
39860 continue;
39861
39862 case X86ISD::PSHUFD:
39863 // Found another dword shuffle.
39864 break;
39865
39866 case X86ISD::PSHUFLW:
39867 // Check that the low words (being shuffled) are the identity in the
39868 // dword shuffle, and the high words are self-contained.
39869 if (Mask[0] != 0 || Mask[1] != 1 ||
39870 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
39871 return SDValue();
39872
39873 Chain.push_back(V);
39874 continue;
39875
39876 case X86ISD::PSHUFHW:
39877 // Check that the high words (being shuffled) are the identity in the
39878 // dword shuffle, and the low words are self-contained.
39879 if (Mask[2] != 2 || Mask[3] != 3 ||
39880 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
39881 return SDValue();
39882
39883 Chain.push_back(V);
39884 continue;
39885
39886 case X86ISD::UNPCKL:
39887 case X86ISD::UNPCKH:
39888 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
39889 // shuffle into a preceding word shuffle.
39890 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
39891 V.getSimpleValueType().getVectorElementType() != MVT::i16)
39892 return SDValue();
39893
39894 // Search for a half-shuffle which we can combine with.
39895 unsigned CombineOp =
39896 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
39897 if (V.getOperand(0) != V.getOperand(1) ||
39898 !V->isOnlyUserOf(V.getOperand(0).getNode()))
39899 return SDValue();
39900 Chain.push_back(V);
39901 V = V.getOperand(0);
39902 do {
39903 switch (V.getOpcode()) {
39904 default:
39905 return SDValue(); // Nothing to combine.
39906
39907 case X86ISD::PSHUFLW:
39908 case X86ISD::PSHUFHW:
39909 if (V.getOpcode() == CombineOp)
39910 break;
39911
39912 Chain.push_back(V);
39913
39914 [[fallthrough]];
39915 case ISD::BITCAST:
39916 V = V.getOperand(0);
39917 continue;
39918 }
39919 break;
39920 } while (V.hasOneUse());
39921 break;
39922 }
39923 // Break out of the loop if we break out of the switch.
39924 break;
39925 }
39926
39927 if (!V.hasOneUse())
39928 // We fell out of the loop without finding a viable combining instruction.
39929 return SDValue();
39930
39931 // Merge this node's mask and our incoming mask.
39932 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
39933 for (int &M : Mask)
39934 M = VMask[M];
39935 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
39936 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
39937
39938 // Rebuild the chain around this new shuffle.
39939 while (!Chain.empty()) {
39940 SDValue W = Chain.pop_back_val();
39941
39942 if (V.getValueType() != W.getOperand(0).getValueType())
39943 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
39944
39945 switch (W.getOpcode()) {
39946 default:
39947 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
39948
39949 case X86ISD::UNPCKL:
39950 case X86ISD::UNPCKH:
39951 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
39952 break;
39953
39954 case X86ISD::PSHUFD:
39955 case X86ISD::PSHUFLW:
39956 case X86ISD::PSHUFHW:
39957 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
39958 break;
39959 }
39960 }
39961 if (V.getValueType() != N.getValueType())
39962 V = DAG.getBitcast(N.getValueType(), V);
39963
39964 // Return the new chain to replace N.
39965 return V;
39966}
39967
39968// Attempt to commute shufps LHS loads:
39969// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
39970 static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
39971 SelectionDAG &DAG) {
39972 // TODO: Add vXf64 support.
39973 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
39974 return SDValue();
39975
39976 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
39977 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
39978 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
39979 return SDValue();
39980 SDValue N0 = V.getOperand(0);
39981 SDValue N1 = V.getOperand(1);
39982 unsigned Imm = V.getConstantOperandVal(2);
39983 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
39984 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
39985 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
39986 return SDValue();
39987 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
39988 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
39989 DAG.getTargetConstant(Imm, DL, MVT::i8));
39990 };
39991
39992 switch (N.getOpcode()) {
39993 case X86ISD::VPERMILPI:
39994 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
39995 unsigned Imm = N.getConstantOperandVal(1);
39996 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
39997 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
39998 }
39999 break;
40000 case X86ISD::SHUFP: {
40001 SDValue N0 = N.getOperand(0);
40002 SDValue N1 = N.getOperand(1);
40003 unsigned Imm = N.getConstantOperandVal(2);
40004 if (N0 == N1) {
40005 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40006 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40007 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40008 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40009 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40010 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40011 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40012 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40013 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40014 }
40015 break;
40016 }
40017 }
40018
40019 return SDValue();
40020}
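// Illustrative sketch (not part of the lowering code): SHUFPS takes its two
// low result elements from the first operand (selected by imm bits [3:0]) and
// its two high result elements from the second operand (imm bits [7:4]), so
// commuting the operands swaps the two nibbles of the immediate, as done in
// commuteSHUFP above. The swapped form produces the same four elements but
// with the low/high pairs exchanged, which the callers above repair by
// adjusting the outer shuffle's immediate (the 0xAA/0x0A/0xA0 XORs).
[[maybe_unused]] static unsigned commuteShufpsImmSketch(unsigned Imm) {
  return ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
}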
40021
40022// TODO - move this to TLI like isBinOp?
40023static bool isUnaryOp(unsigned Opcode) {
40024 switch (Opcode) {
40025 case ISD::CTLZ:
40026 case ISD::CTTZ:
40027 case ISD::CTPOP:
40028 return true;
40029 }
40030 return false;
40031}
40032
40033// Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
40034// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40035 static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
40036 const SDLoc &DL) {
40037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40038 EVT ShuffleVT = N.getValueType();
40039 unsigned Opc = N.getOpcode();
40040
40041 auto IsMergeableWithShuffle = [Opc, &DAG](SDValue Op, bool FoldShuf = true,
40042 bool FoldLoad = false) {
40043 // AllZeros/AllOnes constants are freely shuffled and will peek through
40044 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40045 // merge with target shuffles if it has one use so shuffle combining is
40046 // likely to kick in. Shuffles of splats are expected to be removed.
40047 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40048 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40049 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40050 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40051 getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
40052 (Op.getOpcode() == Opc && Op->hasOneUse()) ||
40053 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
40054 (FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40055 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40056 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40057 };
40058 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40059 // Ensure we only shuffle whole vector src elements, unless it's a logical
40060 // binop, where we can more aggressively move shuffles from dst to src.
40061 return isLogicOp(BinOp) ||
40062 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40063 };
40064
40065 switch (Opc) {
40066 // Unary and Unary+Permute Shuffles.
40067 case X86ISD::PSHUFB: {
40068 // Don't merge PSHUFB if it contains zero'd elements.
40069 SmallVector<int> Mask;
40070 SmallVector<SDValue> Ops;
40071 if (!getTargetShuffleMask(N, false, Ops, Mask))
40072 break;
40073 [[fallthrough]];
40074 }
40075 case X86ISD::VBROADCAST:
40076 case X86ISD::MOVDDUP:
40077 case X86ISD::PSHUFD:
40078 case X86ISD::PSHUFHW:
40079 case X86ISD::PSHUFLW:
40080 case X86ISD::VPERMI:
40081 case X86ISD::VPERMILPI: {
40082 if (N.getOperand(0).getValueType() == ShuffleVT &&
40083 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40084 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40085 unsigned SrcOpcode = N0.getOpcode();
40086 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40087 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40088 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40089 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::VPERMI,
40090 Opc != X86ISD::PSHUFB) ||
40091 IsMergeableWithShuffle(Op01, Opc != X86ISD::VPERMI,
40092 Opc != X86ISD::PSHUFB)) {
40093 SDValue LHS, RHS;
40094 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40095 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40096 if (N.getNumOperands() == 2) {
40097 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40098 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40099 } else {
40100 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40101 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40102 }
40103 EVT OpVT = N0.getValueType();
40104 return DAG.getBitcast(ShuffleVT,
40105 DAG.getNode(SrcOpcode, DL, OpVT,
40106 DAG.getBitcast(OpVT, LHS),
40107 DAG.getBitcast(OpVT, RHS)));
40108 }
40109 }
40110 }
40111 break;
40112 }
40113 // Binary and Binary+Permute Shuffles.
40114 case X86ISD::INSERTPS: {
40115 // Don't merge INSERTPS if it contains zero'd elements.
40116 unsigned InsertPSMask = N.getConstantOperandVal(2);
40117 unsigned ZeroMask = InsertPSMask & 0xF;
40118 if (ZeroMask != 0)
40119 break;
40120 [[fallthrough]];
40121 }
40122 case X86ISD::MOVSD:
40123 case X86ISD::MOVSS:
40124 case X86ISD::BLENDI:
40125 case X86ISD::SHUFP:
40126 case X86ISD::UNPCKH:
40127 case X86ISD::UNPCKL: {
40128 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40129 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40130 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40131 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40132 unsigned SrcOpcode = N0.getOpcode();
40133 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40134 N0.getValueType() == N1.getValueType() &&
40135 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40136 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40137 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40138 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40139 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40140 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40141 // Ensure the total number of shuffles doesn't increase by folding this
40142 // shuffle through to the source ops.
40143 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40144 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40145 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40146 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40147 SDValue LHS, RHS;
40148 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40149 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40150 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40151 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40152 if (N.getNumOperands() == 3) {
40153 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40154 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40155 } else {
40156 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40157 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40158 }
40159 EVT OpVT = N0.getValueType();
40160 return DAG.getBitcast(ShuffleVT,
40161 DAG.getNode(SrcOpcode, DL, OpVT,
40162 DAG.getBitcast(OpVT, LHS),
40163 DAG.getBitcast(OpVT, RHS)));
40164 }
40165 }
40166 if (isUnaryOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40167 N0.getValueType() == N1.getValueType() &&
40168 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40169 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40170 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40171 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40172 SDValue Res;
40173 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40174 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40175 if (N.getNumOperands() == 3) {
40176 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40177 } else {
40178 Res = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40179 }
40180 EVT OpVT = N0.getValueType();
40181 return DAG.getBitcast(
40182 ShuffleVT,
40183 DAG.getNode(SrcOpcode, DL, OpVT, DAG.getBitcast(OpVT, Res)));
40184 }
40185 }
40186 break;
40187 }
40188 }
40189 return SDValue();
40190}
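// Illustrative sketch (not part of the lowering code): the rewrites above rely
// on lane-wise ops commuting with shuffles, i.e. for any mask M and any
// element-wise binop, shuffle(binop(X, Y), M) == binop(shuffle(X, M),
// shuffle(Y, M)). The unused helper below checks that identity for integer
// addition on plain arrays; undef (negative) lanes are unconstrained.
[[maybe_unused]] static bool shuffleCommutesWithAddSketch(ArrayRef<int> X,
                                                          ArrayRef<int> Y,
                                                          ArrayRef<int> Mask) {
  // Order 1: add element-wise, then shuffle the sum.
  SmallVector<int, 8> Sum, ShuffledSum;
  for (unsigned i = 0, e = X.size(); i != e; ++i)
    Sum.push_back(X[i] + Y[i]);
  for (int M : Mask)
    ShuffledSum.push_back(M < 0 ? 0 : Sum[M]);
  // Order 2: shuffle each input, then add element-wise.
  SmallVector<int, 8> ShufX, ShufY;
  for (int M : Mask) {
    ShufX.push_back(M < 0 ? 0 : X[M]);
    ShufY.push_back(M < 0 ? 0 : Y[M]);
  }
  for (unsigned i = 0, e = Mask.size(); i != e; ++i)
    if (Mask[i] >= 0 && ShuffledSum[i] != ShufX[i] + ShufY[i])
      return false; // Never taken: lane-wise ops commute with shuffles.
  return true;
}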
40191
40192/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40193 static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40194 SelectionDAG &DAG,
40195 const SDLoc &DL) {
40196 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40197
40198 MVT VT = V.getSimpleValueType();
40199 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40200 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40201 unsigned SrcOpc0 = Src0.getOpcode();
40202 unsigned SrcOpc1 = Src1.getOpcode();
40203 EVT SrcVT0 = Src0.getValueType();
40204 EVT SrcVT1 = Src1.getValueType();
40205
40206 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40207 return SDValue();
40208
40209 switch (SrcOpc0) {
40210 case X86ISD::MOVDDUP: {
40211 SDValue LHS = Src0.getOperand(0);
40212 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40213 SDValue Res =
40214 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40215 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40216 return DAG.getBitcast(VT, Res);
40217 }
40218 case X86ISD::VPERMILPI:
40219 // TODO: Handle v4f64 permutes with different low/high lane masks.
40220 if (SrcVT0 == MVT::v4f64) {
40221 uint64_t Mask = Src0.getConstantOperandVal(1);
40222 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40223 break;
40224 }
40225 [[fallthrough]];
40226 case X86ISD::VSHLI:
40227 case X86ISD::VSRLI:
40228 case X86ISD::VSRAI:
40229 case X86ISD::PSHUFD:
40230 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40231 SDValue LHS = Src0.getOperand(0);
40232 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40233 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40234 V.getOperand(2));
40235 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40236 return DAG.getBitcast(VT, Res);
40237 }
40238 break;
40239 }
40240
40241 return SDValue();
40242}
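// Illustrative sketch (not part of the lowering code): a VPERM2X128 immediate
// describes each 128-bit half of the result with a 4-bit field - bits [1:0]
// (resp. [5:4]) pick one of LHS.lo, LHS.hi, RHS.lo, RHS.hi for the low (resp.
// high) half, and bit 3 (resp. bit 7) forces that half to zero instead. This
// is the encoding the VPERM2X128 handling below peeks through.
struct Vperm2x128SelSketch {
  unsigned SrcHalf; // 0..3 -> LHS.lo, LHS.hi, RHS.lo, RHS.hi.
  bool IsZero;      // The selected half is zeroed instead.
};
[[maybe_unused]] static Vperm2x128SelSketch
decodeVperm2x128SelSketch(unsigned Imm, bool HighHalf) {
  unsigned Field = HighHalf ? (Imm >> 4) : Imm;
  return {Field & 0x3u, (Field & 0x8u) != 0};
}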
40243
40244/// Try to combine x86 target specific shuffles.
40245 static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
40246 SelectionDAG &DAG,
40247 TargetLowering::DAGCombinerInfo &DCI,
40248 const X86Subtarget &Subtarget) {
40249 MVT VT = N.getSimpleValueType();
40251 unsigned Opcode = N.getOpcode();
40252 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40253
40254 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40255 return R;
40256
40257 // Handle specific target shuffles.
40258 switch (Opcode) {
40259 case X86ISD::MOVDDUP: {
40260 SDValue Src = N.getOperand(0);
40261 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40262 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40263 ISD::isNormalLoad(Src.getNode())) {
40264 LoadSDNode *LN = cast<LoadSDNode>(Src);
40265 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40266 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40267 DCI.CombineTo(N.getNode(), Movddup);
40268 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40269 DCI.recursivelyDeleteUnusedNodes(LN);
40270 return N; // Return N so it doesn't get rechecked!
40271 }
40272 }
40273
40274 return SDValue();
40275 }
40276 case X86ISD::VBROADCAST: {
40277 SDValue Src = N.getOperand(0);
40278 SDValue BC = peekThroughBitcasts(Src);
40279 EVT SrcVT = Src.getValueType();
40280 EVT BCVT = BC.getValueType();
40281
40282 // If broadcasting from another shuffle, attempt to simplify it.
40283 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40284 if (isTargetShuffle(BC.getOpcode()) &&
40285 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40286 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40287 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40288 SM_SentinelUndef);
40289 for (unsigned i = 0; i != Scale; ++i)
40290 DemandedMask[i] = i;
40291 if (SDValue Res = combineX86ShufflesRecursively(
40292 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40293 X86::MaxShuffleCombineDepth,
40294 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40295 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40296 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40297 DAG.getBitcast(SrcVT, Res));
40298 }
40299
40300 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40301 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40302 if (Src.getOpcode() == ISD::BITCAST &&
40303 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40304 TLI.isTypeLegal(BCVT) &&
40305 FixedVectorType::isValidElementType(
40306 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40307 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40308 VT.getVectorNumElements());
40309 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40310 }
40311
40312 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40313 // If we're re-broadcasting a smaller type then broadcast with that type and
40314 // bitcast.
40315 // TODO: Do this for any splat?
40316 if (Src.getOpcode() == ISD::BITCAST &&
40317 (BC.getOpcode() == X86ISD::VBROADCAST ||
40318 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40319 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40320 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40321 MVT NewVT =
40323 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40324 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40325 }
40326
40327 // Reduce broadcast source vector to lowest 128-bits.
40328 if (SrcVT.getSizeInBits() > 128)
40329 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40330 extract128BitVector(Src, 0, DAG, DL));
40331
40332 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40333 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40334 Src.getValueType().getScalarType() == Src.getOperand(0).getValueType())
40335 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40336
40337 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40338 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40339 isNullConstant(Src.getOperand(1)) &&
40340 Src.getValueType() ==
40341 Src.getOperand(0).getValueType().getScalarType() &&
40342 TLI.isTypeLegal(Src.getOperand(0).getValueType()))
40343 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40344
40345 // Share broadcast with the longest vector and extract low subvector (free).
40346 // Ensure the same SDValue from the SDNode use is being used.
40347 for (SDNode *User : Src->uses())
40348 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40349 Src == User->getOperand(0) &&
40350 User->getValueSizeInBits(0).getFixedValue() >
40351 VT.getFixedSizeInBits()) {
40352 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40353 VT.getSizeInBits());
40354 }
40355
40356 // vbroadcast(scalarload X) -> vbroadcast_load X
40357 // For float loads, extract other uses of the scalar from the broadcast.
40358 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40359 ISD::isNormalLoad(Src.getNode())) {
40360 LoadSDNode *LN = cast<LoadSDNode>(Src);
40361 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40362 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40363 SDValue BcastLd =
40364 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40365 LN->getMemoryVT(), LN->getMemOperand());
40366 // If the load value is used only by N, replace it via CombineTo N.
40367 bool NoReplaceExtract = Src.hasOneUse();
40368 DCI.CombineTo(N.getNode(), BcastLd);
40369 if (NoReplaceExtract) {
40370 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40371 DCI.recursivelyDeleteUnusedNodes(LN);
40372 } else {
40373 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40374 DAG.getIntPtrConstant(0, DL));
40375 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40376 }
40377 return N; // Return N so it doesn't get rechecked!
40378 }
40379
40380 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40381 // i16. So shrink it ourselves if we can make a broadcast_load.
40382 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40383 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40384 assert(Subtarget.hasAVX2() && "Expected AVX2");
40385 SDValue TruncIn = Src.getOperand(0);
40386
40387 // If this is a truncate of a non extending load we can just narrow it to
40388 // use a broadcast_load.
40389 if (ISD::isNormalLoad(TruncIn.getNode())) {
40390 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
40391 // Unless it's volatile or atomic.
40392 if (LN->isSimple()) {
40393 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40394 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40395 SDValue BcastLd = DAG.getMemIntrinsicNode(
40396 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40397 LN->getPointerInfo(), LN->getOriginalAlign(),
40398 LN->getMemOperand()->getFlags());
40399 DCI.CombineTo(N.getNode(), BcastLd);
40400 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40401 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40402 return N; // Return N so it doesn't get rechecked!
40403 }
40404 }
40405
40406 // If this is a truncate of an i16 extload, we can directly replace it.
40407 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
40408 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
40409 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
40410 if (LN->getMemoryVT().getSizeInBits() == 16) {
40411 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40412 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40413 SDValue BcastLd =
40414 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40415 LN->getMemoryVT(), LN->getMemOperand());
40416 DCI.CombineTo(N.getNode(), BcastLd);
40417 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40418 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40419 return N; // Return N so it doesn't get rechecked!
40420 }
40421 }
40422
40423 // If this is a truncate of load that has been shifted right, we can
40424 // offset the pointer and use a narrower load.
40425 if (TruncIn.getOpcode() == ISD::SRL &&
40426 TruncIn.getOperand(0).hasOneUse() &&
40427 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
40428 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
40429 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
40430 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
40431 // Make sure the shift amount and the load size are divisible by 16.
40432 // Don't do this if the load is volatile or atomic.
40433 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
40434 LN->isSimple()) {
40435 unsigned Offset = ShiftAmt / 8;
40436 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40437 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
40438 TypeSize::getFixed(Offset), DL);
40439 SDValue Ops[] = { LN->getChain(), Ptr };
40440 SDValue BcastLd = DAG.getMemIntrinsicNode(
40441 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
40442 LN->getPointerInfo().getWithOffset(Offset),
40443 LN->getOriginalAlign(),
40444 LN->getMemOperand()->getFlags());
40445 DCI.CombineTo(N.getNode(), BcastLd);
40446 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40447 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
40448 return N; // Return N so it doesn't get rechecked!
40449 }
40450 }
40451 }
40452
40453 // vbroadcast(vzload X) -> vbroadcast_load X
40454 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
40455 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
40456 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
40457 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40458 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40459 SDValue BcastLd =
40460 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40461 LN->getMemoryVT(), LN->getMemOperand());
40462 DCI.CombineTo(N.getNode(), BcastLd);
40463 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40464 DCI.recursivelyDeleteUnusedNodes(LN);
40465 return N; // Return N so it doesn't get rechecked!
40466 }
40467 }
40468
40469 // vbroadcast(vector load X) -> vbroadcast_load
40470 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
40471 SrcVT == MVT::v4i32) &&
40472 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
40473 LoadSDNode *LN = cast<LoadSDNode>(Src);
40474 // Unless the load is volatile or atomic.
40475 if (LN->isSimple()) {
40476 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40477 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40478 SDValue BcastLd = DAG.getMemIntrinsicNode(
40479 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
40480 LN->getPointerInfo(), LN->getOriginalAlign(),
40481 LN->getMemOperand()->getFlags());
40482 DCI.CombineTo(N.getNode(), BcastLd);
40483 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40484 DCI.recursivelyDeleteUnusedNodes(LN);
40485 return N; // Return N so it doesn't get rechecked!
40486 }
40487 }
40488
40489 return SDValue();
40490 }
40491 case X86ISD::VZEXT_MOVL: {
40492 SDValue N0 = N.getOperand(0);
40493
40494 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
40495 // the load is volatile.
40496 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
40497 auto *LN = cast<LoadSDNode>(N0);
40498 if (SDValue VZLoad =
40499 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
40500 DCI.CombineTo(N.getNode(), VZLoad);
40501 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40502 DCI.recursivelyDeleteUnusedNodes(LN);
40503 return N;
40504 }
40505 }
40506
40507 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
40508 // and can just use a VZEXT_LOAD.
40509 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
40510 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
40511 auto *LN = cast<MemSDNode>(N0);
40512 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
40513 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40514 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
40515 SDValue VZLoad =
40516 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
40517 LN->getMemoryVT(), LN->getMemOperand());
40518 DCI.CombineTo(N.getNode(), VZLoad);
40519 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40520 DCI.recursivelyDeleteUnusedNodes(LN);
40521 return N;
40522 }
40523 }
40524
40525 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
40526 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
40527 // if the upper bits of the i64 are zero.
40528 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
40529 N0.getOperand(0).hasOneUse() &&
40530 N0.getOperand(0).getValueType() == MVT::i64) {
40531 SDValue In = N0.getOperand(0);
40532 APInt Mask = APInt::getHighBitsSet(64, 32);
40533 if (DAG.MaskedValueIsZero(In, Mask)) {
40534 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
40535 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
40536 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
40537 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
40538 return DAG.getBitcast(VT, Movl);
40539 }
40540 }
40541
40542 // Load a scalar integer constant directly to XMM instead of transferring an
40543 // immediate value from GPR.
40544 // vzext_movl (scalar_to_vector C) --> load [C,0...]
40545 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
40546 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
40547 // Create a vector constant - scalar constant followed by zeros.
40548 EVT ScalarVT = N0.getOperand(0).getValueType();
40549 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
40550 unsigned NumElts = VT.getVectorNumElements();
40551 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
40552 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
40553 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
40554
40555 // Load the vector constant from constant pool.
40556 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
40557 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
40558 MachinePointerInfo MPI =
40559 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
40560 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
40561 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
40562 MachineMemOperand::MOLoad);
40563 }
40564 }
40565
40566 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
40567 // insert into a zero vector. This helps get VZEXT_MOVL closer to
40568 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
40569 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
40570 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
40571 SDValue V = peekThroughOneUseBitcasts(N0);
40572
40573 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
40574 isNullConstant(V.getOperand(2))) {
40575 SDValue In = V.getOperand(1);
40576 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
40577 In.getValueSizeInBits() /
40578 VT.getScalarSizeInBits());
40579 In = DAG.getBitcast(SubVT, In);
40580 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
40581 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
40582 getZeroVector(VT, Subtarget, DAG, DL), Movl,
40583 V.getOperand(2));
40584 }
40585 }
40586
40587 return SDValue();
40588 }
40589 case X86ISD::BLENDI: {
40590 SDValue N0 = N.getOperand(0);
40591 SDValue N1 = N.getOperand(1);
40592
40593 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
40594 // TODO: Handle MVT::v16i16 repeated blend mask.
40595 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
40596 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
40597 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
40598 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
40599 SrcVT.getScalarSizeInBits() >= 32) {
40600 unsigned Size = VT.getVectorNumElements();
40601 unsigned NewSize = SrcVT.getVectorNumElements();
40602 APInt BlendMask = N.getConstantOperandAPInt(2).zextOrTrunc(Size);
40603 APInt NewBlendMask = APIntOps::ScaleBitMask(BlendMask, NewSize);
40604 return DAG.getBitcast(
40605 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
40606 N1.getOperand(0),
40607 DAG.getTargetConstant(NewBlendMask.getZExtValue(),
40608 DL, MVT::i8)));
40609 }
40610 }
40611 return SDValue();
40612 }
40613 case X86ISD::SHUFP: {
40614 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
40615 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
40616 // TODO: Support types other than v4f32.
40617 if (VT == MVT::v4f32) {
40618 bool Updated = false;
40619 SmallVector<int> Mask;
40620 SmallVector<SDValue> Ops;
40621 if (getTargetShuffleMask(N, false, Ops, Mask) && Ops.size() == 2) {
40622 for (int i = 0; i != 2; ++i) {
40623 SmallVector<SDValue> SubOps;
40624 SmallVector<int> SubMask, SubScaledMask;
40625 SDValue Sub = peekThroughBitcasts(Ops[i]);
40626 // TODO: Scaling might be easier if we specify the demanded elts.
40627 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
40628 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
40629 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
40630 int Ofs = i * 2;
40631 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
40632 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
40633 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
40634 Updated = true;
40635 }
40636 }
40637 }
40638 if (Updated) {
40639 for (int &M : Mask)
40640 M %= 4;
40641 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40642 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
40643 }
40644 }
40645 return SDValue();
40646 }
40647 case X86ISD::VPERMI: {
40648 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
40649 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
40650 SDValue N0 = N.getOperand(0);
40651 SDValue N1 = N.getOperand(1);
40652 unsigned EltSizeInBits = VT.getScalarSizeInBits();
40653 if (N0.getOpcode() == ISD::BITCAST &&
40654 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
40655 SDValue Src = N0.getOperand(0);
40656 EVT SrcVT = Src.getValueType();
40657 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
40658 return DAG.getBitcast(VT, Res);
40659 }
40660 return SDValue();
40661 }
40662 case X86ISD::SHUF128: {
40663 // If we're permuting the upper 256-bit subvectors of a concatenation, then
40664 // see if we can peek through and access the subvector directly.
40665 if (VT.is512BitVector()) {
40666 // 512-bit mask uses 4 x i2 indices - if the msb is always set then only the
40667 // upper subvector is used.
40668 SDValue LHS = N->getOperand(0);
40669 SDValue RHS = N->getOperand(1);
40670 uint64_t Mask = N->getConstantOperandVal(2);
40671 SmallVector<SDValue> LHSOps, RHSOps;
40672 SDValue NewLHS, NewRHS;
40673 if ((Mask & 0x0A) == 0x0A &&
40674 collectConcatOps(LHS.getNode(), LHSOps, DAG) && LHSOps.size() == 2) {
40675 NewLHS = widenSubVector(LHSOps[1], false, Subtarget, DAG, DL, 512);
40676 Mask &= ~0x0A;
40677 }
40678 if ((Mask & 0xA0) == 0xA0 &&
40679 collectConcatOps(RHS.getNode(), RHSOps, DAG) && RHSOps.size() == 2) {
40680 NewRHS = widenSubVector(RHSOps[1], false, Subtarget, DAG, DL, 512);
40681 Mask &= ~0xA0;
40682 }
40683 if (NewLHS || NewRHS)
40684 return DAG.getNode(X86ISD::SHUF128, DL, VT, NewLHS ? NewLHS : LHS,
40685 NewRHS ? NewRHS : RHS,
40686 DAG.getTargetConstant(Mask, DL, MVT::i8));
40687 }
40688 return SDValue();
40689 }
40690 case X86ISD::VPERM2X128: {
40691 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
40692 SDValue LHS = N->getOperand(0);
40693 SDValue RHS = N->getOperand(1);
40694 if (LHS.getOpcode() == ISD::BITCAST &&
40695 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
40696 EVT SrcVT = LHS.getOperand(0).getValueType();
40697 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
40698 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
40699 DAG.getBitcast(SrcVT, LHS),
40700 DAG.getBitcast(SrcVT, RHS),
40701 N->getOperand(2)));
40702 }
40703 }
40704
40705 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
40706 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
40707 return Res;
40708
40709 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
40710 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
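// e.g. with imm 0x20 the low half of the result is LHS[127:0] (= X) and the
// high half is RHS[127:0] (= Z), so the whole node is just concat(X, Z).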
40711 auto FindSubVector128 = [&](unsigned Idx) {
40712 if (Idx > 3)
40713 return SDValue();
40714 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
40715 SmallVector<SDValue> SubOps;
40716 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
40717 return SubOps[Idx & 1];
40718 unsigned NumElts = Src.getValueType().getVectorNumElements();
40719 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
40720 Src.getOperand(1).getValueSizeInBits() == 128 &&
40721 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
40722 return Src.getOperand(1);
40723 }
40724 return SDValue();
40725 };
40726 unsigned Imm = N.getConstantOperandVal(2);
40727 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
40728 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
40729 MVT SubVT = VT.getHalfNumVectorElementsVT();
40730 SubLo = DAG.getBitcast(SubVT, SubLo);
40731 SubHi = DAG.getBitcast(SubVT, SubHi);
40732 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
40733 }
40734 }
40735 return SDValue();
40736 }
40737 case X86ISD::PSHUFD:
40738 case X86ISD::PSHUFLW:
40739 case X86ISD::PSHUFHW: {
40740 SDValue N0 = N.getOperand(0);
40741 SDValue N1 = N.getOperand(1);
40742 if (N0->hasOneUse()) {
40743 SDValue V = peekThroughOneUseBitcasts(N0);
40744 switch (V.getOpcode()) {
40745 case X86ISD::VSHL:
40746 case X86ISD::VSRL:
40747 case X86ISD::VSRA:
40748 case X86ISD::VSHLI:
40749 case X86ISD::VSRLI:
40750 case X86ISD::VSRAI:
40751 case X86ISD::VROTLI:
40752 case X86ISD::VROTRI: {
40753 MVT InnerVT = V.getSimpleValueType();
40754 if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) {
40755 SDValue Res = DAG.getNode(Opcode, DL, VT,
40756 DAG.getBitcast(VT, V.getOperand(0)), N1);
40757 Res = DAG.getBitcast(InnerVT, Res);
40758 Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1));
40759 return DAG.getBitcast(VT, Res);
40760 }
40761 break;
40762 }
40763 }
40764 }
40765
40766 Mask = getPSHUFShuffleMask(N);
40767 assert(Mask.size() == 4);
40768 break;
40769 }
40770 case X86ISD::MOVSD:
40771 case X86ISD::MOVSH:
40772 case X86ISD::MOVSS: {
40773 SDValue N0 = N.getOperand(0);
40774 SDValue N1 = N.getOperand(1);
40775
40776 // Canonicalize scalar FPOps:
40777 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
40778 // If commutable, allow OP(N1[0], N0[0]).
40779 unsigned Opcode1 = N1.getOpcode();
40780 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
40781 Opcode1 == ISD::FDIV) {
40782 SDValue N10 = N1.getOperand(0);
40783 SDValue N11 = N1.getOperand(1);
40784 if (N10 == N0 ||
40785 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
40786 if (N10 != N0)
40787 std::swap(N10, N11);
40788 MVT SVT = VT.getVectorElementType();
40789 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
40790 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
40791 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
40792 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
40793 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
40794 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
40795 }
40796 }
40797
40798 return SDValue();
40799 }
40800 case X86ISD::INSERTPS: {
40801 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
40802 SDValue Op0 = N.getOperand(0);
40803 SDValue Op1 = N.getOperand(1);
40804 unsigned InsertPSMask = N.getConstantOperandVal(2);
40805 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
40806 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
40807 unsigned ZeroMask = InsertPSMask & 0xF;
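// The INSERTPS immediate decodes as: bits[7:6] = lane read from Op1 (SrcIdx),
// bits[5:4] = lane written in the result (DstIdx), bits[3:0] = lanes zeroed.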
40808
40809 // If we zero out all elements from Op0 then we don't need to reference it.
40810 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
40811 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
40812 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40813
40814 // If we zero out the element from Op1 then we don't need to reference it.
40815 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
40816 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40817 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40818
40819 // Attempt to merge insertps Op1 with an inner target shuffle node.
40820 SmallVector<int, 8> TargetMask1;
40821 SmallVector<SDValue, 2> Ops1;
40822 APInt KnownUndef1, KnownZero1;
40823 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
40824 KnownZero1)) {
40825 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
40826 // Zero/UNDEF insertion - zero out element and remove dependency.
40827 InsertPSMask |= (1u << DstIdx);
40828 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
40829 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40830 }
40831 // Update insertps mask srcidx and reference the source input directly.
40832 int M = TargetMask1[SrcIdx];
40833 assert(0 <= M && M < 8 && "Shuffle index out of range");
40834 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
40835 Op1 = Ops1[M < 4 ? 0 : 1];
40836 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40837 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40838 }
40839
40840 // Attempt to merge insertps Op0 with an inner target shuffle node.
40841 SmallVector<int, 8> TargetMask0;
40842 SmallVector<SDValue, 2> Ops0;
40843 APInt KnownUndef0, KnownZero0;
40844 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
40845 KnownZero0)) {
40846 bool Updated = false;
40847 bool UseInput00 = false;
40848 bool UseInput01 = false;
40849 for (int i = 0; i != 4; ++i) {
40850 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
40851 // No change if element is already zero or the inserted element.
40852 continue;
40853 }
40854
40855 if (KnownUndef0[i] || KnownZero0[i]) {
40856 // If the target mask is undef/zero then we must zero the element.
40857 InsertPSMask |= (1u << i);
40858 Updated = true;
40859 continue;
40860 }
40861
40862 // The input vector element must be inline.
40863 int M = TargetMask0[i];
40864 if (M != i && M != (i + 4))
40865 return SDValue();
40866
40867 // Determine which inputs of the target shuffle we're using.
40868 UseInput00 |= (0 <= M && M < 4);
40869 UseInput01 |= (4 <= M);
40870 }
40871
40872 // If we're not using both inputs of the target shuffle then use the
40873 // referenced input directly.
40874 if (UseInput00 && !UseInput01) {
40875 Updated = true;
40876 Op0 = Ops0[0];
40877 } else if (!UseInput00 && UseInput01) {
40878 Updated = true;
40879 Op0 = Ops0[1];
40880 }
40881
40882 if (Updated)
40883 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
40884 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
40885 }
40886
40887 // If we're inserting an element from a vbroadcast load, fold the
40888 // load into the X86insertps instruction. We need to convert the scalar
40889 // load to a vector and clear the source lane of the INSERTPS control.
40890 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
40891 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
40892 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
40893 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
40894 MemIntr->getBasePtr(),
40895 MemIntr->getMemOperand());
40896 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
40897 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
40898 Load),
40899 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
40900 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
40901 return Insert;
40902 }
40903 }
40904
40905 return SDValue();
40906 }
40907 default:
40908 return SDValue();
40909 }
40910
40911 // Nuke no-op shuffles that show up after combining.
40912 if (isNoopShuffleMask(Mask))
40913 return N.getOperand(0);
40914
40915 // Look for simplifications involving one or two shuffle instructions.
40916 SDValue V = N.getOperand(0);
40917 switch (N.getOpcode()) {
40918 default:
40919 break;
40920 case X86ISD::PSHUFLW:
40921 case X86ISD::PSHUFHW:
40922 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
40923
40924 // See if this reduces to a PSHUFD which is no more expensive and can
40925 // combine with more operations. Note that it has to at least flip the
40926 // dwords as otherwise it would have been removed as a no-op.
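// e.g. PSHUFLW with mask <2,3,0,1> swaps the two low dwords; the PSHUFD built
// below does the same thing with dword mask <1,0,2,3>.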
40927 if (ArrayRef<int>(Mask).equals({2, 3, 0, 1})) {
40928 int DMask[] = {0, 1, 2, 3};
40929 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
40930 DMask[DOffset + 0] = DOffset + 1;
40931 DMask[DOffset + 1] = DOffset + 0;
40932 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
40933 V = DAG.getBitcast(DVT, V);
40934 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
40935 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
40936 return DAG.getBitcast(VT, V);
40937 }
40938
40939 // Look for shuffle patterns which can be implemented as a single unpack.
40940 // FIXME: This doesn't handle the location of the PSHUFD generically, and
40941 // only works when we have a PSHUFD followed by two half-shuffles.
40942 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
40943 (V.getOpcode() == X86ISD::PSHUFLW ||
40944 V.getOpcode() == X86ISD::PSHUFHW) &&
40945 V.getOpcode() != N.getOpcode() &&
40946 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
40947 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
40948 if (D.getOpcode() == X86ISD::PSHUFD) {
40949 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40950 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
40951 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40952 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
40953 int WordMask[8];
40954 for (int i = 0; i < 4; ++i) {
40955 WordMask[i + NOffset] = Mask[i] + NOffset;
40956 WordMask[i + VOffset] = VMask[i] + VOffset;
40957 }
40958 // Map the word mask through the DWord mask.
40959 int MappedMask[8];
40960 for (int i = 0; i < 8; ++i)
40961 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
40962 if (ArrayRef<int>(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
40963 ArrayRef<int>(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
40964 // We can replace all three shuffles with an unpack.
40965 V = DAG.getBitcast(VT, D.getOperand(0));
40966 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
40967 : X86ISD::UNPCKH,
40968 DL, VT, V, V);
40969 }
40970 }
40971 }
40972
40973 break;
40974
40975 case X86ISD::PSHUFD:
40976 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DL, DAG))
40977 return NewN;
40978
40979 break;
40980 }
40981
40982 return SDValue();
40983}
40984
40985/// Checks if the shuffle mask takes subsequent elements
40986/// alternately from two vectors.
40987/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
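/// For <0, 5, 2, 7> the even result lanes come from operand 0 (\p Op0Even is
/// set to true); for <4, 1, 6, 3> they come from operand 1 and \p Op0Even is
/// set to false.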
40988static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
40989
40990 int ParitySrc[2] = {-1, -1};
40991 unsigned Size = Mask.size();
40992 for (unsigned i = 0; i != Size; ++i) {
40993 int M = Mask[i];
40994 if (M < 0)
40995 continue;
40996
40997 // Make sure we are using the matching element from the input.
40998 if ((M % Size) != i)
40999 return false;
41000
41001 // Make sure we use the same input for all elements of the same parity.
41002 int Src = M / Size;
41003 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41004 return false;
41005 ParitySrc[i % 2] = Src;
41006 }
41007
41008 // Make sure each input is used.
41009 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41010 return false;
41011
41012 Op0Even = ParitySrc[0] == 0;
41013 return true;
41014}
41015
41016 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
41017 /// operation. If true is returned then the operands of the ADDSUB(SUBADD)
41018 /// operation are written to the parameters \p Opnd0 and \p Opnd1.
41019 ///
41020 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
41021 /// nodes so it is easier to generically match. We also insert dummy vector
41022 /// shuffle nodes for the operands which explicitly discard the lanes that are
41023 /// unused by this operation, so that the fact that they are unused can flow
41024 /// through the rest of the combiner.
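/// e.g. shuffle<0,5,2,7>(fsub(A,B), fadd(A,B)) takes its even lanes from the
/// FSUB and its odd lanes from the FADD, which is exactly the ADDSUB lane
/// pattern (even lanes subtract, odd lanes add), so IsSubAdd is left false.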
41025static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41026 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41027 bool &IsSubAdd) {
41028
41029 EVT VT = N->getValueType(0);
41030 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41031 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41033 return false;
41034
41035 // We only handle target-independent shuffles.
41036 // FIXME: It would be easy and harmless to use the target shuffle mask
41037 // extraction tool to support more.
41038 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41039 return false;
41040
41041 SDValue V1 = N->getOperand(0);
41042 SDValue V2 = N->getOperand(1);
41043
41044 // Make sure we have an FADD and an FSUB.
41045 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41046 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41047 V1.getOpcode() == V2.getOpcode())
41048 return false;
41049
41050 // If there are other uses of these operations we can't fold them.
41051 if (!V1->hasOneUse() || !V2->hasOneUse())
41052 return false;
41053
41054 // Ensure that both operations have the same operands. Note that we can
41055 // commute the FADD operands.
41056 SDValue LHS, RHS;
41057 if (V1.getOpcode() == ISD::FSUB) {
41058 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41059 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41060 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41061 return false;
41062 } else {
41063 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41064 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41065 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41066 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41067 return false;
41068 }
41069
41070 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41071 bool Op0Even;
41072 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41073 return false;
41074
41075 // It's a subadd if the vector in the even parity is an FADD.
41076 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41077 : V2->getOpcode() == ISD::FADD;
41078
41079 Opnd0 = LHS;
41080 Opnd1 = RHS;
41081 return true;
41082}
41083
41084/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41085 static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL,
41086 const X86Subtarget &Subtarget,
41087 SelectionDAG &DAG) {
41088 // We only handle target-independent shuffles.
41089 // FIXME: It would be easy and harmless to use the target shuffle mask
41090 // extraction tool to support more.
41091 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41092 return SDValue();
41093
41094 MVT VT = N->getSimpleValueType(0);
41095 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41096 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41097 return SDValue();
41098
41099 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
41100 SDValue Op0 = N->getOperand(0);
41101 SDValue Op1 = N->getOperand(1);
41102 SDValue FMAdd = Op0, FMSub = Op1;
41103 if (FMSub.getOpcode() != X86ISD::FMSUB)
41104 std::swap(FMAdd, FMSub);
41105
41106 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41107 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41108 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41109 FMAdd.getOperand(2) != FMSub.getOperand(2))
41110 return SDValue();
41111
41112 // Check for correct shuffle mask.
41113 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41114 bool Op0Even;
41115 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41116 return SDValue();
41117
41118 // FMAddSub takes zeroth operand from FMSub node.
41119 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41120 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41121 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41122 FMAdd.getOperand(2));
41123}
41124
41125/// Try to combine a shuffle into a target-specific add-sub or
41126/// mul-add-sub node.
41127 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
41128 const X86Subtarget &Subtarget,
41129 SelectionDAG &DAG) {
41130 if (SDValue V = combineShuffleToFMAddSub(N, DL, Subtarget, DAG))
41131 return V;
41132
41133 SDValue Opnd0, Opnd1;
41134 bool IsSubAdd;
41135 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41136 return SDValue();
41137
41138 MVT VT = N->getSimpleValueType(0);
41139
41140 // Try to generate X86ISD::FMADDSUB node here.
41141 SDValue Opnd2;
41142 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41143 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41144 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41145 }
41146
41147 if (IsSubAdd)
41148 return SDValue();
41149
41150 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41151 // the ADDSUB idiom has been successfully recognized. There are no known
41152 // X86 targets with 512-bit ADDSUB instructions!
41153 if (VT.is512BitVector())
41154 return SDValue();
41155
41156 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41157 // the ADDSUB idiom has been successfully recognized. There are no known
41158 // X86 targets with FP16 ADDSUB instructions!
41159 if (VT.getVectorElementType() == MVT::f16)
41160 return SDValue();
41161
41162 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41163}
41164
41165// We are looking for a shuffle where both sources are concatenated with undef
41166// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41167// if we can express this as a single-source shuffle, that's preferable.
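// e.g. for v8i32, a mask element of 8 (the first element of the second
// undef-padded concat, i.e. t2[0]) is remapped to 4, which is t2[0]'s position
// in the single concat(t1, t2) built below.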
41168 static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
41169 SelectionDAG &DAG,
41170 const X86Subtarget &Subtarget) {
41171 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41172 return SDValue();
41173
41174 EVT VT = N->getValueType(0);
41175
41176 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41177 if (!VT.is128BitVector() && !VT.is256BitVector())
41178 return SDValue();
41179
41180 if (VT.getVectorElementType() != MVT::i32 &&
41181 VT.getVectorElementType() != MVT::i64 &&
41182 VT.getVectorElementType() != MVT::f32 &&
41183 VT.getVectorElementType() != MVT::f64)
41184 return SDValue();
41185
41186 SDValue N0 = N->getOperand(0);
41187 SDValue N1 = N->getOperand(1);
41188
41189 // Check that both sources are concats with undef.
41190 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41191 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41192 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41193 !N1.getOperand(1).isUndef())
41194 return SDValue();
41195
41196 // Construct the new shuffle mask. Elements from the first source retain their
41197 // index, but elements from the second source no longer need to skip an undef.
41198 SmallVector<int, 8> Mask;
41199 int NumElts = VT.getVectorNumElements();
41200
41201 auto *SVOp = cast<ShuffleVectorSDNode>(N);
41202 for (int Elt : SVOp->getMask())
41203 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41204
41205 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41206 N1.getOperand(0));
41207 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41208}
41209
41210/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41211/// low half of each source vector and does not set any high half elements in
41212/// the destination vector, narrow the shuffle to half its original size.
41213 static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41214 EVT VT = Shuf->getValueType(0);
41215 if (!DAG.getTargetLoweringInfo().isTypeLegal(Shuf->getValueType(0)))
41216 return SDValue();
41217 if (!VT.is256BitVector() && !VT.is512BitVector())
41218 return SDValue();
41219
41220 // See if we can ignore all of the high elements of the shuffle.
41221 ArrayRef<int> Mask = Shuf->getMask();
41222 if (!isUndefUpperHalf(Mask))
41223 return SDValue();
41224
41225 // Check if the shuffle mask accesses only the low half of each input vector
41226 // (half-index output is 0 or 2).
41227 int HalfIdx1, HalfIdx2;
41228 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41229 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41230 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41231 return SDValue();
41232
41233 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41234 // The trick is knowing that all of the insert/extract are actually free
41235 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41236 // of narrow inputs into a narrow output, and that is always cheaper than
41237 // the wide shuffle that we started with.
41238 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41239 Shuf->getOperand(1), HalfMask, HalfIdx1,
41240 HalfIdx2, false, DAG, /*UseConcat*/ true);
41241}
41242
41243 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41244 TargetLowering::DAGCombinerInfo &DCI,
41245 const X86Subtarget &Subtarget) {
41246 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41247 if (SDValue V = narrowShuffle(Shuf, DAG))
41248 return V;
41249
41250 // If we have legalized the vector types, look for blends of FADD and FSUB
41251 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41252 SDLoc dl(N);
41253 EVT VT = N->getValueType(0);
41254 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41255 if (TLI.isTypeLegal(VT) && !isSoftF16(VT, Subtarget))
41256 if (SDValue AddSub =
41257 combineShuffleToAddSubOrFMAddSub(N, dl, Subtarget, DAG))
41258 return AddSub;
41259
41260 // Attempt to combine into a vector load/broadcast.
41261 if (SDValue LD = combineToConsecutiveLoads(
41262 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41263 return LD;
41264
41265 // For AVX2, we sometimes want to combine
41266 // (vector_shuffle <mask> (concat_vectors t1, undef)
41267 // (concat_vectors t2, undef))
41268 // Into:
41269 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41270 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41271 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
41272 return ShufConcat;
41273
41274 if (isTargetShuffle(N->getOpcode())) {
41275 SDValue Op(N, 0);
41276 if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
41277 return Shuffle;
41278
41279 // Try recursively combining arbitrary sequences of x86 shuffle
41280 // instructions into higher-order shuffles. We do this after combining
41281 // specific PSHUF instruction sequences into their minimal form so that we
41282 // can evaluate how many specialized shuffle instructions are involved in
41283 // a particular chain.
41284 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41285 return Res;
41286
41287 // Simplify source operands based on shuffle mask.
41288 // TODO - merge this into combineX86ShufflesRecursively.
41289 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41290 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41291 return SDValue(N, 0);
41292
41293 // Canonicalize SHUFFLE(UNARYOP(X)) -> UNARYOP(SHUFFLE(X)).
41294 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41295 // Perform this after other shuffle combines to allow inner shuffles to be
41296 // combined away first.
41297 if (SDValue BinOp = canonicalizeShuffleWithOp(Op, DAG, dl))
41298 return BinOp;
41299 }
41300
41301 return SDValue();
41302}
41303
41304// Simplify variable target shuffle masks based on the demanded elements.
41305// TODO: Handle DemandedBits in mask indices as well?
41306 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41307 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41308 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41309 // If we're demanding all elements don't bother trying to simplify the mask.
41310 unsigned NumElts = DemandedElts.getBitWidth();
41311 if (DemandedElts.isAllOnes())
41312 return false;
41313
41314 SDValue Mask = Op.getOperand(MaskIndex);
41315 if (!Mask.hasOneUse())
41316 return false;
41317
41318 // Attempt to generically simplify the variable shuffle mask.
41319 APInt MaskUndef, MaskZero;
41320 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41321 Depth + 1))
41322 return true;
41323
41324 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41325 // TODO: Support other types from getTargetShuffleMaskIndices?
41326 SDValue BC = peekThroughOneUseBitcasts(Mask);
41327 EVT BCVT = BC.getValueType();
41328 auto *Load = dyn_cast<LoadSDNode>(BC);
41329 if (!Load || !Load->getBasePtr().hasOneUse())
41330 return false;
41331
41332 const Constant *C = getTargetConstantFromNode(Load);
41333 if (!C)
41334 return false;
41335
41336 Type *CTy = C->getType();
41337 if (!CTy->isVectorTy() ||
41338 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41339 return false;
41340
41341 // Handle scaling for i64 elements on 32-bit targets.
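// e.g. a v2i64 shuffle mask may be materialized as a v4i32 constant on 32-bit
// targets, in which case Scale == 2 below.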
41342 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41343 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41344 return false;
41345 unsigned Scale = NumCstElts / NumElts;
41346
41347 // Simplify mask if we have an undemanded element that is not undef.
41348 bool Simplified = false;
41349 SmallVector<Constant *, 32> ConstVecOps;
41350 for (unsigned i = 0; i != NumCstElts; ++i) {
41351 Constant *Elt = C->getAggregateElement(i);
41352 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41353 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41354 Simplified = true;
41355 continue;
41356 }
41357 ConstVecOps.push_back(Elt);
41358 }
41359 if (!Simplified)
41360 return false;
41361
41362 // Generate new constant pool entry + legalize immediately for the load.
41363 SDLoc DL(Op);
41364 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41365 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41366 SDValue NewMask = TLO.DAG.getLoad(
41367 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41368 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41369 Load->getAlign());
41370 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41371}
41372
41373 bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41374 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41375 TargetLoweringOpt &TLO, unsigned Depth) const {
41376 int NumElts = DemandedElts.getBitWidth();
41377 unsigned Opc = Op.getOpcode();
41378 EVT VT = Op.getValueType();
41379
41380 // Handle special case opcodes.
41381 switch (Opc) {
41382 case X86ISD::PMULDQ:
41383 case X86ISD::PMULUDQ: {
41384 APInt LHSUndef, LHSZero;
41385 APInt RHSUndef, RHSZero;
41386 SDValue LHS = Op.getOperand(0);
41387 SDValue RHS = Op.getOperand(1);
41388 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41389 Depth + 1))
41390 return true;
41391 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41392 Depth + 1))
41393 return true;
41394 // Multiply by zero.
41395 KnownZero = LHSZero | RHSZero;
41396 break;
41397 }
41398 case X86ISD::VPMADDWD: {
41399 APInt LHSUndef, LHSZero;
41400 APInt RHSUndef, RHSZero;
41401 SDValue LHS = Op.getOperand(0);
41402 SDValue RHS = Op.getOperand(1);
41403 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41404
41405 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41406 Depth + 1))
41407 return true;
41408 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41409 Depth + 1))
41410 return true;
41411
41412 // TODO: Multiply by zero.
41413
41414 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41415 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41416 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41417 Depth + 1))
41418 return true;
41419 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41420 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41421 Depth + 1))
41422 return true;
41423 break;
41424 }
41425 case X86ISD::PSADBW: {
41426 SDValue LHS = Op.getOperand(0);
41427 SDValue RHS = Op.getOperand(1);
41428 assert(VT.getScalarType() == MVT::i64 &&
41429 LHS.getValueType() == RHS.getValueType() &&
41430 LHS.getValueType().getScalarType() == MVT::i8 &&
41431 "Unexpected PSADBW types");
41432
41433 // Aggressively peek through ops to get at the demanded elts.
41434 if (!DemandedElts.isAllOnes()) {
41435 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41436 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41437 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41438 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41439 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41440 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41441 if (NewLHS || NewRHS) {
41442 NewLHS = NewLHS ? NewLHS : LHS;
41443 NewRHS = NewRHS ? NewRHS : RHS;
41444 return TLO.CombineTo(
41445 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41446 }
41447 }
41448 break;
41449 }
41450 case X86ISD::VSHL:
41451 case X86ISD::VSRL:
41452 case X86ISD::VSRA: {
41453 // We only need the bottom 64-bits of the (128-bit) shift amount.
41454 SDValue Amt = Op.getOperand(1);
41455 MVT AmtVT = Amt.getSimpleValueType();
41456 assert(AmtVT.is128BitVector() && "Unexpected value type");
41457
41458 // If the shift amount is only ever reused as an SSE shift amount then we know
41459 // that only its bottom 64-bits are ever used.
41460 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
41461 unsigned UseOpc = Use->getOpcode();
41462 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
41463 UseOpc == X86ISD::VSRA) &&
41464 Use->getOperand(0) != Amt;
41465 });
41466
41467 APInt AmtUndef, AmtZero;
41468 unsigned NumAmtElts = AmtVT.getVectorNumElements();
41469 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
41470 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
41471 Depth + 1, AssumeSingleUse))
41472 return true;
41473 [[fallthrough]];
41474 }
41475 case X86ISD::VSHLI:
41476 case X86ISD::VSRLI:
41477 case X86ISD::VSRAI: {
41478 SDValue Src = Op.getOperand(0);
41479 APInt SrcUndef;
41480 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
41481 Depth + 1))
41482 return true;
41483
41484 // Fold shift(0,x) -> 0
41485 if (DemandedElts.isSubsetOf(KnownZero))
41486 return TLO.CombineTo(
41487 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41488
41489 // Aggressively peek through ops to get at the demanded elts.
41490 if (!DemandedElts.isAllOnes())
41491 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41492 Src, DemandedElts, TLO.DAG, Depth + 1))
41493 return TLO.CombineTo(
41494 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
41495 break;
41496 }
41497 case X86ISD::VPSHA:
41498 case X86ISD::VPSHL:
41499 case X86ISD::VSHLV:
41500 case X86ISD::VSRLV:
41501 case X86ISD::VSRAV: {
41502 APInt LHSUndef, LHSZero;
41503 APInt RHSUndef, RHSZero;
41504 SDValue LHS = Op.getOperand(0);
41505 SDValue RHS = Op.getOperand(1);
41506 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41507 Depth + 1))
41508 return true;
41509
41510 // Fold shift(0,x) -> 0
41511 if (DemandedElts.isSubsetOf(LHSZero))
41512 return TLO.CombineTo(
41513 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
41514
41515 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41516 Depth + 1))
41517 return true;
41518
41519 KnownZero = LHSZero;
41520 break;
41521 }
41522 case X86ISD::PCMPEQ:
41523 case X86ISD::PCMPGT: {
41524 APInt LHSUndef, LHSZero;
41525 APInt RHSUndef, RHSZero;
41526 SDValue LHS = Op.getOperand(0);
41527 SDValue RHS = Op.getOperand(1);
41528 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41529 Depth + 1))
41530 return true;
41531 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41532 Depth + 1))
41533 return true;
41534 break;
41535 }
41536 case X86ISD::KSHIFTL: {
41537 SDValue Src = Op.getOperand(0);
41538 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41539 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41540 unsigned ShiftAmt = Amt->getZExtValue();
41541
41542 if (ShiftAmt == 0)
41543 return TLO.CombineTo(Op, Src);
41544
41545 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
41546 // single shift. We can do this if the bottom bits (which are shifted
41547 // out) are never demanded.
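// e.g. kshiftl(kshiftr(k, 4), 6) with the low 6 elements not demanded becomes
// kshiftl(k, 2).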
41548 if (Src.getOpcode() == X86ISD::KSHIFTR) {
41549 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
41550 unsigned C1 = Src.getConstantOperandVal(1);
41551 unsigned NewOpc = X86ISD::KSHIFTL;
41552 int Diff = ShiftAmt - C1;
41553 if (Diff < 0) {
41554 Diff = -Diff;
41555 NewOpc = X86ISD::KSHIFTR;
41556 }
41557
41558 SDLoc dl(Op);
41559 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41560 return TLO.CombineTo(
41561 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41562 }
41563 }
41564
41565 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
41566 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41567 Depth + 1))
41568 return true;
41569
41570 KnownUndef <<= ShiftAmt;
41571 KnownZero <<= ShiftAmt;
41572 KnownZero.setLowBits(ShiftAmt);
41573 break;
41574 }
41575 case X86ISD::KSHIFTR: {
41576 SDValue Src = Op.getOperand(0);
41577 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
41578 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
41579 unsigned ShiftAmt = Amt->getZExtValue();
41580
41581 if (ShiftAmt == 0)
41582 return TLO.CombineTo(Op, Src);
41583
41584 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
41585 // single shift. We can do this if the top bits (which are shifted
41586 // out) are never demanded.
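// e.g. kshiftr(kshiftl(k, 4), 6) with the top 6 elements not demanded becomes
// kshiftr(k, 2).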
41587 if (Src.getOpcode() == X86ISD::KSHIFTL) {
41588 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
41589 unsigned C1 = Src.getConstantOperandVal(1);
41590 unsigned NewOpc = X86ISD::KSHIFTR;
41591 int Diff = ShiftAmt - C1;
41592 if (Diff < 0) {
41593 Diff = -Diff;
41594 NewOpc = X86ISD::KSHIFTL;
41595 }
41596
41597 SDLoc dl(Op);
41598 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
41599 return TLO.CombineTo(
41600 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
41601 }
41602 }
41603
41604 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
41605 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
41606 Depth + 1))
41607 return true;
41608
41609 KnownUndef.lshrInPlace(ShiftAmt);
41610 KnownZero.lshrInPlace(ShiftAmt);
41611 KnownZero.setHighBits(ShiftAmt);
41612 break;
41613 }
41614 case X86ISD::ANDNP: {
41615 // ANDNP = (~LHS & RHS);
41616 SDValue LHS = Op.getOperand(0);
41617 SDValue RHS = Op.getOperand(1);
41618
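// If one operand is a constant we can narrow what the other operand has to
// provide: a zero element in RHS (or an all-ones element in LHS, which makes
// ~LHS zero) forces that result element to zero regardless of the other input,
// so the corresponding element of the other operand is not demanded.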
41619 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
41620 APInt UndefElts;
41621 SmallVector<APInt> EltBits;
41622 int NumElts = VT.getVectorNumElements();
41623 int EltSizeInBits = VT.getScalarSizeInBits();
41624 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
41625 APInt OpElts = DemandedElts;
41626 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41627 EltBits)) {
41628 OpBits.clearAllBits();
41629 OpElts.clearAllBits();
41630 for (int I = 0; I != NumElts; ++I) {
41631 if (!DemandedElts[I])
41632 continue;
41633 if (UndefElts[I]) {
41634 // We can't assume an undef src element gives an undef dst - the
41635 // other src might be zero.
41636 OpBits.setAllBits();
41637 OpElts.setBit(I);
41638 } else if ((Invert && !EltBits[I].isAllOnes()) ||
41639 (!Invert && !EltBits[I].isZero())) {
41640 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
41641 OpElts.setBit(I);
41642 }
41643 }
41644 }
41645 return std::make_pair(OpBits, OpElts);
41646 };
41647 APInt BitsLHS, EltsLHS;
41648 APInt BitsRHS, EltsRHS;
41649 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
41650 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
41651
41652 APInt LHSUndef, LHSZero;
41653 APInt RHSUndef, RHSZero;
41654 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
41655 Depth + 1))
41656 return true;
41657 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
41658 Depth + 1))
41659 return true;
41660
41661 if (!DemandedElts.isAllOnes()) {
41662 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
41663 TLO.DAG, Depth + 1);
41664 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
41665 TLO.DAG, Depth + 1);
41666 if (NewLHS || NewRHS) {
41667 NewLHS = NewLHS ? NewLHS : LHS;
41668 NewRHS = NewRHS ? NewRHS : RHS;
41669 return TLO.CombineTo(
41670 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
41671 }
41672 }
41673 break;
41674 }
41675 case X86ISD::CVTSI2P:
41676 case X86ISD::CVTUI2P:
41677 case X86ISD::CVTPH2PS:
41678 case X86ISD::CVTPS2PH: {
41679 SDValue Src = Op.getOperand(0);
41680 MVT SrcVT = Src.getSimpleValueType();
41681 APInt SrcUndef, SrcZero;
41682 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41683 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41684 Depth + 1))
41685 return true;
41686 break;
41687 }
41688 case X86ISD::PACKSS:
41689 case X86ISD::PACKUS: {
41690 SDValue N0 = Op.getOperand(0);
41691 SDValue N1 = Op.getOperand(1);
41692
41693 APInt DemandedLHS, DemandedRHS;
41694 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41695
41696 APInt LHSUndef, LHSZero;
41697 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41698 Depth + 1))
41699 return true;
41700 APInt RHSUndef, RHSZero;
41701 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41702 Depth + 1))
41703 return true;
41704
41705 // TODO - pass on known zero/undef.
41706
41707 // Aggressively peek through ops to get at the demanded elts.
41708 // TODO - we should do this for all target/faux shuffles ops.
41709 if (!DemandedElts.isAllOnes()) {
41710 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41711 TLO.DAG, Depth + 1);
41712 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41713 TLO.DAG, Depth + 1);
41714 if (NewN0 || NewN1) {
41715 NewN0 = NewN0 ? NewN0 : N0;
41716 NewN1 = NewN1 ? NewN1 : N1;
41717 return TLO.CombineTo(Op,
41718 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41719 }
41720 }
41721 break;
41722 }
41723 case X86ISD::HADD:
41724 case X86ISD::HSUB:
41725 case X86ISD::FHADD:
41726 case X86ISD::FHSUB: {
41727 SDValue N0 = Op.getOperand(0);
41728 SDValue N1 = Op.getOperand(1);
41729
41730 APInt DemandedLHS, DemandedRHS;
41731 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
41732
41733 APInt LHSUndef, LHSZero;
41734 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
41735 Depth + 1))
41736 return true;
41737 APInt RHSUndef, RHSZero;
41738 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
41739 Depth + 1))
41740 return true;
41741
41742 // TODO - pass on known zero/undef.
41743
41744 // Aggressively peek through ops to get at the demanded elts.
41745 // TODO: Handle repeated operands.
41746 if (N0 != N1 && !DemandedElts.isAllOnes()) {
41747 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
41748 TLO.DAG, Depth + 1);
41749 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
41750 TLO.DAG, Depth + 1);
41751 if (NewN0 || NewN1) {
41752 NewN0 = NewN0 ? NewN0 : N0;
41753 NewN1 = NewN1 ? NewN1 : N1;
41754 return TLO.CombineTo(Op,
41755 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
41756 }
41757 }
41758 break;
41759 }
41760 case X86ISD::VTRUNC:
41761 case X86ISD::VTRUNCS:
41762 case X86ISD::VTRUNCUS: {
41763 SDValue Src = Op.getOperand(0);
41764 MVT SrcVT = Src.getSimpleValueType();
41765 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
41766 APInt SrcUndef, SrcZero;
41767 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
41768 Depth + 1))
41769 return true;
41770 KnownZero = SrcZero.zextOrTrunc(NumElts);
41771 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
41772 break;
41773 }
41774 case X86ISD::BLENDV: {
41775 APInt SelUndef, SelZero;
41776 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
41777 SelZero, TLO, Depth + 1))
41778 return true;
41779
41780 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
41781 APInt LHSUndef, LHSZero;
41782 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
41783 LHSZero, TLO, Depth + 1))
41784 return true;
41785
41786 APInt RHSUndef, RHSZero;
41787 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
41788 RHSZero, TLO, Depth + 1))
41789 return true;
41790
41791 KnownZero = LHSZero & RHSZero;
41792 KnownUndef = LHSUndef & RHSUndef;
41793 break;
41794 }
41795 case X86ISD::VZEXT_MOVL: {
41796 // If upper demanded elements are already zero then we have nothing to do.
41797 SDValue Src = Op.getOperand(0);
41798 APInt DemandedUpperElts = DemandedElts;
41799 DemandedUpperElts.clearLowBits(1);
41800 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
41801 return TLO.CombineTo(Op, Src);
41802 break;
41803 }
41804 case X86ISD::VZEXT_LOAD: {
41805 // If upper demanded elements are not demanded then simplify to a
41806 // scalar_to_vector(load()).
41807 MVT SVT = VT.getSimpleVT().getVectorElementType();
41808 if (DemandedElts == 1 && Op.getValue(1).use_empty() && isTypeLegal(SVT)) {
41809 SDLoc DL(Op);
41810 auto *Mem = cast<MemSDNode>(Op);
41811 SDValue Elt = TLO.DAG.getLoad(SVT, DL, Mem->getChain(), Mem->getBasePtr(),
41812 Mem->getMemOperand());
41813 SDValue Vec = TLO.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Elt);
41814 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Vec));
41815 }
41816 break;
41817 }
41818 case X86ISD::VBROADCAST: {
41819 SDValue Src = Op.getOperand(0);
41820 MVT SrcVT = Src.getSimpleValueType();
41821 if (!SrcVT.isVector())
41822 break;
41823 // Don't bother broadcasting if we just need the 0'th element.
41824 if (DemandedElts == 1) {
41825 if (Src.getValueType() != VT)
41826 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
41827 SDLoc(Op));
41828 return TLO.CombineTo(Op, Src);
41829 }
41830 APInt SrcUndef, SrcZero;
41831 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
41832 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
41833 Depth + 1))
41834 return true;
41835 // Aggressively peek through src to get at the demanded elt.
41836 // TODO - we should do this for all target/faux shuffles ops.
41837 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
41838 Src, SrcElts, TLO.DAG, Depth + 1))
41839 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
41840 break;
41841 }
41842 case X86ISD::VPERMV:
41843 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
41844 Depth))
41845 return true;
41846 break;
41847 case X86ISD::PSHUFB:
41848 case X86ISD::VPERMV3:
41849 case X86ISD::VPERMILPV:
41850 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
41851 Depth))
41852 return true;
41853 break;
41854 case X86ISD::VPPERM:
41855 case X86ISD::VPERMIL2:
41856 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
41857 Depth))
41858 return true;
41859 break;
41860 }
41861
41862 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
41863 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
41864 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
41865 if ((VT.is256BitVector() || VT.is512BitVector()) &&
41866 DemandedElts.lshr(NumElts / 2) == 0) {
41867 unsigned SizeInBits = VT.getSizeInBits();
41868 unsigned ExtSizeInBits = SizeInBits / 2;
41869
41870 // See if 512-bit ops only use the bottom 128-bits.
41871 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
41872 ExtSizeInBits = SizeInBits / 4;
41873
41874 switch (Opc) {
41875 // Scalar broadcast.
41876 case X86ISD::VBROADCAST: {
41877 SDLoc DL(Op);
41878 SDValue Src = Op.getOperand(0);
41879 if (Src.getValueSizeInBits() > ExtSizeInBits)
41880 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
41881 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41882 ExtSizeInBits / VT.getScalarSizeInBits());
41883 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
41884 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41885 TLO.DAG, DL, ExtSizeInBits));
41886 }
41887 case X86ISD::VBROADCAST_LOAD: {
41888 SDLoc DL(Op);
41889 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41890 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41891 ExtSizeInBits / VT.getScalarSizeInBits());
41892 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
41893 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
41894 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
41895 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
41896 MemIntr->getMemOperand());
41897 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41898 Bcst.getValue(1));
41899 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
41900 TLO.DAG, DL, ExtSizeInBits));
41901 }
41902 // Subvector broadcast.
41903 case X86ISD::SUBV_BROADCAST_LOAD: {
41904 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
41905 EVT MemVT = MemIntr->getMemoryVT();
41906 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
41907 SDLoc DL(Op);
41908 SDValue Ld =
41909 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
41910 MemIntr->getBasePtr(), MemIntr->getMemOperand());
41911 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
41912 Ld.getValue(1));
41913 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
41914 TLO.DAG, DL, ExtSizeInBits));
41915 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
41916 SDLoc DL(Op);
41917 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
41918 ExtSizeInBits / VT.getScalarSizeInBits());
41919 if (SDValue BcstLd =
41920 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
41921 return TLO.CombineTo(Op,
41922 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
41923 TLO.DAG, DL, ExtSizeInBits));
41924 }
41925 break;
41926 }
41927 // Byte shifts by immediate.
41928 case X86ISD::VSHLDQ:
41929 case X86ISD::VSRLDQ:
41930 // Shift by uniform.
41931 case X86ISD::VSHL:
41932 case X86ISD::VSRL:
41933 case X86ISD::VSRA:
41934 // Shift by immediate.
41935 case X86ISD::VSHLI:
41936 case X86ISD::VSRLI:
41937 case X86ISD::VSRAI: {
41938 SDLoc DL(Op);
41939 SDValue Ext0 =
41940 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
41941 SDValue ExtOp =
41942 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
41943 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41944 SDValue Insert =
41945 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41946 return TLO.CombineTo(Op, Insert);
41947 }
41948 case X86ISD::VPERMI: {
41949 // Simplify PERMPD/PERMQ to extract_subvector.
41950 // TODO: This should be done in shuffle combining.
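// e.g. VPERMQ with imm 0xEE (mask <2,3,2,3>) that only has its low half
// demanded is just an extract of the source's upper 128 bits.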
41951 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
41952 SmallVector<int, 8> Mask;
41953 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
41954 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
41955 SDLoc DL(Op);
41956 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
41957 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41958 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
41959 return TLO.CombineTo(Op, Insert);
41960 }
41961 }
41962 break;
41963 }
41964 case X86ISD::VPERM2X128: {
41965 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
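// Only the low 128 bits are demanded here: imm bit 3 set means that half is
// zeroed, otherwise imm bits [1:0] pick which source operand and which half
// to extract (e.g. 0x3 selects the upper half of the second source operand).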
41966 SDLoc DL(Op);
41967 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
41968 if (LoMask & 0x8)
41969 return TLO.CombineTo(
41970 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
41971 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
41972 unsigned SrcIdx = (LoMask & 0x2) >> 1;
41973 SDValue ExtOp =
41974 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
41975 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
41976 SDValue Insert =
41977 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
41978 return TLO.CombineTo(Op, Insert);
41979 }
41980 // Zero upper elements.
41981 case X86ISD::VZEXT_MOVL:
41982 // Target unary shuffles by immediate:
41983 case X86ISD::PSHUFD:
41984 case X86ISD::PSHUFLW:
41985 case X86ISD::PSHUFHW:
41986 case X86ISD::VPERMILPI:
41987 // (Non-Lane Crossing) Target Shuffles.
41988 case X86ISD::VPERMILPV:
41989 case X86ISD::VPERMIL2:
41990 case X86ISD::PSHUFB:
41991 case X86ISD::UNPCKL:
41992 case X86ISD::UNPCKH:
41993 case X86ISD::BLENDI:
41994 // Integer ops.
41995 case X86ISD::PACKSS:
41996 case X86ISD::PACKUS:
41997 case X86ISD::PCMPEQ:
41998 case X86ISD::PCMPGT:
41999 case X86ISD::PMULUDQ:
42000 case X86ISD::PMULDQ:
42001 case X86ISD::VSHLV:
42002 case X86ISD::VSRLV:
42003 case X86ISD::VSRAV:
42004 // Float ops.
42005 case X86ISD::FMAX:
42006 case X86ISD::FMIN:
42007 case X86ISD::FMAXC:
42008 case X86ISD::FMINC:
42009 // Horizontal Ops.
42010 case X86ISD::HADD:
42011 case X86ISD::HSUB:
42012 case X86ISD::FHADD:
42013 case X86ISD::FHSUB: {
42014 SDLoc DL(Op);
42015 SmallVector<SDValue, 4> Ops;
42016 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42017 SDValue SrcOp = Op.getOperand(i);
42018 EVT SrcVT = SrcOp.getValueType();
42019 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42020 "Unsupported vector size");
42021 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42022 ExtSizeInBits)
42023 : SrcOp);
42024 }
42025 MVT ExtVT = VT.getSimpleVT();
42026 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42027 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42028 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42029 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42030 SDValue Insert =
42031 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42032 return TLO.CombineTo(Op, Insert);
42033 }
42034 }
42035 }
42036
42037 // For splats, unless we *only* demand the 0'th element, stop attempts at
42038 // simplification here: we aren't going to improve things, and keeping the
42039 // splat is better than any potential shuffle.
42040 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42041 return false;
42042
42043 // Get target/faux shuffle mask.
42044 APInt OpUndef, OpZero;
42045 SmallVector<int, 64> OpMask;
42046 SmallVector<SDValue, 2> OpInputs;
42047 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42048 OpZero, TLO.DAG, Depth, false))
42049 return false;
42050
42051 // Shuffle inputs must be the same size as the result.
42052 if (OpMask.size() != (unsigned)NumElts ||
42053 llvm::any_of(OpInputs, [VT](SDValue V) {
42054 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42055 !V.getValueType().isVector();
42056 }))
42057 return false;
42058
42059 KnownZero = OpZero;
42060 KnownUndef = OpUndef;
42061
42062 // Check if shuffle mask can be simplified to undef/zero/identity.
42063 int NumSrcs = OpInputs.size();
42064 for (int i = 0; i != NumElts; ++i)
42065 if (!DemandedElts[i])
42066 OpMask[i] = SM_SentinelUndef;
42067
42068 if (isUndefInRange(OpMask, 0, NumElts)) {
42069 KnownUndef.setAllBits();
42070 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42071 }
42072 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42073 KnownZero.setAllBits();
42074 return TLO.CombineTo(
42075 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42076 }
42077 for (int Src = 0; Src != NumSrcs; ++Src)
42078 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42079 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42080
42081 // Attempt to simplify inputs.
42082 for (int Src = 0; Src != NumSrcs; ++Src) {
42083 // TODO: Support inputs of different types.
42084 if (OpInputs[Src].getValueType() != VT)
42085 continue;
42086
42087 int Lo = Src * NumElts;
42088 APInt SrcElts = APInt::getZero(NumElts);
42089 for (int i = 0; i != NumElts; ++i)
42090 if (DemandedElts[i]) {
42091 int M = OpMask[i] - Lo;
42092 if (0 <= M && M < NumElts)
42093 SrcElts.setBit(M);
42094 }
42095
42096 // TODO - Propagate input undef/zero elts.
42097 APInt SrcUndef, SrcZero;
42098 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42099 TLO, Depth + 1))
42100 return true;
42101 }
42102
42103 // If we don't demand all elements, then attempt to combine to a simpler
42104 // shuffle.
42105 // We need to convert the depth to something combineX86ShufflesRecursively
42106 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42107 // to match. This prevents combineX86ShuffleChain from returning a
42108 // combined shuffle that's the same as the original root, causing an
42109 // infinite loop.
42110 if (!DemandedElts.isAllOnes()) {
42111 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42112
42113 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42114 for (int i = 0; i != NumElts; ++i)
42115 if (DemandedElts[i])
42116 DemandedMask[i] = i;
42117
42118 SDValue NewShuffle = combineX86ShufflesRecursively(
42119 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42120 /*HasVarMask*/ false,
42121 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42122 Subtarget);
42123 if (NewShuffle)
42124 return TLO.CombineTo(Op, NewShuffle);
42125 }
42126
42127 return false;
42128}
42129
42131 SDValue Op, const APInt &OriginalDemandedBits,
42132 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42133 unsigned Depth) const {
42134 EVT VT = Op.getValueType();
42135 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42136 unsigned Opc = Op.getOpcode();
42137 switch(Opc) {
42138 case X86ISD::VTRUNC: {
42139 KnownBits KnownOp;
42140 SDValue Src = Op.getOperand(0);
42141 MVT SrcVT = Src.getSimpleValueType();
42142
42143 // Simplify the input, using demanded bit information.
42144 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42145 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42146 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42147 return true;
42148 break;
42149 }
42150 case X86ISD::PMULDQ:
42151 case X86ISD::PMULUDQ: {
42152 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42153 KnownBits KnownLHS, KnownRHS;
42154 SDValue LHS = Op.getOperand(0);
42155 SDValue RHS = Op.getOperand(1);
42156
42157 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42158 // FIXME: Can we bound this better?
42159 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42160 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42161 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42162
42163 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42164 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42165 DemandedMaskLHS = DemandedMask;
42166 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42167 DemandedMaskRHS = DemandedMask;
42168
42169 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42170 KnownLHS, TLO, Depth + 1))
42171 return true;
42172 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42173 KnownRHS, TLO, Depth + 1))
42174 return true;
42175
42176 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42177 KnownRHS = KnownRHS.trunc(32);
42178 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42179 KnownRHS.getConstant().isOne()) {
42180 SDLoc DL(Op);
42181 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42182 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42183 }
42184
42185 // Aggressively peek through ops to get at the demanded low bits.
42186 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42187 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42188 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42189 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42190 if (DemandedLHS || DemandedRHS) {
42191 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42192 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42193 return TLO.CombineTo(
42194 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42195 }
42196 break;
42197 }
42198 case X86ISD::ANDNP: {
42199 KnownBits Known2;
42200 SDValue Op0 = Op.getOperand(0);
42201 SDValue Op1 = Op.getOperand(1);
42202
42203 if (SimplifyDemandedBits(Op1, OriginalDemandedBits, OriginalDemandedElts,
42204 Known, TLO, Depth + 1))
42205 return true;
42206 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42207
42208 if (SimplifyDemandedBits(Op0, ~Known.Zero & OriginalDemandedBits,
42209 OriginalDemandedElts, Known2, TLO, Depth + 1))
42210 return true;
42211 assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
42212
42213 // If the RHS is a constant, see if we can simplify it.
42214 if (ShrinkDemandedConstant(Op, ~Known2.One & OriginalDemandedBits,
42215 OriginalDemandedElts, TLO))
42216 return true;
42217
42218 // ANDNP = (~Op0 & Op1);
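// A result bit is known one only where Op0 is known zero and Op1 is known
// one, and known zero wherever Op0 is known one or Op1 is known zero -
// hence the combination of Known (Op1) with Known2 (Op0) below.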
42219 Known.One &= Known2.Zero;
42220 Known.Zero |= Known2.One;
42221 break;
42222 }
42223 case X86ISD::VSHLI: {
42224 SDValue Op0 = Op.getOperand(0);
42225
42226 unsigned ShAmt = Op.getConstantOperandVal(1);
42227 if (ShAmt >= BitWidth)
42228 break;
42229
42230 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42231
42232 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42233 // single shift. We can do this if the bottom bits (which are shifted
42234 // out) are never demanded.
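// e.g. with C1 == 6 and ShAmt == 4 this becomes a single VSRLI by 2, and
// when the two amounts match the shifts cancel and X is used as-is.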
42235 if (Op0.getOpcode() == X86ISD::VSRLI &&
42236 OriginalDemandedBits.countr_zero() >= ShAmt) {
42237 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42238 if (Shift2Amt < BitWidth) {
42239 int Diff = ShAmt - Shift2Amt;
42240 if (Diff == 0)
42241 return TLO.CombineTo(Op, Op0.getOperand(0));
42242
42243 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42244 SDValue NewShift = TLO.DAG.getNode(
42245 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42246 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42247 return TLO.CombineTo(Op, NewShift);
42248 }
42249 }
42250
42251 // If we are only demanding sign bits then we can use the shift source directly.
42252 unsigned NumSignBits =
42253 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42254 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
42255 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42256 return TLO.CombineTo(Op, Op0);
42257
42258 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42259 TLO, Depth + 1))
42260 return true;
42261
42262 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42263 Known.Zero <<= ShAmt;
42264 Known.One <<= ShAmt;
42265
42266 // Low bits known zero.
42267 Known.Zero.setLowBits(ShAmt);
42268 return false;
42269 }
42270 case X86ISD::VSRLI: {
42271 unsigned ShAmt = Op.getConstantOperandVal(1);
42272 if (ShAmt >= BitWidth)
42273 break;
42274
42275 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42276
42277 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42278 OriginalDemandedElts, Known, TLO, Depth + 1))
42279 return true;
42280
42281 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42282 Known.Zero.lshrInPlace(ShAmt);
42283 Known.One.lshrInPlace(ShAmt);
42284
42285 // High bits known zero.
42286 Known.Zero.setHighBits(ShAmt);
42287 return false;
42288 }
42289 case X86ISD::VSRAI: {
42290 SDValue Op0 = Op.getOperand(0);
42291 SDValue Op1 = Op.getOperand(1);
42292
42293 unsigned ShAmt = Op1->getAsZExtVal();
42294 if (ShAmt >= BitWidth)
42295 break;
42296
42297 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42298
42299 // If we just want the sign bit then we don't need to shift it.
42300 if (OriginalDemandedBits.isSignMask())
42301 return TLO.CombineTo(Op, Op0);
42302
42303 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42304 if (Op0.getOpcode() == X86ISD::VSHLI &&
42305 Op.getOperand(1) == Op0.getOperand(1)) {
42306 SDValue Op00 = Op0.getOperand(0);
42307 unsigned NumSignBits =
42308 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42309 if (ShAmt < NumSignBits)
42310 return TLO.CombineTo(Op, Op00);
42311 }
42312
42313 // If any of the demanded bits are produced by the sign extension, we also
42314 // demand the input sign bit.
42315 if (OriginalDemandedBits.countl_zero() < ShAmt)
42316 DemandedMask.setSignBit();
42317
42318 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42319 TLO, Depth + 1))
42320 return true;
42321
42322 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42323 Known.Zero.lshrInPlace(ShAmt);
42324 Known.One.lshrInPlace(ShAmt);
42325
42326 // If the input sign bit is known to be zero, or if none of the top bits
42327 // are demanded, turn this into an unsigned shift right.
42328 if (Known.Zero[BitWidth - ShAmt - 1] ||
42329 OriginalDemandedBits.countl_zero() >= ShAmt)
42330 return TLO.CombineTo(
42331 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42332
42333 // High bits are known one.
42334 if (Known.One[BitWidth - ShAmt - 1])
42335 Known.One.setHighBits(ShAmt);
42336 return false;
42337 }
42338 case X86ISD::BLENDV: {
42339 SDValue Sel = Op.getOperand(0);
42340 SDValue LHS = Op.getOperand(1);
42341 SDValue RHS = Op.getOperand(2);
42342
42343 APInt SignMask = APInt::getSignMask(BitWidth);
42344 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42345 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42346 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42347 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42348 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42349 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42350
42351 if (NewSel || NewLHS || NewRHS) {
42352 NewSel = NewSel ? NewSel : Sel;
42353 NewLHS = NewLHS ? NewLHS : LHS;
42354 NewRHS = NewRHS ? NewRHS : RHS;
42355 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42356 NewSel, NewLHS, NewRHS));
42357 }
42358 break;
42359 }
42360 case X86ISD::PEXTRB:
42361 case X86ISD::PEXTRW: {
42362 SDValue Vec = Op.getOperand(0);
42363 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42364 MVT VecVT = Vec.getSimpleValueType();
42365 unsigned NumVecElts = VecVT.getVectorNumElements();
42366
42367 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42368 unsigned Idx = CIdx->getZExtValue();
42369 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42370
42371 // If we demand no bits from the vector then we must have demanded
42372 // bits from the implicit zext - simplify to zero.
42373 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42374 if (DemandedVecBits == 0)
42375 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42376
42377 APInt KnownUndef, KnownZero;
42378 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42379 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42380 KnownZero, TLO, Depth + 1))
42381 return true;
42382
42383 KnownBits KnownVec;
42384 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42385 KnownVec, TLO, Depth + 1))
42386 return true;
42387
42388 if (SDValue V = SimplifyMultipleUseDemandedBits(
42389 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42390 return TLO.CombineTo(
42391 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42392
42393 Known = KnownVec.zext(BitWidth);
42394 return false;
42395 }
42396 break;
42397 }
42398 case X86ISD::PINSRB:
42399 case X86ISD::PINSRW: {
42400 SDValue Vec = Op.getOperand(0);
42401 SDValue Scl = Op.getOperand(1);
42402 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42403 MVT VecVT = Vec.getSimpleValueType();
42404
42405 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42406 unsigned Idx = CIdx->getZExtValue();
42407 if (!OriginalDemandedElts[Idx])
42408 return TLO.CombineTo(Op, Vec);
42409
42410 KnownBits KnownVec;
42411 APInt DemandedVecElts(OriginalDemandedElts);
42412 DemandedVecElts.clearBit(Idx);
42413 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42414 KnownVec, TLO, Depth + 1))
42415 return true;
42416
42417 KnownBits KnownScl;
42418 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42419 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42420 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42421 return true;
42422
42423 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42424 Known = KnownVec.intersectWith(KnownScl);
42425 return false;
42426 }
42427 break;
42428 }
42429 case X86ISD::PACKSS:
42430 // PACKSS saturates to MIN/MAX integer values. So if we just want the
42431 // sign bit then we can just ask for the source operands' sign bits.
42432 // TODO - add known bits handling.
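// e.g. (movmsk (packss (pcmpgt a, b), (pcmpgt c, d))) only reads each packed
// element's sign bit, which is the sign bit of the corresponding wider
// compare result, so the demand is pushed straight through the pack.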
42433 if (OriginalDemandedBits.isSignMask()) {
42434 APInt DemandedLHS, DemandedRHS;
42435 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42436
42437 KnownBits KnownLHS, KnownRHS;
42438 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42439 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42440 KnownLHS, TLO, Depth + 1))
42441 return true;
42442 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42443 KnownRHS, TLO, Depth + 1))
42444 return true;
42445
42446 // Attempt to avoid multi-use ops if we don't need anything from them.
42447 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42448 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42449 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42450 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42451 if (DemandedOp0 || DemandedOp1) {
42452 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42453 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42454 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42455 }
42456 }
42457 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42458 break;
42459 case X86ISD::VBROADCAST: {
42460 SDValue Src = Op.getOperand(0);
42461 MVT SrcVT = Src.getSimpleValueType();
42462 APInt DemandedElts = APInt::getOneBitSet(
42463 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42464 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42465 TLO, Depth + 1))
42466 return true;
42467 // If we don't need the upper bits, attempt to narrow the broadcast source.
42468 // Don't attempt this on AVX512 as it might affect broadcast folding.
42469 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
42470 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42471 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
42472 Src->hasOneUse()) {
42473 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42474 SDValue NewSrc =
42475 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42476 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42477 SDValue NewBcst =
42478 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42479 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42480 }
42481 break;
42482 }
42483 case X86ISD::PCMPGT:
42484 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42485 // iff we only need the sign bit then we can use R directly.
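// e.g. (movmsk (pcmpgt zero, X)) reads the same per-element sign bits as
// (movmsk X), so the compare can be dropped entirely.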
42486 if (OriginalDemandedBits.isSignMask() &&
42487 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42488 return TLO.CombineTo(Op, Op.getOperand(1));
42489 break;
42490 case X86ISD::MOVMSK: {
42491 SDValue Src = Op.getOperand(0);
42492 MVT SrcVT = Src.getSimpleValueType();
42493 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42494 unsigned NumElts = SrcVT.getVectorNumElements();
42495
42496 // If we don't need the sign bits at all just return zero.
42497 if (OriginalDemandedBits.countr_zero() >= NumElts)
42498 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42499
42500 // See if we only demand bits from the lower 128-bit vector.
42501 if (SrcVT.is256BitVector() &&
42502 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42503 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42504 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42505 }
42506
42507 // Only demand the vector elements of the sign bits we need.
42508 APInt KnownUndef, KnownZero;
42509 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
42510 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
42511 TLO, Depth + 1))
42512 return true;
42513
42514 Known.Zero = KnownZero.zext(BitWidth);
42515 Known.Zero.setHighBits(BitWidth - NumElts);
42516
42517 // MOVMSK only uses the MSB from each vector element.
42518 KnownBits KnownSrc;
42519 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
42520 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
42521 Depth + 1))
42522 return true;
42523
42524 if (KnownSrc.One[SrcBits - 1])
42525 Known.One.setLowBits(NumElts);
42526 else if (KnownSrc.Zero[SrcBits - 1])
42527 Known.Zero.setLowBits(NumElts);
42528
42529 // Attempt to avoid multi-use ops if we don't need anything from it.
42530 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
42531 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
42532 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42533 return false;
42534 }
42535 case X86ISD::TESTP: {
42536 SDValue Op0 = Op.getOperand(0);
42537 SDValue Op1 = Op.getOperand(1);
42538 MVT OpVT = Op0.getSimpleValueType();
42539 assert((OpVT.getVectorElementType() == MVT::f32 ||
42540 OpVT.getVectorElementType() == MVT::f64) &&
42541 "Illegal vector type for X86ISD::TESTP");
42542
42543 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
42544 KnownBits KnownSrc;
42545 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
42546 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
42547 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
42548 AssumeSingleUse) ||
42549 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
42550 AssumeSingleUse);
42551 }
42552 case X86ISD::BEXTR:
42553 case X86ISD::BEXTRI: {
42554 SDValue Op0 = Op.getOperand(0);
42555 SDValue Op1 = Op.getOperand(1);
42556
42557 // Only bottom 16-bits of the control bits are required.
42558 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
42559 // NOTE: SimplifyDemandedBits won't do this for constants.
42560 uint64_t Val1 = Cst1->getZExtValue();
42561 uint64_t MaskedVal1 = Val1 & 0xFFFF;
42562 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
42563 SDLoc DL(Op);
42564 return TLO.CombineTo(
42565 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
42566 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
42567 }
42568
42569 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
42570 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
42571
42572 // If the length is 0, the result is 0.
42573 if (Length == 0) {
42574 Known.setAllZero();
42575 return false;
42576 }
42577
42578 if ((Shift + Length) <= BitWidth) {
42579 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
42580 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
42581 return true;
42582
42583 Known = Known.extractBits(Length, Shift);
42584 Known = Known.zextOrTrunc(BitWidth);
42585 return false;
42586 }
42587 } else {
42588 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
42589 KnownBits Known1;
42590 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
42591 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
42592 return true;
42593
42594 // If the length is 0, replace with 0.
42595 KnownBits LengthBits = Known1.extractBits(8, 8);
42596 if (LengthBits.isZero())
42597 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42598 }
42599
42600 break;
42601 }
42602 case X86ISD::PDEP: {
42603 SDValue Op0 = Op.getOperand(0);
42604 SDValue Op1 = Op.getOperand(1);
42605
42606 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
42607 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
42608
42609 // If the demanded bits have leading zeroes, we don't demand those from the
42610 // mask.
42611 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
42612 return true;
42613
42614 // The number of possible 1s in the mask determines the number of LSBs of
42615 // operand 0 used. Undemanded bits from the mask don't matter so filter
42616 // them before counting.
42617 KnownBits Known2;
42618 uint64_t Count = (~Known.Zero & LoMask).popcount();
42619 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
42620 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
42621 return true;
42622
42623 // Zeroes are retained from the mask, but not ones.
42624 Known.One.clearAllBits();
42625 // The result will have at least as many trailing zeros as the non-mask
42626 // operand since bits can only map to the same or higher bit position.
42627 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
42628 return false;
42629 }
42630 }
42631
42632 return TargetLowering::SimplifyDemandedBitsForTargetNode(
42633 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
42634}
42635
42636 SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42637 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
42638 SelectionDAG &DAG, unsigned Depth) const {
42639 int NumElts = DemandedElts.getBitWidth();
42640 unsigned Opc = Op.getOpcode();
42641 EVT VT = Op.getValueType();
42642
42643 switch (Opc) {
42644 case X86ISD::PINSRB:
42645 case X86ISD::PINSRW: {
42646 // If we don't demand the inserted element, return the base vector.
42647 SDValue Vec = Op.getOperand(0);
42648 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42649 MVT VecVT = Vec.getSimpleValueType();
42650 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
42651 !DemandedElts[CIdx->getZExtValue()])
42652 return Vec;
42653 break;
42654 }
42655 case X86ISD::VSHLI: {
42656 // If we are only demanding sign bits then we can use the shift source
42657 // directly.
42658 SDValue Op0 = Op.getOperand(0);
42659 unsigned ShAmt = Op.getConstantOperandVal(1);
42660 unsigned BitWidth = DemandedBits.getBitWidth();
42661 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
42662 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
42663 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42664 return Op0;
42665 break;
42666 }
42667 case X86ISD::VSRAI:
42668 // iff we only need the sign bit then we can use the source directly.
42669 // TODO: generalize where we only demand extended signbits.
42670 if (DemandedBits.isSignMask())
42671 return Op.getOperand(0);
42672 break;
42673 case X86ISD::PCMPGT:
42674 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42675 // iff we only need the sign bit then we can use R directly.
42676 if (DemandedBits.isSignMask() &&
42677 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42678 return Op.getOperand(1);
42679 break;
42680 case X86ISD::BLENDV: {
42681 // BLENDV: Cond (MSB) ? LHS : RHS
42682 SDValue Cond = Op.getOperand(0);
42683 SDValue LHS = Op.getOperand(1);
42684 SDValue RHS = Op.getOperand(2);
42685
42686 KnownBits CondKnown = DAG.computeKnownBits(Cond, DemandedElts, Depth + 1);
42687 if (CondKnown.isNegative())
42688 return LHS;
42689 if (CondKnown.isNonNegative())
42690 return RHS;
42691 break;
42692 }
42693 case X86ISD::ANDNP: {
42694 // ANDNP = (~LHS & RHS);
42695 SDValue LHS = Op.getOperand(0);
42696 SDValue RHS = Op.getOperand(1);
42697
42698 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
42699 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
42700
42701 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
42702 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
42703 // this context, so return RHS.
42704 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
42705 return RHS;
42706 break;
42707 }
42708 }
42709
42710 APInt ShuffleUndef, ShuffleZero;
42711 SmallVector<int, 16> ShuffleMask;
42712 SmallVector<SDValue, 2> ShuffleOps;
42713 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
42714 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
42715 // If all the demanded elts are from one operand and are inline,
42716 // then we can use the operand directly.
42717 int NumOps = ShuffleOps.size();
42718 if (ShuffleMask.size() == (unsigned)NumElts &&
42719 llvm::all_of(ShuffleOps, [VT](SDValue V) {
42720 return VT.getSizeInBits() == V.getValueSizeInBits();
42721 })) {
42722
42723 if (DemandedElts.isSubsetOf(ShuffleUndef))
42724 return DAG.getUNDEF(VT);
42725 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
42726 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
42727
42728 // Bitmask that indicates which ops have only been accessed 'inline'.
42729 APInt IdentityOp = APInt::getAllOnes(NumOps);
42730 for (int i = 0; i != NumElts; ++i) {
42731 int M = ShuffleMask[i];
42732 if (!DemandedElts[i] || ShuffleUndef[i])
42733 continue;
42734 int OpIdx = M / NumElts;
42735 int EltIdx = M % NumElts;
42736 if (M < 0 || EltIdx != i) {
42737 IdentityOp.clearAllBits();
42738 break;
42739 }
42740 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
42741 if (IdentityOp == 0)
42742 break;
42743 }
42744 assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
42745 "Multiple identity shuffles detected");
42746
42747 if (IdentityOp != 0)
42748 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
42749 }
42750 }
42751
42752 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
42753 Op, DemandedBits, DemandedElts, DAG, Depth);
42754}
42755
42756 bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42757 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42758 bool PoisonOnly, unsigned Depth) const {
42759 unsigned NumElts = DemandedElts.getBitWidth();
42760
42761 // TODO: Add more target shuffles.
42762 switch (Op.getOpcode()) {
42763 case X86ISD::PSHUFD:
42764 case X86ISD::VPERMILPI: {
42765 SmallVector<int, 8> Mask;
42766 SmallVector<SDValue, 2> Ops;
42767 if (getTargetShuffleMask(Op, true, Ops, Mask)) {
42768 SmallVector<APInt, 2> DemandedSrcElts(Ops.size(),
42769 APInt::getZero(NumElts));
42770 for (auto M : enumerate(Mask)) {
42771 if (!DemandedElts[M.index()] || M.value() == SM_SentinelZero)
42772 continue;
42773 if (M.value() == SM_SentinelUndef)
42774 return false;
42775 assert(0 <= M.value() && M.value() < (int)(Ops.size() * NumElts) &&
42776 "Shuffle mask index out of range");
42777 DemandedSrcElts[M.value() / NumElts].setBit(M.value() % NumElts);
42778 }
42779 for (auto Op : enumerate(Ops))
42780 if (!DemandedSrcElts[Op.index()].isZero() &&
42781 !DAG.isGuaranteedNotToBeUndefOrPoison(
42782 Op.value(), DemandedSrcElts[Op.index()], PoisonOnly, Depth + 1))
42783 return false;
42784 return true;
42785 }
42786 break;
42787 }
42788 }
42789 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
42790 Op, DemandedElts, DAG, PoisonOnly, Depth);
42791}
42792
42793 bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
42794 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
42795 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
42796
42797 // TODO: Add more target shuffles.
42798 switch (Op.getOpcode()) {
42799 case X86ISD::PSHUFD:
42800 case X86ISD::VPERMILPI:
42801 case X86ISD::UNPCKH:
42802 case X86ISD::UNPCKL:
42803 return false;
42804 }
42805 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
42806 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
42807}
42808
42809 bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
42810 const APInt &DemandedElts,
42811 APInt &UndefElts,
42812 const SelectionDAG &DAG,
42813 unsigned Depth) const {
42814 unsigned NumElts = DemandedElts.getBitWidth();
42815 unsigned Opc = Op.getOpcode();
42816
42817 switch (Opc) {
42818 case X86ISD::VBROADCAST:
42819 case X86ISD::VBROADCAST_LOAD:
42820 UndefElts = APInt::getZero(NumElts);
42821 return true;
42822 }
42823
42824 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
42825 DAG, Depth);
42826}
42827
42828// Helper to peek through bitops/trunc/setcc to determine size of source vector.
42829// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
42830static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
42831 bool AllowTruncate) {
42832 switch (Src.getOpcode()) {
42833 case ISD::TRUNCATE:
42834 if (!AllowTruncate)
42835 return false;
42836 [[fallthrough]];
42837 case ISD::SETCC:
42838 return Src.getOperand(0).getValueSizeInBits() == Size;
42839 case ISD::FREEZE:
42840 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate);
42841 case ISD::AND:
42842 case ISD::XOR:
42843 case ISD::OR:
42844 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
42845 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
42846 case ISD::SELECT:
42847 case ISD::VSELECT:
42848 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
42849 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
42850 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
42851 case ISD::BUILD_VECTOR:
42852 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
42853 ISD::isBuildVectorAllOnes(Src.getNode());
42854 }
42855 return false;
42856}
42857
42858// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
42859static unsigned getAltBitOpcode(unsigned Opcode) {
42860 switch(Opcode) {
42861 // clang-format off
42862 case ISD::AND: return X86ISD::FAND;
42863 case ISD::OR: return X86ISD::FOR;
42864 case ISD::XOR: return X86ISD::FXOR;
42865 case X86ISD::ANDNP: return X86ISD::FANDN;
42866 // clang-format on
42867 }
42868 llvm_unreachable("Unknown bitwise opcode");
42869}
42870
42871// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
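// With only SSE1 there are no integer vector ops, so a v4i1 source built from
// v4i32 sign-bit compares is re-expressed as v4f32 bitwise ops (FAND/FOR/FXOR)
// that can feed MOVMSKPS directly.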
42872 static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
42873 const SDLoc &DL) {
42874 EVT SrcVT = Src.getValueType();
42875 if (SrcVT != MVT::v4i1)
42876 return SDValue();
42877
42878 switch (Src.getOpcode()) {
42879 case ISD::SETCC:
42880 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
42881 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
42882 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
42883 SDValue Op0 = Src.getOperand(0);
42884 if (ISD::isNormalLoad(Op0.getNode()))
42885 return DAG.getBitcast(MVT::v4f32, Op0);
42886 if (Op0.getOpcode() == ISD::BITCAST &&
42887 Op0.getOperand(0).getValueType() == MVT::v4f32)
42888 return Op0.getOperand(0);
42889 }
42890 break;
42891 case ISD::AND:
42892 case ISD::XOR:
42893 case ISD::OR: {
42894 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
42895 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
42896 if (Op0 && Op1)
42897 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
42898 Op1);
42899 break;
42900 }
42901 }
42902 return SDValue();
42903}
42904
42905// Helper to push sign extension of vXi1 SETCC result through bitops.
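// e.g. sext(and(setcc, setcc)) is rewritten as and(sext(setcc), sext(setcc)),
// so each compare is widened independently before the MOVMSK is formed.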
42906 static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
42907 SDValue Src, const SDLoc &DL) {
42908 switch (Src.getOpcode()) {
42909 case ISD::SETCC:
42910 case ISD::FREEZE:
42911 case ISD::TRUNCATE:
42912 case ISD::BUILD_VECTOR:
42913 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
42914 case ISD::AND:
42915 case ISD::XOR:
42916 case ISD::OR:
42917 return DAG.getNode(
42918 Src.getOpcode(), DL, SExtVT,
42919 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
42920 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
42921 case ISD::SELECT:
42922 case ISD::VSELECT:
42923 return DAG.getSelect(
42924 DL, SExtVT, Src.getOperand(0),
42925 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
42926 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
42927 }
42928 llvm_unreachable("Unexpected node type for vXi1 sign extension");
42929}
42930
42931// Try to match patterns such as
42932// (i16 bitcast (v16i1 x))
42933// ->
42934 // (i16 movmsk (v16i8 sext (v16i1 x)))
42935// before the illegal vector is scalarized on subtargets that don't have legal
42936// vxi1 types.
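// e.g. an i16 bitcast of a v16i1 compare result becomes a single PMOVMSKB of
// the sign-extended v16i8 compare, instead of scalarizing 16 i1 extracts.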
42937 static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
42938 const SDLoc &DL,
42939 const X86Subtarget &Subtarget) {
42940 EVT SrcVT = Src.getValueType();
42941 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
42942 return SDValue();
42943
42944 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
42945 // legalization destroys the v4i32 type.
42946 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
42947 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
42948 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
42949 DAG.getBitcast(MVT::v4f32, V));
42950 return DAG.getZExtOrTrunc(V, DL, VT);
42951 }
42952 }
42953
42954 // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a
42955 // movmskb even with avx512. This will be better than truncating to vXi1 and
42956 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
42957 // vpcmpeqb/vpcmpgtb.
42958 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
42959 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
42960 Src.getOperand(0).getValueType() == MVT::v32i8 ||
42961 Src.getOperand(0).getValueType() == MVT::v64i8);
42962
42963 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
42964 // directly with vpmovmskb/vmovmskps/vmovmskpd.
42965 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
42966 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
42967 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
42968 EVT CmpVT = Src.getOperand(0).getValueType();
42969 EVT EltVT = CmpVT.getVectorElementType();
42970 if (CmpVT.getSizeInBits() <= 256 &&
42971 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
42972 PreferMovMsk = true;
42973 }
42974
42975 // With AVX512 vxi1 types are legal and we prefer using k-regs.
42976 // MOVMSK is supported in SSE2 or later.
42977 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
42978 return SDValue();
42979
42980 // If the upper ops of a concatenation are undef, then try to bitcast the
42981 // lower op and extend.
42982 SmallVector<SDValue, 4> SubSrcOps;
42983 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
42984 SubSrcOps.size() >= 2) {
42985 SDValue LowerOp = SubSrcOps[0];
42986 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
42987 if (LowerOp.getOpcode() == ISD::SETCC &&
42988 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
42989 EVT SubVT = VT.getIntegerVT(
42990 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
42991 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
42992 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
42993 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
42994 }
42995 }
42996 }
42997
42998 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
42999 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
43000 // v8i16 and v16i16.
43001 // For these two cases, we can shuffle the upper element bytes to a
43002 // consecutive sequence at the start of the vector and treat the results as
43003 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43004 // for v16i16 this is not the case, because the shuffle is expensive, so we
43005 // avoid sign-extending to this type entirely.
43006 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43007 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43008 MVT SExtVT;
43009 bool PropagateSExt = false;
43010 switch (SrcVT.getSimpleVT().SimpleTy) {
43011 default:
43012 return SDValue();
43013 case MVT::v2i1:
43014 SExtVT = MVT::v2i64;
43015 break;
43016 case MVT::v4i1:
43017 SExtVT = MVT::v4i32;
43018 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43019 // sign-extend to a 256-bit operation to avoid truncation.
43020 if (Subtarget.hasAVX() &&
43021 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43022 SExtVT = MVT::v4i64;
43023 PropagateSExt = true;
43024 }
43025 break;
43026 case MVT::v8i1:
43027 SExtVT = MVT::v8i16;
43028 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43029 // sign-extend to a 256-bit operation to match the compare.
43030 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43031 // 256-bit because the shuffle is cheaper than sign extending the result of
43032 // the compare.
43033 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43034 checkBitcastSrcVectorSize(Src, 512, true))) {
43035 SExtVT = MVT::v8i32;
43036 PropagateSExt = true;
43037 }
43038 break;
43039 case MVT::v16i1:
43040 SExtVT = MVT::v16i8;
43041 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43042 // it is not profitable to sign-extend to 256-bit because this will
43043 // require an extra cross-lane shuffle which is more expensive than
43044 // truncating the result of the compare to 128-bits.
43045 break;
43046 case MVT::v32i1:
43047 SExtVT = MVT::v32i8;
43048 break;
43049 case MVT::v64i1:
43050 // If we have AVX512F but not AVX512BW, and the input is truncated from
43051 // v64i8 (checked earlier), then split the input and make two pmovmskbs.
43052 if (Subtarget.hasAVX512()) {
43053 if (Subtarget.hasBWI())
43054 return SDValue();
43055 SExtVT = MVT::v64i8;
43056 break;
43057 }
43058 // Split if this is a <64 x i8> comparison result.
43059 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43060 SExtVT = MVT::v64i8;
43061 break;
43062 }
43063 return SDValue();
43064 };
43065
43066 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43067 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43068
43069 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43070 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43071 } else {
43072 if (SExtVT == MVT::v8i16) {
43073 V = widenSubVector(V, false, Subtarget, DAG, DL, 256);
43074 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v16i8, V);
43075 }
43076 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43077 }
43078
43079 EVT IntVT =
43080 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43081 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43082 return DAG.getBitcast(VT, V);
43083}
43084
43085// Convert a vXi1 constant build vector to the same width scalar integer.
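// e.g. (v8i1 <1,0,1,1,0,0,0,0>) becomes the i8 constant 0b00001101, with undef
// elements treated as zero.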
43086 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43087 EVT SrcVT = Op.getValueType();
43088 assert(SrcVT.getVectorElementType() == MVT::i1 &&
43089 "Expected a vXi1 vector");
43090 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43091 "Expected a constant build vector");
43092
43093 APInt Imm(SrcVT.getVectorNumElements(), 0);
43094 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43095 SDValue In = Op.getOperand(Idx);
43096 if (!In.isUndef() && (In->getAsZExtVal() & 0x1))
43097 Imm.setBit(Idx);
43098 }
43099 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43100 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43101}
43102
43103 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43104 TargetLowering::DAGCombinerInfo &DCI,
43105 const X86Subtarget &Subtarget) {
43106 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43107
43108 if (!DCI.isBeforeLegalizeOps())
43109 return SDValue();
43110
43111 // Only do this if we have k-registers.
43112 if (!Subtarget.hasAVX512())
43113 return SDValue();
43114
43115 EVT DstVT = N->getValueType(0);
43116 SDValue Op = N->getOperand(0);
43117 EVT SrcVT = Op.getValueType();
43118
43119 if (!Op.hasOneUse())
43120 return SDValue();
43121
43122 // Look for logic ops.
43123 if (Op.getOpcode() != ISD::AND &&
43124 Op.getOpcode() != ISD::OR &&
43125 Op.getOpcode() != ISD::XOR)
43126 return SDValue();
43127
43128 // Make sure we have a bitcast between mask registers and a scalar type.
43129 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43130 DstVT.isScalarInteger()) &&
43131 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43132 SrcVT.isScalarInteger()))
43133 return SDValue();
43134
43135 SDValue LHS = Op.getOperand(0);
43136 SDValue RHS = Op.getOperand(1);
43137
43138 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43139 LHS.getOperand(0).getValueType() == DstVT)
43140 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43141 DAG.getBitcast(DstVT, RHS));
43142
43143 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43144 RHS.getOperand(0).getValueType() == DstVT)
43145 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43146 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43147
43148 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43149 // Most of these have to move a constant from the scalar domain anyway.
43150 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43151 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43152 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43153 DAG.getBitcast(DstVT, LHS), RHS);
43154 }
43155
43156 return SDValue();
43157}
43158
43159 static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43160 const X86Subtarget &Subtarget) {
43161 SDLoc DL(BV);
43162 unsigned NumElts = BV->getNumOperands();
43163 SDValue Splat = BV->getSplatValue();
43164
43165 // Build MMX element from integer GPR or SSE float values.
43166 auto CreateMMXElement = [&](SDValue V) {
43167 if (V.isUndef())
43168 return DAG.getUNDEF(MVT::x86mmx);
43169 if (V.getValueType().isFloatingPoint()) {
43170 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43171 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43172 V = DAG.getBitcast(MVT::v2i64, V);
43173 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43174 }
43175 V = DAG.getBitcast(MVT::i32, V);
43176 } else {
43177 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43178 }
43179 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43180 };
43181
43182 // Convert build vector ops to MMX data in the bottom elements.
43183 SmallVector<SDValue, 8> Ops;
43184
43185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43186
43187 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
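// e.g. to splat an i8: punpcklbw duplicates the byte into a 16-bit pair, then
// pshufw with an all-zero mask repeats that pair across all four 16-bit lanes.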
43188 if (Splat) {
43189 if (Splat.isUndef())
43190 return DAG.getUNDEF(MVT::x86mmx);
43191
43192 Splat = CreateMMXElement(Splat);
43193
43194 if (Subtarget.hasSSE1()) {
43195 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43196 if (NumElts == 8)
43197 Splat = DAG.getNode(
43198 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43199 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43200 TLI.getPointerTy(DAG.getDataLayout())),
43201 Splat, Splat);
43202
43203 // Use PSHUFW to repeat 16-bit elements.
43204 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43205 return DAG.getNode(
43206 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43207 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43208 TLI.getPointerTy(DAG.getDataLayout())),
43209 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43210 }
43211 Ops.append(NumElts, Splat);
43212 } else {
43213 for (unsigned i = 0; i != NumElts; ++i)
43214 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43215 }
43216
43217 // Use tree of PUNPCKLs to build up general MMX vector.
43218 while (Ops.size() > 1) {
43219 unsigned NumOps = Ops.size();
43220 unsigned IntrinOp =
43221 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43222 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43223 : Intrinsic::x86_mmx_punpcklbw));
43224 SDValue Intrin = DAG.getTargetConstant(
43225 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43226 for (unsigned i = 0; i != NumOps; i += 2)
43227 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43228 Ops[i], Ops[i + 1]);
43229 Ops.resize(NumOps / 2);
43230 }
43231
43232 return Ops[0];
43233}
43234
43235// Recursive function that attempts to find if a bool vector node was originally
43236// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43237// integer. If so, replace the scalar ops with bool vector equivalents back down
43238// the chain.
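// e.g. an i16 OR whose operands were both bitcast from v16i1 masks can be
// rewritten as a v16i1 OR, keeping the value in a k-register rather than
// round-tripping through a GPR.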
43239 static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43240 SelectionDAG &DAG,
43241 const X86Subtarget &Subtarget) {
43242 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43243 unsigned Opc = V.getOpcode();
43244 switch (Opc) {
43245 case ISD::BITCAST: {
43246 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43247 SDValue Src = V.getOperand(0);
43248 EVT SrcVT = Src.getValueType();
43249 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43250 return DAG.getBitcast(VT, Src);
43251 break;
43252 }
43253 case ISD::TRUNCATE: {
43254 // If we find a suitable source, a truncated scalar becomes a subvector.
43255 SDValue Src = V.getOperand(0);
43256 EVT NewSrcVT =
43257 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43258 if (TLI.isTypeLegal(NewSrcVT))
43259 if (SDValue N0 =
43260 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43261 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43262 DAG.getIntPtrConstant(0, DL));
43263 break;
43264 }
43265 case ISD::ANY_EXTEND:
43266 case ISD::ZERO_EXTEND: {
43267 // If we find a suitable source, an extended scalar becomes a subvector.
43268 SDValue Src = V.getOperand(0);
43269 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43270 Src.getScalarValueSizeInBits());
43271 if (TLI.isTypeLegal(NewSrcVT))
43272 if (SDValue N0 =
43273 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43274 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43275 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43276 : DAG.getConstant(0, DL, VT),
43277 N0, DAG.getIntPtrConstant(0, DL));
43278 break;
43279 }
43280 case ISD::OR: {
43281 // If we find suitable sources, we can just move an OR to the vector domain.
43282 SDValue Src0 = V.getOperand(0);
43283 SDValue Src1 = V.getOperand(1);
43284 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43285 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43286 return DAG.getNode(Opc, DL, VT, N0, N1);
43287 break;
43288 }
43289 case ISD::SHL: {
43290 // If we find a suitable source, a SHL becomes a KSHIFTL.
43291 SDValue Src0 = V.getOperand(0);
43292 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43293 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43294 break;
43295
43296 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43297 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43298 return DAG.getNode(
43299 X86ISD::KSHIFTL, DL, VT, N0,
43300 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43301 break;
43302 }
43303 }
43304 return SDValue();
43305}
43306
43307 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43308 TargetLowering::DAGCombinerInfo &DCI,
43309 const X86Subtarget &Subtarget) {
43310 SDValue N0 = N->getOperand(0);
43311 EVT VT = N->getValueType(0);
43312 EVT SrcVT = N0.getValueType();
43313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43314
43315 // Try to match patterns such as
43316 // (i16 bitcast (v16i1 x))
43317 // ->
43318 // (i16 movmsk (v16i8 sext (v16i1 x)))
43319 // before the setcc result is scalarized on subtargets that don't have legal
43320 // vxi1 types.
43321 if (DCI.isBeforeLegalize()) {
43322 SDLoc dl(N);
43323 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43324 return V;
43325
43326 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43327 // type, widen both sides to avoid a trip through memory.
43328 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43329 Subtarget.hasAVX512()) {
43330 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43331 N0 = DAG.getBitcast(MVT::v8i1, N0);
43332 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43333 DAG.getIntPtrConstant(0, dl));
43334 }
43335
43336 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43337 // type, widen both sides to avoid a trip through memory.
43338 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43339 Subtarget.hasAVX512()) {
43340 // Use zeros for the widening if we already have some zeroes. This can
43341 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43342 // stream of this.
43343 // FIXME: It might make sense to detect a concat_vectors with a mix of
43344 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43345 // a separate combine. What we can't do is canonicalize the operands of
43346 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43347 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43348 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43349 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43350 SrcVT = LastOp.getValueType();
43351 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43352 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43353 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43354 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43355 N0 = DAG.getBitcast(MVT::i8, N0);
43356 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43357 }
43358 }
43359
43360 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43361 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43362 Ops[0] = N0;
43363 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43364 N0 = DAG.getBitcast(MVT::i8, N0);
43365 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43366 }
43367 } else {
43368 // If we're bitcasting from iX to vXi1, see if the integer originally
43369 // began as a vXi1 and whether we can remove the bitcast entirely.
43370 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43371 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43372 if (SDValue V =
43373 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43374 return V;
43375 }
43376 }
43377
43378 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43379 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43380 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43381 // we can help with known bits propagation from the vXi1 domain to the
43382 // scalar domain.
43383 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43384 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43385 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43386 isNullConstant(N0.getOperand(1)))
43387 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43388 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43389
43390 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43391 // and the vbroadcast_load are both integer or both fp. In some cases this
43392 // will remove the bitcast entirely.
43393 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43394 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43395 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43396 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43397 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43398 // Don't swap i8/i16 since we don't have fp types of that size.
43399 if (MemSize >= 32) {
43400 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43401 : MVT::getIntegerVT(MemSize);
43402 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43403 : MVT::getIntegerVT(SrcVTSize);
43404 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43405
43406 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43407 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43408 SDValue ResNode =
43409 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43410 MemVT, BCast->getMemOperand());
43411 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43412 return DAG.getBitcast(VT, ResNode);
43413 }
43414 }
43415
43416 // Since MMX types are special and don't usually play with other vector types,
43417 // it's better to handle them early to be sure we emit efficient code by
43418 // avoiding store-load conversions.
43419 if (VT == MVT::x86mmx) {
43420 // Detect MMX constant vectors.
43421 APInt UndefElts;
43422 SmallVector<APInt, 1> EltBits;
43423 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits,
43424 /*AllowWholeUndefs*/ true,
43425 /*AllowPartialUndefs*/ true)) {
43426 SDLoc DL(N0);
43427 // Handle zero-extension of i32 with MOVD.
43428 if (EltBits[0].countl_zero() >= 32)
43429 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43430 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43431 // Else, bitcast to a double.
43432 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43433 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43434 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43435 }
43436
43437 // Detect bitcasts to x86mmx low word.
43438 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43439 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43440 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43441 bool LowUndef = true, AllUndefOrZero = true;
43442 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43443 SDValue Op = N0.getOperand(i);
43444 LowUndef &= Op.isUndef() || (i >= e/2);
43445 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43446 }
43447 if (AllUndefOrZero) {
43448 SDValue N00 = N0.getOperand(0);
43449 SDLoc dl(N00);
43450 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43451 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43452 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43453 }
43454 }
43455
43456 // Detect bitcasts of 64-bit build vectors and convert to a
43457 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43458 // lowest element.
43459 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43460 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43461 SrcVT == MVT::v8i8))
43462 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43463
43464 // Detect bitcasts between element or subvector extraction to x86mmx.
43465 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43466 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43467 isNullConstant(N0.getOperand(1))) {
43468 SDValue N00 = N0.getOperand(0);
43469 if (N00.getValueType().is128BitVector())
43470 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43471 DAG.getBitcast(MVT::v2i64, N00));
43472 }
43473
43474 // Detect bitcasts from FP_TO_SINT to x86mmx.
43475 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43476 SDLoc DL(N0);
43477 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43478 DAG.getUNDEF(MVT::v2i32));
43479 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43480 DAG.getBitcast(MVT::v2i64, Res));
43481 }
43482 }
43483
43484 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43485 // most of these to scalar anyway.
43486 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43487 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43488 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43489 return combinevXi1ConstantToInteger(N0, DAG);
43490 }
43491
43492 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43493 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43494 isa<ConstantSDNode>(N0)) {
43495 auto *C = cast<ConstantSDNode>(N0);
43496 if (C->isAllOnes())
43497 return DAG.getConstant(1, SDLoc(N0), VT);
43498 if (C->isZero())
43499 return DAG.getConstant(0, SDLoc(N0), VT);
43500 }
43501
43502 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43503 // Turn it into a sign bit compare that produces a k-register. This avoids
43504 // a trip through a GPR.
43505 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43506 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43507 isPowerOf2_32(VT.getVectorNumElements())) {
43508 unsigned NumElts = VT.getVectorNumElements();
43509 SDValue Src = N0;
43510
43511 // Peek through truncate.
43512 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43513 Src = N0.getOperand(0);
43514
43515 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43516 SDValue MovmskIn = Src.getOperand(0);
43517 MVT MovmskVT = MovmskIn.getSimpleValueType();
43518 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43519
43520 // We allow extra bits of the movmsk to be used since they are known zero.
43521 // We can't convert a VPMOVMSKB without avx512bw.
43522 if (MovMskElts <= NumElts &&
43523 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43524 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43525 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43526 SDLoc dl(N);
43527 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43528 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43529 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43530 if (EVT(CmpVT) == VT)
43531 return Cmp;
43532
43533 // Pad with zeroes up to original VT to replace the zeroes that were
43534 // being used from the MOVMSK.
43535 unsigned NumConcats = NumElts / MovMskElts;
43536 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43537 Ops[0] = Cmp;
43538 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43539 }
43540 }
43541 }
43542
43543 // Try to remove bitcasts from input and output of mask arithmetic to
43544 // remove GPR<->K-register crossings.
43545 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43546 return V;
43547
43548 // Convert a bitcasted integer logic operation that has one bitcasted
43549 // floating-point operand into a floating-point logic operation. This may
43550 // create a load of a constant, but that is cheaper than materializing the
43551 // constant in an integer register and transferring it to an SSE register or
43552 // transferring the SSE operand to integer register and back.
43553 unsigned FPOpcode;
43554 switch (N0.getOpcode()) {
43555 // clang-format off
43556 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43557 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43558 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43559 default: return SDValue();
43560 // clang-format on
43561 }
43562
43563 // Check if we have a bitcast from another integer type as well.
43564 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43565 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43566 (Subtarget.hasFP16() && VT == MVT::f16) ||
43567 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43568 TLI.isTypeLegal(VT))))
43569 return SDValue();
43570
43571 SDValue LogicOp0 = N0.getOperand(0);
43572 SDValue LogicOp1 = N0.getOperand(1);
43573 SDLoc DL0(N0);
43574
43575 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43576 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43577 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43578 LogicOp0.getOperand(0).getValueType() == VT &&
43579 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43580 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43581 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43582 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
43583 }
43584 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
43585 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
43586 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
43587 LogicOp1.getOperand(0).getValueType() == VT &&
43588 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
43589 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
43590 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
43591 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
43592 }
43593
43594 return SDValue();
43595}
43596
43597 // (mul (zext a), (sext b))
43598static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
43599 SDValue &Op1) {
43600 Op0 = Mul.getOperand(0);
43601 Op1 = Mul.getOperand(1);
43602
43603 // Operand 1 should be the sign-extended operand.
43604 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
43605 std::swap(Op0, Op1);
43606
43607 auto IsFreeTruncation = [](SDValue &Op) -> bool {
43608 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
43609 Op.getOpcode() == ISD::SIGN_EXTEND) &&
43610 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
43611 return true;
43612
43613 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
43614 return (BV && BV->isConstant());
43615 };
43616
43617 // (dpbusd (zext a), (sext b)). Since the first operand should be an unsigned
43618 // value, we need to check that Op0 is a zero-extended value. Op1 should be a
43619 // signed value, so we just check its significant bits.
43620 if ((IsFreeTruncation(Op0) &&
43621 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
43622 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
43623 return true;
43624
43625 return false;
43626}
43627
43628 // Given an ABS node, detect the following pattern:
43629// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
43630// This is useful as it is the input into a SAD pattern.
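// e.g. (abs (sub (zext v16i8 a to v16i32), (zext v16i8 b to v16i32))) is the
// element-wise absolute difference that PSADBW can accumulate.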
43631static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
43632 SDValue AbsOp1 = Abs->getOperand(0);
43633 if (AbsOp1.getOpcode() != ISD::SUB)
43634 return false;
43635
43636 Op0 = AbsOp1.getOperand(0);
43637 Op1 = AbsOp1.getOperand(1);
43638
43639 // Check if the operands of the sub are zero-extended from vectors of i8.
43640 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
43641 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
43642 Op1.getOpcode() != ISD::ZERO_EXTEND ||
43643 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
43644 return false;
43645
43646 return true;
43647}
43648
43649 static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
43650 unsigned &LogBias, const SDLoc &DL,
43651 const X86Subtarget &Subtarget) {
43652 // Extend or truncate to MVT::i8 first.
43653 MVT Vi8VT =
43654 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
43655 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
43656 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
43657
43658 // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element
43659 // C[0] = C[0] + A[0]B[0] + A[1]B[1] + A[2]B[2] + A[3]B[3].
43660 // The src A, B element type is i8, but the dst C element type is i32.
43661 // When we calculate the reduction stages, we use the vXi8 src vector type,
43662 // so we need a LogBias of 2 to avoid 2 extra stages.
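// In effect each i32 result lane of VPDPBUSD already sums four byte products,
// which replaces log2(4) = 2 of the halving steps a plain vXi8 reduction
// would otherwise need.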
43663 LogBias = 2;
43664
43665 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
43666 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
43667 RegSize = std::max(512u, RegSize);
43668
43669 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
43670 // fill in the missing vector elements with 0.
43671 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
43672 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
43673 Ops[0] = LHS;
43674 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43675 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43676 Ops[0] = RHS;
43677 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43678
43679 // Actually build the DotProduct, split as 256/512 bits for
43680 // AVXVNNI/AVX512VNNI.
43681 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43682 ArrayRef<SDValue> Ops) {
43683 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
43684 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
43685 };
43686 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
43687 SDValue Zero = DAG.getConstant(0, DL, DpVT);
43688
43689 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
43690 DpBuilder, false);
43691}
43692
43693// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
43694// to these zexts.
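// For reference, PSADBW(<16 x i8> a, <16 x i8> b) produces <2 x i64> where each
// 64-bit lane holds sum(|a[i] - b[i]|) over the corresponding 8 byte pairs
// (illustrative summary of the instruction semantics).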
43695static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
43696 const SDValue &Zext1, const SDLoc &DL,
43697 const X86Subtarget &Subtarget) {
43698 // Find the appropriate width for the PSADBW.
43699 EVT InVT = Zext0.getOperand(0).getValueType();
43700 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
43701
43702   // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
43703   // fill in the missing vector elements with 0.
43704 unsigned NumConcat = RegSize / InVT.getSizeInBits();
43705 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
43706 Ops[0] = Zext0.getOperand(0);
43707 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
43708 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43709 Ops[0] = Zext1.getOperand(0);
43710 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
43711
43712 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
43713 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
43714 ArrayRef<SDValue> Ops) {
43715 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
43716 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
43717 };
43718 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
43719 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
43720 PSADBWBuilder);
43721}
43722
43723 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
43724// PHMINPOSUW.
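// PHMINPOSUW only computes an unsigned minimum, so the other reductions are
// mapped onto it with an XOR mask, e.g. UMAX(x) == NOT(UMIN(NOT(x))); SMIN and
// SMAX XOR with the signed min/max constant before and after the UMIN
// (illustrative note on the masking below).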
43725 static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
43726                                       const X86Subtarget &Subtarget) {
43727 // Bail without SSE41.
43728 if (!Subtarget.hasSSE41())
43729 return SDValue();
43730
43731 EVT ExtractVT = Extract->getValueType(0);
43732 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
43733 return SDValue();
43734
43735 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
43736 ISD::NodeType BinOp;
43737 SDValue Src = DAG.matchBinOpReduction(
43738 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
43739 if (!Src)
43740 return SDValue();
43741
43742 EVT SrcVT = Src.getValueType();
43743 EVT SrcSVT = SrcVT.getScalarType();
43744 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
43745 return SDValue();
43746
43747 SDLoc DL(Extract);
43748 SDValue MinPos = Src;
43749
43750 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
43751 while (SrcVT.getSizeInBits() > 128) {
43752 SDValue Lo, Hi;
43753 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
43754 SrcVT = Lo.getValueType();
43755 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
43756 }
43757 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
43758 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
43759 "Unexpected value type");
43760
43761   // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
43762 // to flip the value accordingly.
43763 SDValue Mask;
43764 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
43765 if (BinOp == ISD::SMAX)
43766 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
43767 else if (BinOp == ISD::SMIN)
43768 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
43769 else if (BinOp == ISD::UMAX)
43770 Mask = DAG.getAllOnesConstant(DL, SrcVT);
43771
43772 if (Mask)
43773 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43774
43775 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
43776   // shuffling each upper element down and inserting zeros. This means that the
43777 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
43778 // ready for the PHMINPOS.
43779 if (ExtractVT == MVT::i8) {
43780     SDValue Upper = DAG.getVectorShuffle(
43781         SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
43782 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
43783 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
43784 }
43785
43786   // Perform the PHMINPOS on a v8i16 vector.
43787 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
43788 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
43789 MinPos = DAG.getBitcast(SrcVT, MinPos);
43790
43791 if (Mask)
43792 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
43793
43794 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
43795 DAG.getIntPtrConstant(0, DL));
43796}
43797
43798// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
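// For example, an all_of reduction of a v4i32 compare becomes
// (MOVMSKPS(cmp) == 0xF), an any_of becomes (MOVMSK != 0), and a parity
// reduction becomes PARITY(MOVMSK) (illustrative mapping of the cases below).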
43799 static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
43800                                          const X86Subtarget &Subtarget) {
43801 // Bail without SSE2.
43802 if (!Subtarget.hasSSE2())
43803 return SDValue();
43804
43805 EVT ExtractVT = Extract->getValueType(0);
43806 unsigned BitWidth = ExtractVT.getSizeInBits();
43807 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
43808 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
43809 return SDValue();
43810
43811 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
43812 ISD::NodeType BinOp;
43813 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
43814 if (!Match && ExtractVT == MVT::i1)
43815 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
43816 if (!Match)
43817 return SDValue();
43818
43819 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
43820 // which we can't support here for now.
43821 if (Match.getScalarValueSizeInBits() != BitWidth)
43822 return SDValue();
43823
43824 SDValue Movmsk;
43825 SDLoc DL(Extract);
43826 EVT MatchVT = Match.getValueType();
43827 unsigned NumElts = MatchVT.getVectorNumElements();
43828 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
43829 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43830 LLVMContext &Ctx = *DAG.getContext();
43831
43832 if (ExtractVT == MVT::i1) {
43833 // Special case for (pre-legalization) vXi1 reductions.
43834 if (NumElts > 64 || !isPowerOf2_32(NumElts))
43835 return SDValue();
43836 if (Match.getOpcode() == ISD::SETCC) {
43837 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
43838 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
43839 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
43840 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
43841 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
43842 X86::CondCode X86CC;
43843 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
43844 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
43845 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
43846 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
43847 DAG, X86CC))
43848 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
43849 getSETCC(X86CC, V, DL, DAG));
43850 }
43851 }
43852 if (TLI.isTypeLegal(MatchVT)) {
43853 // If this is a legal AVX512 predicate type then we can just bitcast.
43854 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43855 Movmsk = DAG.getBitcast(MovmskVT, Match);
43856 } else {
43857 // Use combineBitcastvxi1 to create the MOVMSK.
43858 while (NumElts > MaxElts) {
43859 SDValue Lo, Hi;
43860 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43861 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43862 NumElts /= 2;
43863 }
43864 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
43865 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
43866 }
43867 if (!Movmsk)
43868 return SDValue();
43869 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
43870 } else {
43871 // FIXME: Better handling of k-registers or 512-bit vectors?
43872 unsigned MatchSizeInBits = Match.getValueSizeInBits();
43873 if (!(MatchSizeInBits == 128 ||
43874 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
43875 return SDValue();
43876
43877 // Make sure this isn't a vector of 1 element. The perf win from using
43878     // MOVMSK diminishes with fewer elements in the reduction, but it is
43879 // generally better to get the comparison over to the GPRs as soon as
43880 // possible to reduce the number of vector ops.
43881 if (Match.getValueType().getVectorNumElements() < 2)
43882 return SDValue();
43883
43884 // Check that we are extracting a reduction of all sign bits.
43885 if (DAG.ComputeNumSignBits(Match) != BitWidth)
43886 return SDValue();
43887
43888 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
43889 SDValue Lo, Hi;
43890 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
43891 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
43892 MatchSizeInBits = Match.getValueSizeInBits();
43893 }
43894
43895 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
43896 MVT MaskSrcVT;
43897 if (64 == BitWidth || 32 == BitWidth)
43898       MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
43899                                    MatchSizeInBits / BitWidth);
43900 else
43901 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
43902
43903 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
43904 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
43905 NumElts = MaskSrcVT.getVectorNumElements();
43906 }
43907 assert((NumElts <= 32 || NumElts == 64) &&
43908 "Not expecting more than 64 elements");
43909
43910 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
43911 if (BinOp == ISD::XOR) {
43912 // parity -> (PARITY(MOVMSK X))
43913 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
43914 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
43915 }
43916
43917 SDValue CmpC;
43918 ISD::CondCode CondCode;
43919 if (BinOp == ISD::OR) {
43920 // any_of -> MOVMSK != 0
43921 CmpC = DAG.getConstant(0, DL, CmpVT);
43922 CondCode = ISD::CondCode::SETNE;
43923 } else {
43924 // all_of -> MOVMSK == ((1 << NumElts) - 1)
43925 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
43926 DL, CmpVT);
43927 CondCode = ISD::CondCode::SETEQ;
43928 }
43929
43930 // The setcc produces an i8 of 0/1, so extend that to the result width and
43931 // negate to get the final 0/-1 mask value.
43932 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
43933 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
43934 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
43935 return DAG.getNegative(Zext, DL, ExtractVT);
43936}
43937
43938 static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
43939                                       const X86Subtarget &Subtarget) {
43940 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
43941 return SDValue();
43942
43943 EVT ExtractVT = Extract->getValueType(0);
43944 // Verify the type we're extracting is i32, as the output element type of
43945 // vpdpbusd is i32.
43946 if (ExtractVT != MVT::i32)
43947 return SDValue();
43948
43949 EVT VT = Extract->getOperand(0).getValueType();
43950   if (!isPowerOf2_32(VT.getVectorNumElements()))
43951     return SDValue();
43952
43953 // Match shuffle + add pyramid.
43954 ISD::NodeType BinOp;
43955 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
43956
43957 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
43958 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
43959 // before adding into the accumulator.
43960 // TODO:
43961 // We also need to verify that the multiply has at least 2x the number of bits
43962 // of the input. We shouldn't match
43963   // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))).
43964 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
43965 // Root = Root.getOperand(0);
43966
43967 // If there was a match, we want Root to be a mul.
43968 if (!Root || Root.getOpcode() != ISD::MUL)
43969 return SDValue();
43970
43971 // Check whether we have an extend and mul pattern
43972 SDValue LHS, RHS;
43973 if (!detectExtMul(DAG, Root, LHS, RHS))
43974 return SDValue();
43975
43976 // Create the dot product instruction.
43977 SDLoc DL(Extract);
43978 unsigned StageBias;
43979 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
43980
43981 // If the original vector was wider than 4 elements, sum over the results
43982 // in the DP vector.
43983 unsigned Stages = Log2_32(VT.getVectorNumElements());
43984 EVT DpVT = DP.getValueType();
43985
43986 if (Stages > StageBias) {
43987 unsigned DpElems = DpVT.getVectorNumElements();
43988
43989 for (unsigned i = Stages - StageBias; i > 0; --i) {
43990 SmallVector<int, 16> Mask(DpElems, -1);
43991 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
43992 Mask[j] = MaskEnd + j;
43993
43994 SDValue Shuffle =
43995 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
43996 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
43997 }
43998 }
43999
44000 // Return the lowest ExtractSizeInBits bits.
44001 EVT ResVT =
44002 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44003 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44004 DP = DAG.getBitcast(ResVT, DP);
44005 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44006 Extract->getOperand(1));
44007}
44008
44009 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44010                                       const X86Subtarget &Subtarget) {
44011 // PSADBW is only supported on SSE2 and up.
44012 if (!Subtarget.hasSSE2())
44013 return SDValue();
44014
44015 EVT ExtractVT = Extract->getValueType(0);
44016 // Verify the type we're extracting is either i32 or i64.
44017 // FIXME: Could support other types, but this is what we have coverage for.
44018 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44019 return SDValue();
44020
44021 EVT VT = Extract->getOperand(0).getValueType();
44022   if (!isPowerOf2_32(VT.getVectorNumElements()))
44023     return SDValue();
44024
44025 // Match shuffle + add pyramid.
44026 ISD::NodeType BinOp;
44027 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44028
44029 // The operand is expected to be zero extended from i8
44030 // (verified in detectZextAbsDiff).
44031 // In order to convert to i64 and above, additional any/zero/sign
44032 // extend is expected.
44033 // The zero extend from 32 bit has no mathematical effect on the result.
44034   // Also the sign extend is basically a zero extend
44035 // (extends the sign bit which is zero).
44036 // So it is correct to skip the sign/zero extend instruction.
44037 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44038 Root.getOpcode() == ISD::ZERO_EXTEND ||
44039 Root.getOpcode() == ISD::ANY_EXTEND))
44040 Root = Root.getOperand(0);
44041
44042   // If there was a match, we want Root to be an ABS node that is the root of an
44043 // abs-diff pattern.
44044 if (!Root || Root.getOpcode() != ISD::ABS)
44045 return SDValue();
44046
44047   // Check whether we have an abs-diff pattern feeding into the ABS node.
44048 SDValue Zext0, Zext1;
44049 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44050 return SDValue();
44051
44052 // Create the SAD instruction.
44053 SDLoc DL(Extract);
44054 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44055
44056 // If the original vector was wider than 8 elements, sum over the results
44057 // in the SAD vector.
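  // For example, a v64i8 source produces a v8i64 PSADBW result; Stages is
  // log2(64) = 6, so the loop below runs 6 - 3 = 3 rounds of
  // shuffle-high-half + add to collapse the partial sums into element 0
  // (illustrative walk-through of the loop).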
44058 unsigned Stages = Log2_32(VT.getVectorNumElements());
44059 EVT SadVT = SAD.getValueType();
44060 if (Stages > 3) {
44061 unsigned SadElems = SadVT.getVectorNumElements();
44062
44063     for (unsigned i = Stages - 3; i > 0; --i) {
44064       SmallVector<int, 16> Mask(SadElems, -1);
44065       for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44066 Mask[j] = MaskEnd + j;
44067
44068 SDValue Shuffle =
44069 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44070 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44071 }
44072 }
44073
44074 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44075 // Return the lowest ExtractSizeInBits bits.
44076 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44077 SadVT.getSizeInBits() / ExtractSizeInBits);
44078 SAD = DAG.getBitcast(ResVT, SAD);
44079 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44080 Extract->getOperand(1));
44081}
44082
44083// If this extract is from a loaded vector value and will be used as an
44084// integer, that requires a potentially expensive XMM -> GPR transfer.
44085// Additionally, if we can convert to a scalar integer load, that will likely
44086// be folded into a subsequent integer op.
44087// Note: SrcVec might not have a VecVT type, but it must be the same size.
44088// Note: Unlike the related fold for this in DAGCombiner, this is not limited
44089// to a single-use of the loaded vector. For the reasons above, we
44090// expect this to be profitable even if it creates an extra load.
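// For example, (i32 (extractelt (v4i32 (load %p)), 2)) can be rewritten after
// legalization as (i32 (load %p + 8)), avoiding an XMM -> GPR transfer
// (illustrative example; the guards below check the profitability conditions).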
44091static SDValue
44092 combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx,
44093                              const SDLoc &dl, SelectionDAG &DAG,
44094                              TargetLowering::DAGCombinerInfo &DCI) {
44095 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44096 "Only EXTRACT_VECTOR_ELT supported so far");
44097
44098 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44099 EVT VT = N->getValueType(0);
44100
44101 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
44102 return Use->getOpcode() == ISD::STORE ||
44103 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
44104 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
44105 });
44106
44107 auto *LoadVec = dyn_cast<LoadSDNode>(SrcVec);
44108 if (LoadVec && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
44109 VecVT.getVectorElementType() == VT &&
44110 VecVT.getSizeInBits() == SrcVec.getValueSizeInBits() &&
44111 DCI.isAfterLegalizeDAG() && !LikelyUsedAsVector && LoadVec->isSimple()) {
44112 SDValue NewPtr = TLI.getVectorElementPointer(
44113 DAG, LoadVec->getBasePtr(), VecVT, DAG.getVectorIdxConstant(Idx, dl));
44114 unsigned PtrOff = VT.getSizeInBits() * Idx / 8;
44115 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
44116 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
44117 SDValue Load =
44118 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
44119 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
44120 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
44121 return Load;
44122 }
44123
44124 return SDValue();
44125}
44126
44127// Attempt to peek through a target shuffle and extract the scalar from the
44128// source.
44129 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44130                                          TargetLowering::DAGCombinerInfo &DCI,
44131                                          const X86Subtarget &Subtarget) {
44132 if (DCI.isBeforeLegalizeOps())
44133 return SDValue();
44134
44135 SDLoc dl(N);
44136 SDValue Src = N->getOperand(0);
44137 SDValue Idx = N->getOperand(1);
44138
44139 EVT VT = N->getValueType(0);
44140 EVT SrcVT = Src.getValueType();
44141 EVT SrcSVT = SrcVT.getVectorElementType();
44142 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44143 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44144
44145 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44146 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44147 return SDValue();
44148
44149 const APInt &IdxC = N->getConstantOperandAPInt(1);
44150 if (IdxC.uge(NumSrcElts))
44151 return SDValue();
44152
44153 SDValue SrcBC = peekThroughBitcasts(Src);
44154
44155 // Handle extract(bitcast(broadcast(scalar_value))).
44156 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44157 SDValue SrcOp = SrcBC.getOperand(0);
44158 EVT SrcOpVT = SrcOp.getValueType();
44159 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44160 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44161 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44162 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44163 // TODO support non-zero offsets.
44164 if (Offset == 0) {
44165 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44166 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44167 return SrcOp;
44168 }
44169 }
44170 }
44171
44172 // If we're extracting a single element from a broadcast load and there are
44173 // no other users, just create a single load.
44174 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44175 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44176 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44177 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44178 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44179 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44180 MemIntr->getBasePtr(),
44181 MemIntr->getPointerInfo(),
44182 MemIntr->getOriginalAlign(),
44183 MemIntr->getMemOperand()->getFlags());
44184 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44185 return Load;
44186 }
44187 }
44188
44189 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44190 // TODO: Move to DAGCombine?
44191 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44192 SrcBC.getValueType().isInteger() &&
44193 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44194 SrcBC.getScalarValueSizeInBits() ==
44195 SrcBC.getOperand(0).getValueSizeInBits()) {
44196 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44197 if (IdxC.ult(Scale)) {
44198 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44199 SDValue Scl = SrcBC.getOperand(0);
44200 EVT SclVT = Scl.getValueType();
44201 if (Offset) {
44202 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44203 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44204 }
44205 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44206 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44207 return Scl;
44208 }
44209 }
44210
44211 // Handle extract(truncate(x)) for 0'th index.
44212 // TODO: Treat this as a faux shuffle?
44213 // TODO: When can we use this for general indices?
44214 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44215 (SrcVT.getSizeInBits() % 128) == 0) {
44216 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44217 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44218 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44219 Idx);
44220 }
44221
44222 // We can only legally extract other elements from 128-bit vectors and in
44223 // certain circumstances, depending on SSE-level.
44224 // TODO: Investigate float/double extraction if it will be just stored.
44225 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44226 unsigned Idx) {
44227 EVT VecSVT = VecVT.getScalarType();
44228 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44229 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44230 VecSVT == MVT::i64)) {
44231 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44232 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44233 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44234 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44235 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44236 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44237 Idx &= (NumEltsPerLane - 1);
44238 }
44239 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44240 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44242 DAG.getBitcast(VecVT, Vec),
44243 DAG.getIntPtrConstant(Idx, dl));
44244 }
44245 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44246 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44247 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44248 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44249 DAG.getTargetConstant(Idx, dl, MVT::i8));
44250 }
44251 return SDValue();
44252 };
44253
44254 // Resolve the target shuffle inputs and mask.
44255   SmallVector<int, 16> Mask;
44256   SmallVector<SDValue, 2> Ops;
44257   if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44258 return SDValue();
44259
44260 // Shuffle inputs must be the same size as the result.
44261 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44262 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44263 }))
44264 return SDValue();
44265
44266 // Attempt to narrow/widen the shuffle mask to the correct size.
44267 if (Mask.size() != NumSrcElts) {
44268 if ((NumSrcElts % Mask.size()) == 0) {
44269 SmallVector<int, 16> ScaledMask;
44270 int Scale = NumSrcElts / Mask.size();
44271 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44272 Mask = std::move(ScaledMask);
44273 } else if ((Mask.size() % NumSrcElts) == 0) {
44274 // Simplify Mask based on demanded element.
44275 int ExtractIdx = (int)IdxC.getZExtValue();
44276 int Scale = Mask.size() / NumSrcElts;
44277 int Lo = Scale * ExtractIdx;
44278 int Hi = Scale * (ExtractIdx + 1);
44279 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44280 if (i < Lo || Hi <= i)
44281 Mask[i] = SM_SentinelUndef;
44282
44283 SmallVector<int, 16> WidenedMask;
44284 while (Mask.size() > NumSrcElts &&
44285 canWidenShuffleElements(Mask, WidenedMask))
44286 Mask = std::move(WidenedMask);
44287 }
44288 }
44289
44290 // If narrowing/widening failed, see if we can extract+zero-extend.
44291 int ExtractIdx;
44292 EVT ExtractVT;
44293 if (Mask.size() == NumSrcElts) {
44294 ExtractIdx = Mask[IdxC.getZExtValue()];
44295 ExtractVT = SrcVT;
44296 } else {
44297 unsigned Scale = Mask.size() / NumSrcElts;
44298 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44299 return SDValue();
44300 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44301 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44302 return SDValue();
44303 ExtractIdx = Mask[ScaledIdx];
44304 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44305 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44306 assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44307 "Failed to widen vector type");
44308 }
44309
44310 // If the shuffle source element is undef/zero then we can just accept it.
44311 if (ExtractIdx == SM_SentinelUndef)
44312 return DAG.getUNDEF(VT);
44313
44314 if (ExtractIdx == SM_SentinelZero)
44315 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44316 : DAG.getConstant(0, dl, VT);
44317
44318 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44319 ExtractIdx = ExtractIdx % Mask.size();
44320 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44321 return DAG.getZExtOrTrunc(V, dl, VT);
44322
44323 if (N->getOpcode() == ISD::EXTRACT_VECTOR_ELT && ExtractVT == SrcVT)
44324     if (SDValue V = combineExtractFromVectorLoad(
44325             N, SrcVT, peekThroughBitcasts(SrcOp), ExtractIdx, dl, DAG, DCI))
44326 return V;
44327
44328 return SDValue();
44329}
44330
44331/// Extracting a scalar FP value from vector element 0 is free, so extract each
44332/// operand first, then perform the math as a scalar op.
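/// For example, (extractelt (fadd (v4f32 X), (v4f32 Y)), 0) becomes
/// (fadd (extractelt X, 0), (extractelt Y, 0)) (illustrative case of the
/// generic handling in the switch below).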
44333 static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44334                                  const X86Subtarget &Subtarget) {
44335 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44336 SDValue Vec = ExtElt->getOperand(0);
44337 SDValue Index = ExtElt->getOperand(1);
44338 EVT VT = ExtElt->getValueType(0);
44339 EVT VecVT = Vec.getValueType();
44340
44341 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44342 // non-zero element because the shuffle+scalar op will be cheaper?
44343 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44344 return SDValue();
44345
44346 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44347 // extract, the condition code), so deal with those as a special-case.
44348 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44349 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44350 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44351 return SDValue();
44352
44353 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44354 SDLoc DL(ExtElt);
44355 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44356 Vec.getOperand(0), Index);
44357 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44358 Vec.getOperand(1), Index);
44359 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44360 }
44361
44362 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44363 VT != MVT::f64)
44364 return SDValue();
44365
44366 // Vector FP selects don't fit the pattern of FP math ops (because the
44367 // condition has a different type and we have to change the opcode), so deal
44368 // with those here.
44369 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44370 // has i1 elements. If we loosen this we need to convert vector bool to a
44371 // scalar bool.
44372 if (Vec.getOpcode() == ISD::VSELECT &&
44373 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44374 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44375 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44376 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44377 SDLoc DL(ExtElt);
44378     SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44379                                Vec.getOperand(0).getValueType().getScalarType(),
44380                                Vec.getOperand(0), Index);
44381 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44382 Vec.getOperand(1), Index);
44383 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44384 Vec.getOperand(2), Index);
44385 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44386 }
44387
44388 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44389 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44390 // missed load folding and fma+fneg combining.
44391 switch (Vec.getOpcode()) {
44392 case ISD::FMA: // Begin 3 operands
44393 case ISD::FMAD:
44394 case ISD::FADD: // Begin 2 operands
44395 case ISD::FSUB:
44396 case ISD::FMUL:
44397 case ISD::FDIV:
44398 case ISD::FREM:
44399 case ISD::FCOPYSIGN:
44400 case ISD::FMINNUM:
44401 case ISD::FMAXNUM:
44402 case ISD::FMINNUM_IEEE:
44403 case ISD::FMAXNUM_IEEE:
44404 case ISD::FMAXIMUM:
44405 case ISD::FMINIMUM:
44406 case X86ISD::FMAX:
44407 case X86ISD::FMIN:
44408 case ISD::FABS: // Begin 1 operand
44409 case ISD::FSQRT:
44410 case ISD::FRINT:
44411 case ISD::FCEIL:
44412 case ISD::FTRUNC:
44413 case ISD::FNEARBYINT:
44414 case ISD::FROUNDEVEN:
44415 case ISD::FROUND:
44416 case ISD::FFLOOR:
44417 case X86ISD::FRCP:
44418 case X86ISD::FRSQRT: {
44419 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44420 SDLoc DL(ExtElt);
44421     SmallVector<SDValue, 4> ExtOps;
44422     for (SDValue Op : Vec->ops())
44423 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44424 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44425 }
44426 default:
44427 return SDValue();
44428 }
44429 llvm_unreachable("All opcodes should return within switch");
44430}
44431
44432/// Try to convert a vector reduction sequence composed of binops and shuffles
44433/// into horizontal ops.
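/// For example, a v8i16 add reduction can become three HADD stages followed by
/// an extract of element 0, while small vXi8 add reductions are summed with a
/// PSADBW against zero (illustrative summary of the cases handled below).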
44434 static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44435                                      const X86Subtarget &Subtarget) {
44436 assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44437
44438   // We need at least SSE2 to do anything here.
44439 if (!Subtarget.hasSSE2())
44440 return SDValue();
44441
44442 ISD::NodeType Opc;
44443 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44444 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44445 if (!Rdx)
44446 return SDValue();
44447
44448 SDValue Index = ExtElt->getOperand(1);
44449   assert(isNullConstant(Index) &&
44450          "Reduction doesn't end in an extract from index 0");
44451
44452 EVT VT = ExtElt->getValueType(0);
44453 EVT VecVT = Rdx.getValueType();
44454 if (VecVT.getScalarType() != VT)
44455 return SDValue();
44456
44457 SDLoc DL(ExtElt);
44458 unsigned NumElts = VecVT.getVectorNumElements();
44459 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44460
44461 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44462 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44463 if (V.getValueType() == MVT::v4i8) {
44464 if (ZeroExtend && Subtarget.hasSSE41()) {
44465 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44466 DAG.getConstant(0, DL, MVT::v4i32),
44467 DAG.getBitcast(MVT::i32, V),
44468 DAG.getIntPtrConstant(0, DL));
44469 return DAG.getBitcast(MVT::v16i8, V);
44470 }
44471 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44472 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44473 : DAG.getUNDEF(MVT::v4i8));
44474 }
44475 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44476 DAG.getUNDEF(MVT::v8i8));
44477 };
44478
44479 // vXi8 mul reduction - promote to vXi16 mul reduction.
44480 if (Opc == ISD::MUL) {
44481 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44482 return SDValue();
44483 if (VecVT.getSizeInBits() >= 128) {
44484 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44485 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44486 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44487 Lo = DAG.getBitcast(WideVT, Lo);
44488 Hi = DAG.getBitcast(WideVT, Hi);
44489 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44490 while (Rdx.getValueSizeInBits() > 128) {
44491 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44492 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44493 }
44494 } else {
44495 Rdx = WidenToV16I8(Rdx, false);
44496 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44497 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44498 }
44499 if (NumElts >= 8)
44500 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44501 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44502 {4, 5, 6, 7, -1, -1, -1, -1}));
44503 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44504 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44505 {2, 3, -1, -1, -1, -1, -1, -1}));
44506 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44507 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44508 {1, -1, -1, -1, -1, -1, -1, -1}));
44509 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44510 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44511 }
44512
44513   // vXi8 add reduction - sub-128-bit vector.
44514 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44515 Rdx = WidenToV16I8(Rdx, true);
44516 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44517 DAG.getConstant(0, DL, MVT::v16i8));
44518 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44519 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44520 }
44521
44522 // Must be a >=128-bit vector with pow2 elements.
44523 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44524 return SDValue();
44525
44526 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44527 if (VT == MVT::i8) {
44528 while (Rdx.getValueSizeInBits() > 128) {
44529 SDValue Lo, Hi;
44530 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44531 VecVT = Lo.getValueType();
44532 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44533 }
44534 assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44535
44536     SDValue Hi = DAG.getVectorShuffle(
44537         MVT::v16i8, DL, Rdx, Rdx,
44538 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44539 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44540 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44541 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44542 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44544 }
44545
44546 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44547 // If the source vector values are 0-255, then we can use PSADBW to
44548 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44549   // TODO: See if it's worth avoiding vXi16/i32 truncations?
44550 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44551 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44552 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44553 Subtarget.hasAVX512())) {
44554 if (Rdx.getValueType() == MVT::v8i16) {
44555 Rdx = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Rdx,
44556 DAG.getUNDEF(MVT::v8i16));
44557 } else {
44558 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44559 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44560 if (ByteVT.getSizeInBits() < 128)
44561 Rdx = WidenToV16I8(Rdx, true);
44562 }
44563
44564 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44565 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44566 ArrayRef<SDValue> Ops) {
44567 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44568 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44569 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44570 };
44571 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44572 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44573
44574 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44575 while (Rdx.getValueSizeInBits() > 128) {
44576 SDValue Lo, Hi;
44577 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44578 VecVT = Lo.getValueType();
44579 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44580 }
44581 assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44582
44583 if (NumElts > 8) {
44584 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44585 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44586 }
44587
44588 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44589 Rdx = DAG.getBitcast(VecVT, Rdx);
44590 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44591 }
44592
44593   // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
44594 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44595 return SDValue();
44596
44597 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44598
44599 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44600 // across the whole vector, so we need an extract + hop preliminary stage.
44601 // This is the only step where the operands of the hop are not the same value.
44602 // TODO: We could extend this to handle 512-bit or even longer vectors.
44603 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44604 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44605 unsigned NumElts = VecVT.getVectorNumElements();
44606 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44607 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44608 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44609 VecVT = Rdx.getValueType();
44610 }
44611 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44612 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44613 return SDValue();
44614
44615 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44616 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44617 for (unsigned i = 0; i != ReductionSteps; ++i)
44618 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44619
44620 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44621}
44622
44623/// Detect vector gather/scatter index generation and convert it from being a
44624/// bunch of shuffles and extracts into a somewhat faster sequence.
44625/// For i686, the best sequence is apparently storing the value and loading
44626/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44627 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44628                                        TargetLowering::DAGCombinerInfo &DCI,
44629                                        const X86Subtarget &Subtarget) {
44630 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44631 return NewOp;
44632
44633 SDValue InputVector = N->getOperand(0);
44634 SDValue EltIdx = N->getOperand(1);
44635 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44636
44637 EVT SrcVT = InputVector.getValueType();
44638 EVT VT = N->getValueType(0);
44639 SDLoc dl(InputVector);
44640 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
44641 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44642 unsigned NumEltBits = VT.getScalarSizeInBits();
44643 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44644
44645 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
44646 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44647
44648 // Integer Constant Folding.
44649 if (CIdx && VT.isInteger()) {
44650 APInt UndefVecElts;
44651 SmallVector<APInt, 16> EltBits;
44652 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
44653 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
44654 EltBits, /*AllowWholeUndefs*/ true,
44655 /*AllowPartialUndefs*/ false)) {
44656 uint64_t Idx = CIdx->getZExtValue();
44657 if (UndefVecElts[Idx])
44658 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
44659 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
44660 }
44661
44662     // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
44663     // Improves lowering of bool masks on Rust, which splits them into a byte array.
44664 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
44665 SDValue Src = peekThroughBitcasts(InputVector);
44666 if (Src.getValueType().getScalarType() == MVT::i1 &&
44667 TLI.isTypeLegal(Src.getValueType())) {
44668 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
44669 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
44670 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
44671 return DAG.getBitcast(VT, Sub);
44672 }
44673 }
44674 }
44675
44676 if (IsPextr) {
44677 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
44678 DCI))
44679 return SDValue(N, 0);
44680
44681 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
44682 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
44683 InputVector.getOpcode() == X86ISD::PINSRW) &&
44684 InputVector.getOperand(2) == EltIdx) {
44685 assert(SrcVT == InputVector.getOperand(0).getValueType() &&
44686 "Vector type mismatch");
44687 SDValue Scl = InputVector.getOperand(1);
44688 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
44689 return DAG.getZExtOrTrunc(Scl, dl, VT);
44690 }
44691
44692 // TODO - Remove this once we can handle the implicit zero-extension of
44693 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
44694 // combineBasicSADPattern.
44695 return SDValue();
44696 }
44697
44698   // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
44699 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
44700 InputVector.getOpcode() == ISD::BITCAST &&
44701 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44702 isNullConstant(EltIdx) && InputVector.hasOneUse())
44703 return DAG.getBitcast(VT, InputVector);
44704
44705 // Detect mmx to i32 conversion through a v2i32 elt extract.
44706 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
44707 InputVector.getOpcode() == ISD::BITCAST &&
44708 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
44709 isNullConstant(EltIdx) && InputVector.hasOneUse())
44710 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
44711 InputVector.getOperand(0));
44712
44713 // Check whether this extract is the root of a sum of absolute differences
44714 // pattern. This has to be done here because we really want it to happen
44715   // pre-legalization.
44716 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
44717 return SAD;
44718
44719 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
44720 return VPDPBUSD;
44721
44722 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
44723 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
44724 return Cmp;
44725
44726 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
44727 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
44728 return MinMax;
44729
44730 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
44731 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
44732 return V;
44733
44734 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
44735 return V;
44736
44737 if (CIdx)
44738     if (SDValue V = combineExtractFromVectorLoad(
44739             N, InputVector.getValueType(), InputVector, CIdx->getZExtValue(),
44740 dl, DAG, DCI))
44741 return V;
44742
44743 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
44744 // and then testing the relevant element.
44745 //
44746 // Note that we only combine extracts on the *same* result number, i.e.
44747 // t0 = merge_values a0, a1, a2, a3
44748 // i1 = extract_vector_elt t0, Constant:i64<2>
44749 // i1 = extract_vector_elt t0, Constant:i64<3>
44750 // but not
44751 // i1 = extract_vector_elt t0:1, Constant:i64<2>
44752 // since the latter would need its own MOVMSK.
44753 if (SrcVT.getScalarType() == MVT::i1) {
44754 bool IsVar = !CIdx;
44755 SmallVector<SDNode *, 16> BoolExtracts;
44756 unsigned ResNo = InputVector.getResNo();
44757 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
44758 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
44759 Use->getOperand(0).getResNo() == ResNo &&
44760 Use->getValueType(0) == MVT::i1) {
44761 BoolExtracts.push_back(Use);
44762 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
44763 return true;
44764 }
44765 return false;
44766 };
44767 // TODO: Can we drop the oneuse check for constant extracts?
44768 if (all_of(InputVector->uses(), IsBoolExtract) &&
44769 (IsVar || BoolExtracts.size() > 1)) {
44770 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
44771 if (SDValue BC =
44772 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
44773 for (SDNode *Use : BoolExtracts) {
44774 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
44775 // Mask = 1 << MaskIdx
44776 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
44777 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
44778 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
44779 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
44780 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
44781 DCI.CombineTo(Use, Res);
44782 }
44783 return SDValue(N, 0);
44784 }
44785 }
44786 }
44787
44788 // Attempt to fold extract(trunc(x),c) -> trunc(extract(x,c)).
44789 if (CIdx && InputVector.getOpcode() == ISD::TRUNCATE) {
44790 SDValue TruncSrc = InputVector.getOperand(0);
44791 EVT TruncSVT = TruncSrc.getValueType().getScalarType();
44792 if (DCI.isBeforeLegalize() && TLI.isTypeLegal(TruncSVT)) {
44793 SDValue NewExt =
44794 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TruncSVT, TruncSrc, EltIdx);
44795 return DAG.getAnyExtOrTrunc(NewExt, dl, VT);
44796 }
44797 }
44798
44799 return SDValue();
44800}
44801
44802// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
44803// This is more or less the reverse of combineBitcastvxi1.
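// For example, (v8i16 (zext (v8i1 (bitcast (i8 X))))) is rebuilt by
// broadcasting X to all 8 lanes, ANDing lane i with (1 << i), comparing equal
// against that same bit mask to get 0/-1 per lane, and finally shifting right
// to produce 0/1 (illustrative walk-through of the steps below).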
44804 static SDValue combineToExtendBoolVectorInReg(
44805     unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
44806 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
44807 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
44808 Opcode != ISD::ANY_EXTEND)
44809 return SDValue();
44810 if (!DCI.isBeforeLegalizeOps())
44811 return SDValue();
44812 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
44813 return SDValue();
44814
44815 EVT SVT = VT.getScalarType();
44816 EVT InSVT = N0.getValueType().getScalarType();
44817 unsigned EltSizeInBits = SVT.getSizeInBits();
44818
44819 // Input type must be extending a bool vector (bit-casted from a scalar
44820 // integer) to legal integer types.
44821 if (!VT.isVector())
44822 return SDValue();
44823 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
44824 return SDValue();
44825 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
44826 return SDValue();
44827
44828 SDValue N00 = N0.getOperand(0);
44829 EVT SclVT = N00.getValueType();
44830 if (!SclVT.isScalarInteger())
44831 return SDValue();
44832
44833 SDValue Vec;
44834 SmallVector<int> ShuffleMask;
44835 unsigned NumElts = VT.getVectorNumElements();
44836 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
44837
44838 // Broadcast the scalar integer to the vector elements.
44839 if (NumElts > EltSizeInBits) {
44840 // If the scalar integer is greater than the vector element size, then we
44841 // must split it down into sub-sections for broadcasting. For example:
44842 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
44843 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
44844 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
44845 unsigned Scale = NumElts / EltSizeInBits;
44846 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
44847 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44848 Vec = DAG.getBitcast(VT, Vec);
44849
44850 for (unsigned i = 0; i != Scale; ++i)
44851 ShuffleMask.append(EltSizeInBits, i);
44852 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44853 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
44854 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
44855 // If we have register broadcast instructions, use the scalar size as the
44856 // element type for the shuffle. Then cast to the wider element type. The
44857 // widened bits won't be used, and this might allow the use of a broadcast
44858 // load.
44859 assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
44860 unsigned Scale = EltSizeInBits / NumElts;
44861 EVT BroadcastVT =
44862 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
44863 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
44864 ShuffleMask.append(NumElts * Scale, 0);
44865 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
44866 Vec = DAG.getBitcast(VT, Vec);
44867 } else {
44868     // For a smaller scalar integer, we can simply any-extend it to the vector
44869 // element size (we don't care about the upper bits) and broadcast it to all
44870 // elements.
44871 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
44872 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
44873 ShuffleMask.append(NumElts, 0);
44874 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
44875 }
44876
44877 // Now, mask the relevant bit in each element.
44878   SmallVector<SDValue, 8> Bits;
44879   for (unsigned i = 0; i != NumElts; ++i) {
44880 int BitIdx = (i % EltSizeInBits);
44881 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
44882 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
44883 }
44884 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
44885 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
44886
44887 // Compare against the bitmask and extend the result.
44888 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44889 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
44890 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
44891
44892 // For SEXT, this is now done, otherwise shift the result down for
44893 // zero-extension.
44894 if (Opcode == ISD::SIGN_EXTEND)
44895 return Vec;
44896 return DAG.getNode(ISD::SRL, DL, VT, Vec,
44897 DAG.getConstant(EltSizeInBits - 1, DL, VT));
44898}
44899
44900/// If a vector select has an operand that is -1 or 0, try to simplify the
44901/// select to a bitwise logic operation.
44902/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
44903static SDValue
44904 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
44905                                  TargetLowering::DAGCombinerInfo &DCI,
44906                                  const X86Subtarget &Subtarget) {
44907 SDValue Cond = N->getOperand(0);
44908 SDValue LHS = N->getOperand(1);
44909 SDValue RHS = N->getOperand(2);
44910 EVT VT = LHS.getValueType();
44911 EVT CondVT = Cond.getValueType();
44912 SDLoc DL(N);
44913 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44914
44915 if (N->getOpcode() != ISD::VSELECT)
44916 return SDValue();
44917
44918 assert(CondVT.isVector() && "Vector select expects a vector selector!");
44919
44920 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
44921 // TODO: Can we assert that both operands are not zeros (because that should
44922 // get simplified at node creation time)?
44923 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
44924 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
44925
44926 // If both inputs are 0/undef, create a complete zero vector.
44927 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
44928 if (TValIsAllZeros && FValIsAllZeros) {
44929 if (VT.isFloatingPoint())
44930 return DAG.getConstantFP(0.0, DL, VT);
44931 return DAG.getConstant(0, DL, VT);
44932 }
44933
44934 // To use the condition operand as a bitwise mask, it must have elements that
44935   // are the same size as the select elements. I.e., the condition operand must
44936 // have already been promoted from the IR select condition type <N x i1>.
44937 // Don't check if the types themselves are equal because that excludes
44938 // vector floating-point selects.
44939 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
44940 return SDValue();
44941
44942 // Try to invert the condition if true value is not all 1s and false value is
44943 // not all 0s. Only do this if the condition has one use.
44944 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
44945 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
44946 // Check if the selector will be produced by CMPP*/PCMP*.
44947 Cond.getOpcode() == ISD::SETCC &&
44948 // Check if SETCC has already been promoted.
44949 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
44950 CondVT) {
44951 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
44952
44953 if (TValIsAllZeros || FValIsAllOnes) {
44954 SDValue CC = Cond.getOperand(2);
44955       ISD::CondCode NewCC = ISD::getSetCCInverse(
44956           cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
44957 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
44958 NewCC);
44959 std::swap(LHS, RHS);
44960 TValIsAllOnes = FValIsAllOnes;
44961 FValIsAllZeros = TValIsAllZeros;
44962 }
44963 }
44964
44965 // Cond value must be 'sign splat' to be converted to a logical op.
44966 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
44967 return SDValue();
44968
44969 // vselect Cond, 111..., 000... -> Cond
44970 if (TValIsAllOnes && FValIsAllZeros)
44971 return DAG.getBitcast(VT, Cond);
44972
44973 if (!TLI.isTypeLegal(CondVT))
44974 return SDValue();
44975
44976 // vselect Cond, 111..., X -> or Cond, X
44977 if (TValIsAllOnes) {
44978 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44979 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
44980 return DAG.getBitcast(VT, Or);
44981 }
44982
44983 // vselect Cond, X, 000... -> and Cond, X
44984 if (FValIsAllZeros) {
44985 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
44986 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
44987 return DAG.getBitcast(VT, And);
44988 }
44989
44990 // vselect Cond, 000..., X -> andn Cond, X
44991 if (TValIsAllZeros) {
44992 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
44993 SDValue AndN;
44994     // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used.
44995 if (CondVT.getScalarType() == MVT::i1)
44996 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
44997 CastRHS);
44998 else
44999 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45000 return DAG.getBitcast(VT, AndN);
45001 }
45002
45003 return SDValue();
45004}
45005
45006/// If both arms of a vector select are concatenated vectors, split the select,
45007/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45008/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45009/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45010 static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45011                                   const X86Subtarget &Subtarget) {
45012 unsigned Opcode = N->getOpcode();
45013 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45014 return SDValue();
45015
45016 // TODO: Split 512-bit vectors too?
45017 EVT VT = N->getValueType(0);
45018 if (!VT.is256BitVector())
45019 return SDValue();
45020
45021 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45022 SDValue Cond = N->getOperand(0);
45023 SDValue TVal = N->getOperand(1);
45024 SDValue FVal = N->getOperand(2);
45025 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45026 !isFreeToSplitVector(TVal.getNode(), DAG) ||
45027 !isFreeToSplitVector(FVal.getNode(), DAG))
45028 return SDValue();
45029
45030 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45031 ArrayRef<SDValue> Ops) {
45032 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45033 };
45034 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45035 makeBlend, /*CheckBWI*/ false);
45036}
45037
45038 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45039   SDValue Cond = N->getOperand(0);
45040 SDValue LHS = N->getOperand(1);
45041 SDValue RHS = N->getOperand(2);
45042 SDLoc DL(N);
45043
45044 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45045 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45046 if (!TrueC || !FalseC)
45047 return SDValue();
45048
45049 // Don't do this for crazy integer types.
45050 EVT VT = N->getValueType(0);
45051 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45052 return SDValue();
45053
45054 // We're going to use the condition bit in math or logic ops. We could allow
45055 // this with a wider condition value (post-legalization it becomes an i8),
45056 // but if nothing is creating selects that late, it doesn't matter.
45057 if (Cond.getValueType() != MVT::i1)
45058 return SDValue();
45059
45060 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45061 // 3, 5, or 9 with i32/i64, so those get transformed too.
45062 // TODO: For constants that overflow or do not differ by power-of-2 or small
45063 // multiplier, convert to 'and' + 'add'.
45064 const APInt &TrueVal = TrueC->getAPIntValue();
45065 const APInt &FalseVal = FalseC->getAPIntValue();
45066
45067 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45068 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45069 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45070 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45071 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45072 return SDValue();
45073 }
45074
45075 bool OV;
45076 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45077 if (OV)
45078 return SDValue();
45079
45080 APInt AbsDiff = Diff.abs();
45081 if (AbsDiff.isPowerOf2() ||
45082 ((VT == MVT::i32 || VT == MVT::i64) &&
45083 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45084
45085 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45086 // of the condition can usually be folded into a compare predicate, but even
45087 // without that, the sequence should be cheaper than a CMOV alternative.
45088 if (TrueVal.slt(FalseVal)) {
45089 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45090 std::swap(TrueC, FalseC);
45091 }
45092
45093 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
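 // Illustrative worked example (not from the upstream source): for
 // "select i1 %c, i32 7, i32 3" the difference is 4, so this produces
 // (zext %c << 2) + 3 - a shift and an add instead of a CMOV.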
45094 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45095
45096 // Multiply condition by the difference if non-one.
45097 if (!AbsDiff.isOne())
45098 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45099
45100 // Add the base if non-zero.
45101 if (!FalseC->isZero())
45102 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45103
45104 return R;
45105 }
45106
45107 return SDValue();
45108}
45109
45110/// If this is a *dynamic* select (non-constant condition) and we can match
45111/// this node with one of the variable blend instructions, restructure the
45112/// condition so that blends can use the high (sign) bit of each element.
45113/// This function will also call SimplifyDemandedBits on already created
45114/// BLENDV to perform additional simplifications.
45115static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45116 TargetLowering::DAGCombinerInfo &DCI,
45117 const X86Subtarget &Subtarget) {
45118 SDValue Cond = N->getOperand(0);
45119 if ((N->getOpcode() != ISD::VSELECT &&
45120 N->getOpcode() != X86ISD::BLENDV) ||
45121 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45122 return SDValue();
45123
45124 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45125 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45126 EVT VT = N->getValueType(0);
45127
45128 // We can only handle the cases where VSELECT is directly legal on the
45129 // subtarget. We custom lower VSELECT nodes with constant conditions and
45130 // this makes it hard to see whether a dynamic VSELECT will correctly
45131 // lower, so we both check the operation's status and explicitly handle the
45132 // cases where a *dynamic* blend will fail even though a constant-condition
45133 // blend could be custom lowered.
45134 // FIXME: We should find a better way to handle this class of problems.
45135 // Potentially, we should combine constant-condition vselect nodes
45136 // pre-legalization into shuffles and not mark as many types as custom
45137 // lowered.
45138 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45139 return SDValue();
45140 // FIXME: We don't support i16-element blends currently. We could and
45141 // should support them by making *all* the bits in the condition be set
45142 // rather than just the high bit and using an i8-element blend.
45143 if (VT.getVectorElementType() == MVT::i16)
45144 return SDValue();
45145 // Dynamic blending was only available from SSE4.1 onward.
45146 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45147 return SDValue();
45148 // Byte blends are only available in AVX2
45149 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45150 return SDValue();
45151 // There are no 512-bit blend instructions that use sign bits.
45152 if (VT.is512BitVector())
45153 return SDValue();
45154
45155 // Don't optimize before the condition has been transformed to a legal type
45156 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45157 if (BitWidth < 8 || BitWidth > 64)
45158 return SDValue();
45159
45160 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45161 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45162 UI != UE; ++UI)
45163 if ((UI->getOpcode() != ISD::VSELECT &&
45164 UI->getOpcode() != X86ISD::BLENDV) ||
45165 UI.getOperandNo() != 0)
45166 return false;
45167
45168 return true;
45169 };
45170
45171 APInt DemandedBits(APInt::getSignMask(BitWidth));
45172
45173 if (OnlyUsedAsSelectCond(Cond)) {
45174 KnownBits Known;
45175 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45176 !DCI.isBeforeLegalizeOps());
45177 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45178 return SDValue();
45179
45180 // If we changed the computation somewhere in the DAG, this change will
45181 // affect all users of Cond. Update all the nodes so that we do not use
45182 // the generic VSELECT anymore. Otherwise, we may perform wrong
45183 // optimizations as we messed with the actual expectation for the vector
45184 // boolean values.
45185 for (SDNode *U : Cond->uses()) {
45186 if (U->getOpcode() == X86ISD::BLENDV)
45187 continue;
45188
45189 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45190 Cond, U->getOperand(1), U->getOperand(2));
45191 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45192 DCI.AddToWorklist(U);
45193 }
45194 DCI.CommitTargetLoweringOpt(TLO);
45195 return SDValue(N, 0);
45196 }
45197
45198 // Otherwise we can still at least try to simplify multiple use bits.
45199 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45200 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45201 N->getOperand(1), N->getOperand(2));
45202
45203 return SDValue();
45204}
45205
45206// Try to match:
45207// (or (and M, (sub 0, X)), (pandn M, X))
45208// which is a special case of:
45209// (select M, (sub 0, X), X)
45210// Per:
45211// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45212// We know that, if fNegate is 0 or 1:
45213// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45214//
45215// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45216// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45217// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45218// This lets us transform our vselect to:
45219// (add (xor X, M), (and M, 1))
45220// And further to:
45221// (sub (xor X, M), M)
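// Illustrative check (not in the upstream source): with X = 5 and M = all-ones
// (-1), (X ^ M) - M = (-6) - (-1) = -5; with M = 0, (X ^ 0) - 0 = 5, so the
// mask really does select between -X and X.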
45222static SDValue combineLogicBlendIntoConditionalNegate(
45223 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45224 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45225 EVT MaskVT = Mask.getValueType();
45226 assert(MaskVT.isInteger() &&
45227 DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45228 "Mask must be zero/all-bits");
45229
45230 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45231 return SDValue();
45232 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
45233 return SDValue();
45234
45235 auto IsNegV = [](SDNode *N, SDValue V) {
45236 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45237 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45238 };
45239
45240 SDValue V;
45241 if (IsNegV(Y.getNode(), X))
45242 V = X;
45243 else if (IsNegV(X.getNode(), Y))
45244 V = Y;
45245 else
45246 return SDValue();
45247
45248 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45249 SDValue SubOp2 = Mask;
45250
45251 // If the negate was on the false side of the select, then
45252 // the operands of the SUB need to be swapped. PR 27251.
45253 // This is because the pattern being matched above is
45254 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45255 // but if the pattern matched was
45256 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45257 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45258 // pattern also needs to be a negation of the replacement pattern above.
45259 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45260 // sub accomplishes the negation of the replacement pattern.
45261 if (V == Y)
45262 std::swap(SubOp1, SubOp2);
45263
45264 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45265 return DAG.getBitcast(VT, Res);
45266}
45267
45268static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG,
45269 const X86Subtarget &Subtarget) {
45270 if (!Subtarget.hasAVX512())
45271 return SDValue();
45272 if (N->getOpcode() != ISD::VSELECT)
45273 return SDValue();
45274
45275 SDLoc DL(N);
45276 SDValue Cond = N->getOperand(0);
45277 SDValue LHS = N->getOperand(1);
45278 SDValue RHS = N->getOperand(2);
45279
45280 if (canCombineAsMaskOperation(LHS, Subtarget))
45281 return SDValue();
45282
45283 if (!canCombineAsMaskOperation(RHS, Subtarget))
45284 return SDValue();
45285
45286 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
45287 return SDValue();
45288
45289 // Commute LHS and RHS to create opportunity to select mask instruction.
45290 // (vselect M, L, R) -> (vselect ~M, R, L)
45291 ISD::CondCode NewCC =
45292 ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
45293 Cond.getOperand(0).getValueType());
45294 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0),
45295 Cond.getOperand(1), NewCC);
45296 return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS);
45297}
45298
45299/// Do target-specific dag combines on SELECT and VSELECT nodes.
45300static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45301 TargetLowering::DAGCombinerInfo &DCI,
45302 const X86Subtarget &Subtarget) {
45303 SDLoc DL(N);
45304 SDValue Cond = N->getOperand(0);
45305 SDValue LHS = N->getOperand(1);
45306 SDValue RHS = N->getOperand(2);
45307
45308 // Try simplification again because we use this function to optimize
45309 // BLENDV nodes that are not handled by the generic combiner.
45310 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45311 return V;
45312
45313 // When avx512 is available the lhs operand of select instruction can be
45314 // folded with mask instruction, while the rhs operand can't. Commute the
45315 // lhs and rhs of the select instruction to create the opportunity of
45316 // folding.
45317 if (SDValue V = commuteSelect(N, DAG, Subtarget))
45318 return V;
45319
45320 EVT VT = LHS.getValueType();
45321 EVT CondVT = Cond.getValueType();
45322 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45323 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45324
45325 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45326 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45327 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45328 if (CondVT.isVector() && CondVT.isInteger() &&
45329 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45330 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45331 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45332 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45333 DL, DAG, Subtarget))
45334 return V;
45335
45336 // Convert vselects with constant condition into shuffles.
45337 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45338 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45339 SmallVector<int, 64> Mask;
45340 if (createShuffleMaskFromVSELECT(Mask, Cond,
45341 N->getOpcode() == X86ISD::BLENDV))
45342 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45343 }
45344
45345 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45346 // by forcing the unselected elements to zero.
45347 // TODO: Can we handle more shuffles with this?
45348 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45349 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45350 LHS.hasOneUse() && RHS.hasOneUse()) {
45351 MVT SimpleVT = VT.getSimpleVT();
45352 SmallVector<SDValue, 1> LHSOps, RHSOps;
45353 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45354 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45355 getTargetShuffleMask(LHS, true, LHSOps, LHSMask) &&
45356 getTargetShuffleMask(RHS, true, RHSOps, RHSMask)) {
45357 int NumElts = VT.getVectorNumElements();
45358 for (int i = 0; i != NumElts; ++i) {
45359 // getConstVector sets negative shuffle mask values as undef, so ensure
45360 // we hardcode SM_SentinelZero values to zero (0x80).
45361 if (CondMask[i] < NumElts) {
45362 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45363 RHSMask[i] = 0x80;
45364 } else {
45365 LHSMask[i] = 0x80;
45366 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45367 }
45368 }
45369 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45370 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45371 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45372 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45373 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45374 }
45375 }
45376
45377 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45378 // instructions match the semantics of the common C idiom x<y?x:y but not
45379 // x<=y?x:y, because of how they handle negative zero (which can be
45380 // ignored in unsafe-math mode).
45381 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
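 // Illustrative note (not in the upstream source): for "select (setult x, y), x, y",
 // an unordered compare with x = NaN picks x, whereas MINPS/MINSS return the
 // second source operand on an unordered compare, which is why the NaN and
 // signed-zero checks below are needed before forming FMIN/FMAX.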
45382 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45383 VT != MVT::f80 && VT != MVT::f128 && !isSoftF16(VT, Subtarget) &&
45384 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45385 (Subtarget.hasSSE2() ||
45386 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45387 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45388
45389 unsigned Opcode = 0;
45390 // Check for x CC y ? x : y.
45391 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45392 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45393 switch (CC) {
45394 default: break;
45395 case ISD::SETULT:
45396 // Converting this to a min would handle NaNs incorrectly, and swapping
45397 // the operands would cause it to handle comparisons between positive
45398 // and negative zero incorrectly.
45399 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45400 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45401 !(DAG.isKnownNeverZeroFloat(LHS) ||
45402 DAG.isKnownNeverZeroFloat(RHS)))
45403 break;
45404 std::swap(LHS, RHS);
45405 }
45406 Opcode = X86ISD::FMIN;
45407 break;
45408 case ISD::SETOLE:
45409 // Converting this to a min would handle comparisons between positive
45410 // and negative zero incorrectly.
45411 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45412 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45413 break;
45414 Opcode = X86ISD::FMIN;
45415 break;
45416 case ISD::SETULE:
45417 // Converting this to a min would handle both negative zeros and NaNs
45418 // incorrectly, but we can swap the operands to fix both.
45419 std::swap(LHS, RHS);
45420 [[fallthrough]];
45421 case ISD::SETOLT:
45422 case ISD::SETLT:
45423 case ISD::SETLE:
45424 Opcode = X86ISD::FMIN;
45425 break;
45426
45427 case ISD::SETOGE:
45428 // Converting this to a max would handle comparisons between positive
45429 // and negative zero incorrectly.
45430 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45431 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45432 break;
45433 Opcode = X86ISD::FMAX;
45434 break;
45435 case ISD::SETUGT:
45436 // Converting this to a max would handle NaNs incorrectly, and swapping
45437 // the operands would cause it to handle comparisons between positive
45438 // and negative zero incorrectly.
45439 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45440 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45441 !(DAG.isKnownNeverZeroFloat(LHS) ||
45442 DAG.isKnownNeverZeroFloat(RHS)))
45443 break;
45444 std::swap(LHS, RHS);
45445 }
45446 Opcode = X86ISD::FMAX;
45447 break;
45448 case ISD::SETUGE:
45449 // Converting this to a max would handle both negative zeros and NaNs
45450 // incorrectly, but we can swap the operands to fix both.
45451 std::swap(LHS, RHS);
45452 [[fallthrough]];
45453 case ISD::SETOGT:
45454 case ISD::SETGT:
45455 case ISD::SETGE:
45456 Opcode = X86ISD::FMAX;
45457 break;
45458 }
45459 // Check for x CC y ? y : x -- a min/max with reversed arms.
45460 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45461 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45462 switch (CC) {
45463 default: break;
45464 case ISD::SETOGE:
45465 // Converting this to a min would handle comparisons between positive
45466 // and negative zero incorrectly, and swapping the operands would
45467 // cause it to handle NaNs incorrectly.
45468 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45469 !(DAG.isKnownNeverZeroFloat(LHS) ||
45470 DAG.isKnownNeverZeroFloat(RHS))) {
45471 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45472 break;
45473 std::swap(LHS, RHS);
45474 }
45475 Opcode = X86ISD::FMIN;
45476 break;
45477 case ISD::SETUGT:
45478 // Converting this to a min would handle NaNs incorrectly.
45479 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45480 break;
45481 Opcode = X86ISD::FMIN;
45482 break;
45483 case ISD::SETUGE:
45484 // Converting this to a min would handle both negative zeros and NaNs
45485 // incorrectly, but we can swap the operands to fix both.
45486 std::swap(LHS, RHS);
45487 [[fallthrough]];
45488 case ISD::SETOGT:
45489 case ISD::SETGT:
45490 case ISD::SETGE:
45491 Opcode = X86ISD::FMIN;
45492 break;
45493
45494 case ISD::SETULT:
45495 // Converting this to a max would handle NaNs incorrectly.
45496 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45497 break;
45498 Opcode = X86ISD::FMAX;
45499 break;
45500 case ISD::SETOLE:
45501 // Converting this to a max would handle comparisons between positive
45502 // and negative zero incorrectly, and swapping the operands would
45503 // cause it to handle NaNs incorrectly.
45504 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45505 !DAG.isKnownNeverZeroFloat(LHS) &&
45506 !DAG.isKnownNeverZeroFloat(RHS)) {
45507 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45508 break;
45509 std::swap(LHS, RHS);
45510 }
45511 Opcode = X86ISD::FMAX;
45512 break;
45513 case ISD::SETULE:
45514 // Converting this to a max would handle both negative zeros and NaNs
45515 // incorrectly, but we can swap the operands to fix both.
45516 std::swap(LHS, RHS);
45517 [[fallthrough]];
45518 case ISD::SETOLT:
45519 case ISD::SETLT:
45520 case ISD::SETLE:
45521 Opcode = X86ISD::FMAX;
45522 break;
45523 }
45524 }
45525
45526 if (Opcode)
45527 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45528 }
45529
45530 // Some mask scalar intrinsics rely on checking if only one bit is set
45531 // and implement it in C code like this:
45532 // A[0] = (U & 1) ? A[0] : W[0];
45533 // This creates some redundant instructions that break pattern matching.
45534 // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
45535 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45536 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45537 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45538 SDValue AndNode = Cond.getOperand(0);
45539 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45540 isNullConstant(Cond.getOperand(1)) &&
45541 isOneConstant(AndNode.getOperand(1))) {
45542 // LHS and RHS swapped due to
45543 // setcc outputting 1 when AND resulted in 0 and vice versa.
45544 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45545 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45546 }
45547 }
45548
45549 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45550 // lowering on KNL. In this case we convert it to
45551 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
45552 // The same applies to all vectors of i8 and i16 elements when BWI is absent.
45553 // Make sure we extend these even before type legalization gets a chance to
45554 // split wide vectors.
45555 // Since SKX, these selects have a proper lowering.
45556 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45557 CondVT.getVectorElementType() == MVT::i1 &&
45558 (VT.getVectorElementType() == MVT::i8 ||
45559 VT.getVectorElementType() == MVT::i16)) {
45560 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45561 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45562 }
45563
45564 // AVX512 - Extend select with zero to merge with target shuffle.
45565 // select(mask, extract_subvector(shuffle(x)), zero) -->
45566 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45567 // TODO - support non target shuffles as well.
45568 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45569 CondVT.getVectorElementType() == MVT::i1) {
45570 auto SelectableOp = [&TLI](SDValue Op) {
45571 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45572 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45573 isNullConstant(Op.getOperand(1)) &&
45574 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45575 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45576 };
45577
45578 bool SelectableLHS = SelectableOp(LHS);
45579 bool SelectableRHS = SelectableOp(RHS);
45580 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45581 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45582
45583 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45584 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45585 : RHS.getOperand(0).getValueType();
45586 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45587 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45588 VT.getSizeInBits());
45589 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45590 VT.getSizeInBits());
45591 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45592 DAG.getUNDEF(SrcCondVT), Cond,
45593 DAG.getIntPtrConstant(0, DL));
45594 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45595 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45596 }
45597 }
45598
45599 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45600 return V;
45601
45602 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45603 Cond.hasOneUse()) {
45604 EVT CondVT = Cond.getValueType();
45605 SDValue Cond0 = Cond.getOperand(0);
45606 SDValue Cond1 = Cond.getOperand(1);
45607 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45608
45609 // Canonicalize min/max:
45610 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45611 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45612 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45613 // the need for an extra compare against zero. e.g.
45614 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
45615 // subl %esi, %edi
45616 // testl %edi, %edi
45617 // movl $0, %eax
45618 // cmovgl %edi, %eax
45619 // =>
45620 // xorl %eax, %eax
45621 // subl %esi, %edi
45622 // cmovsl %eax, %edi
45623 //
45624 // We can also canonicalize
45625 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45626 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45627 // This allows the use of a test instruction for the compare.
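 // Illustrative note (not in the upstream source): for "(x u> 1) ? x : 1" this
 // roughly turns "cmpl $1, %edi; cmova ..." into "testl %edi, %edi; cmovne ..."
 // once the predicate is relaxed to "x u>= 1", i.e. "x != 0".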
45628 if (LHS == Cond0 && RHS == Cond1) {
45629 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45630 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45631 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45632 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45633 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45634 }
45635 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45636 ISD::CondCode NewCC = ISD::SETUGE;
45637 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45638 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45639 }
45640 }
45641
45642 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45643 // fold eq + gt/lt nested selects into ge/le selects
45644 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45645 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45646 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45647 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45648 // .. etc ..
45649 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45650 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45651 SDValue InnerSetCC = RHS.getOperand(0);
45652 ISD::CondCode InnerCC =
45653 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45654 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45655 Cond0 == InnerSetCC.getOperand(0) &&
45656 Cond1 == InnerSetCC.getOperand(1)) {
45657 ISD::CondCode NewCC;
45658 switch (CC == ISD::SETEQ ? InnerCC : CC) {
45659 // clang-format off
45660 case ISD::SETGT: NewCC = ISD::SETGE; break;
45661 case ISD::SETLT: NewCC = ISD::SETLE; break;
45662 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
45663 case ISD::SETULT: NewCC = ISD::SETULE; break;
45664 default: NewCC = ISD::SETCC_INVALID; break;
45665 // clang-format on
45666 }
45667 if (NewCC != ISD::SETCC_INVALID) {
45668 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
45669 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
45670 }
45671 }
45672 }
45673 }
45674
45675 // Check if the first operand is all zeros and Cond type is vXi1.
45676 // If this is an avx512 target we can improve the use of zero masking by
45677 // swapping the operands and inverting the condition.
45678 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
45679 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
45680 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
45681 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
45682 // Invert the cond to not(cond) : xor(op,allones)=not(op)
45683 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
45684 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
45685 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
45686 }
45687
45688 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
45689 // get split by legalization.
45690 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
45691 CondVT.getVectorElementType() == MVT::i1 &&
45692 TLI.isTypeLegal(VT.getScalarType())) {
45693 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
45694 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
45695 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
45696 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
45697 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
45698 }
45699 }
45700
45701 // Early exit check
45702 if (!TLI.isTypeLegal(VT) || isSoftF16(VT, Subtarget))
45703 return SDValue();
45704
45705 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
45706 return V;
45707
45708 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
45709 return V;
45710
45711 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
45712 return V;
45713
45714 // select(~Cond, X, Y) -> select(Cond, Y, X)
45715 if (CondVT.getScalarType() != MVT::i1) {
45716 if (SDValue CondNot = IsNOT(Cond, DAG))
45717 return DAG.getNode(N->getOpcode(), DL, VT,
45718 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
45719
45720 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
45721 // signbit.
45722 if (Cond.getOpcode() == X86ISD::PCMPGT &&
45723 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
45724 Cond.hasOneUse()) {
45725 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
45726 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
45727 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
45728 }
45729 }
45730
45731 // Try to optimize vXi1 selects if both operands are either all constants or
45732 // bitcasts from scalar integer type. In that case we can convert the operands
45733 // to integer and use an integer select which will be converted to a CMOV.
45734 // We need to take a little bit of care to avoid creating an i64 type after
45735 // type legalization.
45736 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
45737 VT.getVectorElementType() == MVT::i1 &&
45738 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
45739 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
45740 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
45741 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
45742 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
45743
45744 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
45745 LHS.getOperand(0).getValueType() == IntVT)) &&
45746 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
45747 RHS.getOperand(0).getValueType() == IntVT))) {
45748 if (LHSIsConst)
45749 LHS = combinevXi1ConstantToInteger(LHS, DAG);
45750 else
45751 LHS = LHS.getOperand(0);
45752
45753 if (RHSIsConst)
45754 RHS = combinevXi1ConstantToInteger(RHS, DAG);
45755 else
45756 RHS = RHS.getOperand(0);
45757
45758 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
45759 return DAG.getBitcast(VT, Select);
45760 }
45761 }
45762 }
45763
45764 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
45765 // single bits, then invert the predicate and swap the select operands.
45766 // This can lower using a vector shift bit-hack rather than mask and compare.
45767 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
45768 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
45769 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
45770 Cond.getOperand(0).getOpcode() == ISD::AND &&
45771 isNullOrNullSplat(Cond.getOperand(1)) &&
45772 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
45773 Cond.getOperand(0).getValueType() == VT) {
45774 // The 'and' mask must be composed of power-of-2 constants.
45775 SDValue And = Cond.getOperand(0);
45776 auto *C = isConstOrConstSplat(And.getOperand(1));
45777 if (C && C->getAPIntValue().isPowerOf2()) {
45778 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
45779 SDValue NotCond =
45780 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
45781 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
45782 }
45783
45784 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
45785 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
45786 // 16-bit lacks a proper blendv.
45787 unsigned EltBitWidth = VT.getScalarSizeInBits();
45788 bool CanShiftBlend =
45789 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
45790 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
45791 (Subtarget.hasXOP()));
45792 if (CanShiftBlend &&
45793 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
45794 return C->getAPIntValue().isPowerOf2();
45795 })) {
45796 // Create a left-shift constant to get the mask bits over to the sign-bit.
45797 SDValue Mask = And.getOperand(1);
45798 SmallVector<int, 32> ShlVals;
45799 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
45800 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
45801 ShlVals.push_back(EltBitWidth - 1 -
45802 MaskVal->getAPIntValue().exactLogBase2());
45803 }
45804 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
45805 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
45806 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
45807 SDValue NewCond =
45808 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
45809 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
45810 }
45811 }
45812
45813 return SDValue();
45814}
45815
45816/// Combine:
45817/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
45818/// to:
45819/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
45820/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
45821/// Note that this is only legal for some op/cc combinations.
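/// Illustrative example (not in the upstream source): for C code like
/// "if (atomic_fetch_add(&x, 1) < 0)", the compare against the returned value
/// can be dropped and the EFLAGS of "lock addl $1, (mem)" reused with COND_LE,
/// since old < 0 iff old + 1 <= 0 barring overflow (the condcode rules below
/// account for overflow).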
45822static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
45823 SelectionDAG &DAG,
45824 const X86Subtarget &Subtarget) {
45825 // This combine only operates on CMP-like nodes.
45826 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45827 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45828 return SDValue();
45829
45830 // Can't replace the cmp if it has more uses than the one we're looking at.
45831 // FIXME: We would like to be able to handle this, but would need to make sure
45832 // all uses were updated.
45833 if (!Cmp.hasOneUse())
45834 return SDValue();
45835
45836 // This only applies to variations of the common case:
45837 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
45838 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
45839 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
45840 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
45841 // Using the proper condcodes (see below), overflow is checked for.
45842
45843 // FIXME: We can generalize both constraints:
45844 // - XOR/OR/AND (if they were made to survive AtomicExpand)
45845 // - LHS != 1
45846 // if the result is compared.
45847
45848 SDValue CmpLHS = Cmp.getOperand(0);
45849 SDValue CmpRHS = Cmp.getOperand(1);
45850 EVT CmpVT = CmpLHS.getValueType();
45851
45852 if (!CmpLHS.hasOneUse())
45853 return SDValue();
45854
45855 unsigned Opc = CmpLHS.getOpcode();
45856 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
45857 return SDValue();
45858
45859 SDValue OpRHS = CmpLHS.getOperand(2);
45860 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
45861 if (!OpRHSC)
45862 return SDValue();
45863
45864 APInt Addend = OpRHSC->getAPIntValue();
45865 if (Opc == ISD::ATOMIC_LOAD_SUB)
45866 Addend = -Addend;
45867
45868 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
45869 if (!CmpRHSC)
45870 return SDValue();
45871
45872 APInt Comparison = CmpRHSC->getAPIntValue();
45873 APInt NegAddend = -Addend;
45874
45875 // See if we can adjust the CC to make the comparison match the negated
45876 // addend.
45877 if (Comparison != NegAddend) {
45878 APInt IncComparison = Comparison + 1;
45879 if (IncComparison == NegAddend) {
45880 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
45881 Comparison = IncComparison;
45882 CC = X86::COND_AE;
45883 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
45884 Comparison = IncComparison;
45885 CC = X86::COND_L;
45886 }
45887 }
45888 APInt DecComparison = Comparison - 1;
45889 if (DecComparison == NegAddend) {
45890 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
45891 Comparison = DecComparison;
45892 CC = X86::COND_A;
45893 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
45894 Comparison = DecComparison;
45895 CC = X86::COND_LE;
45896 }
45897 }
45898 }
45899
45900 // If the addend is the negation of the comparison value, then we can do
45901 // a full comparison by emitting the atomic arithmetic as a locked sub.
45902 if (Comparison == NegAddend) {
45903 // The CC is fine, but we need to rewrite the LHS of the comparison as an
45904 // atomic sub.
45905 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
45906 auto AtomicSub = DAG.getAtomic(
45907 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
45908 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
45909 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
45910 AN->getMemOperand());
45911 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
45912 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45913 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45914 return LockOp;
45915 }
45916
45917 // We can handle comparisons with zero in a number of cases by manipulating
45918 // the CC used.
45919 if (!Comparison.isZero())
45920 return SDValue();
45921
45922 if (CC == X86::COND_S && Addend == 1)
45923 CC = X86::COND_LE;
45924 else if (CC == X86::COND_NS && Addend == 1)
45925 CC = X86::COND_G;
45926 else if (CC == X86::COND_G && Addend == -1)
45927 CC = X86::COND_GE;
45928 else if (CC == X86::COND_LE && Addend == -1)
45929 CC = X86::COND_L;
45930 else
45931 return SDValue();
45932
45933 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
45934 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
45935 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
45936 return LockOp;
45937}
45938
45939// Check whether a boolean test is testing a boolean value generated by
45940// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
45941// code.
45942//
45943// Simplify the following patterns:
45944// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
45945// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
45946// to (Op EFLAGS Cond)
45947//
45948// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
45949// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
45950// to (Op EFLAGS !Cond)
45951//
45952// where Op could be BRCOND or CMOV.
45953//
45954static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
45955 // This combine only operates on CMP-like nodes.
45956 if (!(Cmp.getOpcode() == X86ISD::CMP ||
45957 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
45958 return SDValue();
45959
45960 // Quit if not used as a boolean value.
45961 if (CC != X86::COND_E && CC != X86::COND_NE)
45962 return SDValue();
45963
45964 // Check CMP operands. One of them should be 0 or 1 and the other should be
45965 // an SetCC or extended from it.
45966 SDValue Op1 = Cmp.getOperand(0);
45967 SDValue Op2 = Cmp.getOperand(1);
45968
45969 SDValue SetCC;
45970 const ConstantSDNode* C = nullptr;
45971 bool needOppositeCond = (CC == X86::COND_E);
45972 bool checkAgainstTrue = false; // Is it a comparison against 1?
45973
45974 if ((C = dyn_cast<ConstantSDNode>(Op1)))
45975 SetCC = Op2;
45976 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
45977 SetCC = Op1;
45978 else // Quit if neither operand is a constant.
45979 return SDValue();
45980
45981 if (C->getZExtValue() == 1) {
45982 needOppositeCond = !needOppositeCond;
45983 checkAgainstTrue = true;
45984 } else if (C->getZExtValue() != 0)
45985 // Quit if the constant is neither 0 nor 1.
45986 return SDValue();
45987
45988 bool truncatedToBoolWithAnd = false;
45989 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
45990 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
45991 SetCC.getOpcode() == ISD::TRUNCATE ||
45992 SetCC.getOpcode() == ISD::AND) {
45993 if (SetCC.getOpcode() == ISD::AND) {
45994 int OpIdx = -1;
45995 if (isOneConstant(SetCC.getOperand(0)))
45996 OpIdx = 1;
45997 if (isOneConstant(SetCC.getOperand(1)))
45998 OpIdx = 0;
45999 if (OpIdx < 0)
46000 break;
46001 SetCC = SetCC.getOperand(OpIdx);
46002 truncatedToBoolWithAnd = true;
46003 } else
46004 SetCC = SetCC.getOperand(0);
46005 }
46006
46007 switch (SetCC.getOpcode()) {
46008 case X86ISD::SETCC_CARRY:
46009 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46010 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46011 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46012 // truncated to i1 using 'and'.
46013 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46014 break;
46015 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46016 "Invalid use of SETCC_CARRY!");
46017 [[fallthrough]];
46018 case X86ISD::SETCC:
46019 // Set the condition code or opposite one if necessary.
46020 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46021 if (needOppositeCond)
46022 CC = X86::GetOppositeBranchCondition(CC);
46023 return SetCC.getOperand(1);
46024 case X86ISD::CMOV: {
46025 // Check whether false/true value has canonical one, i.e. 0 or 1.
46026 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46027 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46028 // Quit if true value is not a constant.
46029 if (!TVal)
46030 return SDValue();
46031 // Quit if false value is not a constant.
46032 if (!FVal) {
46033 SDValue Op = SetCC.getOperand(0);
46034 // Skip 'zext' or 'trunc' node.
46035 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46036 Op.getOpcode() == ISD::TRUNCATE)
46037 Op = Op.getOperand(0);
46038 // A special case for rdrand/rdseed, where the value is 0 when the false
46039 // condition is found.
46040 if ((Op.getOpcode() != X86ISD::RDRAND &&
46041 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46042 return SDValue();
46043 }
46044 // Quit if false value is not the constant 0 or 1.
46045 bool FValIsFalse = true;
46046 if (FVal && FVal->getZExtValue() != 0) {
46047 if (FVal->getZExtValue() != 1)
46048 return SDValue();
46049 // If FVal is 1, opposite cond is needed.
46050 needOppositeCond = !needOppositeCond;
46051 FValIsFalse = false;
46052 }
46053 // Quit if TVal is not the constant opposite of FVal.
46054 if (FValIsFalse && TVal->getZExtValue() != 1)
46055 return SDValue();
46056 if (!FValIsFalse && TVal->getZExtValue() != 0)
46057 return SDValue();
46058 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46059 if (needOppositeCond)
46060 CC = X86::GetOppositeBranchCondition(CC);
46061 return SetCC.getOperand(3);
46062 }
46063 }
46064
46065 return SDValue();
46066}
46067
46068/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46069/// Match:
46070/// (X86or (X86setcc) (X86setcc))
46071/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46072static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46073 X86::CondCode &CC1, SDValue &Flags,
46074 bool &isAnd) {
46075 if (Cond->getOpcode() == X86ISD::CMP) {
46076 if (!isNullConstant(Cond->getOperand(1)))
46077 return false;
46078
46079 Cond = Cond->getOperand(0);
46080 }
46081
46082 isAnd = false;
46083
46084 SDValue SetCC0, SetCC1;
46085 switch (Cond->getOpcode()) {
46086 default: return false;
46087 case ISD::AND:
46088 case X86ISD::AND:
46089 isAnd = true;
46090 [[fallthrough]];
46091 case ISD::OR:
46092 case X86ISD::OR:
46093 SetCC0 = Cond->getOperand(0);
46094 SetCC1 = Cond->getOperand(1);
46095 break;
46096 };
46097
46098 // Make sure we have SETCC nodes, using the same flags value.
46099 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46100 SetCC1.getOpcode() != X86ISD::SETCC ||
46101 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46102 return false;
46103
46104 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46105 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46106 Flags = SetCC0->getOperand(1);
46107 return true;
46108}
46109
46110// When legalizing carry, we create carries via add X, -1
46111// If that comes from an actual carry, via setcc, we use the
46112// carry directly.
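// Illustrative note (not in the upstream source): for a boolean X (0 or 1),
// "add X, -1" sets CF exactly when X == 1, so a COND_B user of that add is
// really just testing X; if X itself came from a SETCC of a carry, the
// original EFLAGS can be consumed directly.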
46113static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46114 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46115 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46116 bool FoundAndLSB = false;
46117 SDValue Carry = EFLAGS.getOperand(0);
46118 while (Carry.getOpcode() == ISD::TRUNCATE ||
46119 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46120 (Carry.getOpcode() == ISD::AND &&
46121 isOneConstant(Carry.getOperand(1)))) {
46122 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46123 Carry = Carry.getOperand(0);
46124 }
46125 if (Carry.getOpcode() == X86ISD::SETCC ||
46126 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46127 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46128 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46129 SDValue CarryOp1 = Carry.getOperand(1);
46130 if (CarryCC == X86::COND_B)
46131 return CarryOp1;
46132 if (CarryCC == X86::COND_A) {
46133 // Try to convert COND_A into COND_B in an attempt to facilitate
46134 // materializing "setb reg".
46135 //
46136 // Do not flip "e > c", where "c" is a constant, because Cmp
46137 // instruction cannot take an immediate as its first operand.
46138 //
46139 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46140 CarryOp1.getNode()->hasOneUse() &&
46141 CarryOp1.getValueType().isInteger() &&
46142 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46143 SDValue SubCommute =
46144 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46145 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46146 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46147 }
46148 }
46149 // If this is a check of the z flag of an add with 1, switch to the
46150 // C flag.
46151 if (CarryCC == X86::COND_E &&
46152 CarryOp1.getOpcode() == X86ISD::ADD &&
46153 isOneConstant(CarryOp1.getOperand(1)))
46154 return CarryOp1;
46155 } else if (FoundAndLSB) {
46156 SDLoc DL(Carry);
46157 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46158 if (Carry.getOpcode() == ISD::SRL) {
46159 BitNo = Carry.getOperand(1);
46160 Carry = Carry.getOperand(0);
46161 }
46162 return getBT(Carry, BitNo, DL, DAG);
46163 }
46164 }
46165 }
46166
46167 return SDValue();
46168}
46169
46170/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
46171/// to avoid the inversion.
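/// Illustrative example (not in the upstream source): checking ZF of
/// PTEST(~X, Y) asks whether (~X & Y) == 0, which is exactly CF of
/// PTEST(X, Y), so COND_E can be rewritten to COND_B once the NOT is removed.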
46172static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46173 SelectionDAG &DAG,
46174 const X86Subtarget &Subtarget) {
46175 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46176 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46177 EFLAGS.getOpcode() != X86ISD::TESTP)
46178 return SDValue();
46179
46180 // PTEST/TESTP sets EFLAGS as:
46181 // TESTZ: ZF = (Op0 & Op1) == 0
46182 // TESTC: CF = (~Op0 & Op1) == 0
46183 // TESTNZC: ZF == 0 && CF == 0
46184 MVT VT = EFLAGS.getSimpleValueType();
46185 SDValue Op0 = EFLAGS.getOperand(0);
46186 SDValue Op1 = EFLAGS.getOperand(1);
46187 MVT OpVT = Op0.getSimpleValueType();
46188 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46189
46190 // TEST*(~X,Y) == TEST*(X,Y)
46191 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46192 X86::CondCode InvCC;
46193 switch (CC) {
46194 case X86::COND_B:
46195 // testc -> testz.
46196 InvCC = X86::COND_E;
46197 break;
46198 case X86::COND_AE:
46199 // !testc -> !testz.
46200 InvCC = X86::COND_NE;
46201 break;
46202 case X86::COND_E:
46203 // testz -> testc.
46204 InvCC = X86::COND_B;
46205 break;
46206 case X86::COND_NE:
46207 // !testz -> !testc.
46208 InvCC = X86::COND_AE;
46209 break;
46210 case X86::COND_A:
46211 case X86::COND_BE:
46212 // testnzc -> testnzc (no change).
46213 InvCC = CC;
46214 break;
46215 default:
46216 InvCC = X86::COND_INVALID;
46217 break;
46218 }
46219
46220 if (InvCC != X86::COND_INVALID) {
46221 CC = InvCC;
46222 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46223 DAG.getBitcast(OpVT, NotOp0), Op1);
46224 }
46225 }
46226
46227 if (CC == X86::COND_B || CC == X86::COND_AE) {
46228 // TESTC(X,~X) == TESTC(X,-1)
46229 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46230 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
46231 SDLoc DL(EFLAGS);
46232 return DAG.getNode(
46233 EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1),
46234 DAG.getBitcast(OpVT,
46235 DAG.getAllOnesConstant(DL, NotOp1.getValueType())));
46236 }
46237 }
46238 }
46239
46240 if (CC == X86::COND_E || CC == X86::COND_NE) {
46241 // TESTZ(X,~Y) == TESTC(Y,X)
46242 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46243 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46244 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46245 DAG.getBitcast(OpVT, NotOp1), Op0);
46246 }
46247
46248 if (Op0 == Op1) {
46249 SDValue BC = peekThroughBitcasts(Op0);
46250 EVT BCVT = BC.getValueType();
46251
46252 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46253 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46254 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46255 DAG.getBitcast(OpVT, BC.getOperand(0)),
46256 DAG.getBitcast(OpVT, BC.getOperand(1)));
46257 }
46258
46259 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46260 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46261 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46262 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46263 DAG.getBitcast(OpVT, BC.getOperand(0)),
46264 DAG.getBitcast(OpVT, BC.getOperand(1)));
46265 }
46266
46267 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
46268 // to more efficiently extract the sign bits and compare that.
46269 // TODO: Handle TESTC with comparison inversion.
46270 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46271 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
46272 if (BCVT.isVector() && TLI.isTypeLegal(BCVT)) {
46273 unsigned EltBits = BCVT.getScalarSizeInBits();
46274 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46275 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46276 APInt SignMask = APInt::getSignMask(EltBits);
46277 if (SDValue Res =
46278 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46279 // For vXi16 cases we need to use pmovmskb and extract every other
46280 // sign bit.
46281 SDLoc DL(EFLAGS);
46282 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
46283 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
46284 MVT FloatVT =
46285 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
46286 Res = DAG.getBitcast(FloatVT, Res);
46287 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
46288 } else if (EltBits == 16) {
46289 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46290 Res = DAG.getBitcast(MovmskVT, Res);
46291 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46292 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46293 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46294 } else {
46295 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46296 }
46297 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46298 DAG.getConstant(0, DL, MVT::i32));
46299 }
46300 }
46301 }
46302 }
46303
46304 // TESTZ(-1,X) == TESTZ(X,X)
46305 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46306 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46307
46308 // TESTZ(X,-1) == TESTZ(X,X)
46309 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46310 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46311
46312 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46313 // TODO: Add COND_NE handling?
46314 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46315 SDValue Src0 = peekThroughBitcasts(Op0);
46316 SDValue Src1 = peekThroughBitcasts(Op1);
46317 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46318 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46319 peekThroughBitcasts(Src0.getOperand(1)), true);
46320 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46321 peekThroughBitcasts(Src1.getOperand(1)), true);
46322 if (Src0 && Src1) {
46323 MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT();
46324 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46325 DAG.getBitcast(OpVT2, Src0),
46326 DAG.getBitcast(OpVT2, Src1));
46327 }
46328 }
46329 }
46330 }
46331
46332 return SDValue();
46333}
46334
46335// Attempt to simplify the MOVMSK input based on the comparison type.
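// Illustrative note (not in the upstream source): "movmsk(X) == 0" tests that
// no sign bit is set, while "movmsk(X) == -1" (all mask bits) is an all_of
// test; the folds below rewrite such patterns into narrower MOVMSKs, PTEST or
// TESTPS/TESTPD where that is cheaper.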
46336static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46337 SelectionDAG &DAG,
46338 const X86Subtarget &Subtarget) {
46339 // Handle eq/ne against zero (any_of).
46340 // Handle eq/ne against -1 (all_of).
46341 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46342 return SDValue();
46343 if (EFLAGS.getValueType() != MVT::i32)
46344 return SDValue();
46345 unsigned CmpOpcode = EFLAGS.getOpcode();
46346 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46347 return SDValue();
46348 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46349 if (!CmpConstant)
46350 return SDValue();
46351 const APInt &CmpVal = CmpConstant->getAPIntValue();
46352
46353 SDValue CmpOp = EFLAGS.getOperand(0);
46354 unsigned CmpBits = CmpOp.getValueSizeInBits();
46355 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46356
46357 // Peek through any truncate.
46358 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46359 CmpOp = CmpOp.getOperand(0);
46360
46361 // Bail if we don't find a MOVMSK.
46362 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46363 return SDValue();
46364
46365 SDValue Vec = CmpOp.getOperand(0);
46366 MVT VecVT = Vec.getSimpleValueType();
46367 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46368 "Unexpected MOVMSK operand");
46369 unsigned NumElts = VecVT.getVectorNumElements();
46370 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46371
46372 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46373 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46374 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46375 if (!IsAnyOf && !IsAllOf)
46376 return SDValue();
46377
46378 // TODO: Check more combining cases.
46379 // Here we check the number of uses of the compare to decide whether to
46380 // combine. Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
46381 // combines have test coverage showing they fit this one-use constraint.
46382 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46383
46384 // See if we can peek through to a vector with a wider element type, if the
46385 // signbits extend down to all the sub-elements as well.
46386 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46387 // potential SimplifyDemandedBits/Elts cases.
46388 // If we looked through a truncate that discard bits, we can't do this
46389 // transform.
46390 // FIXME: We could do this transform for truncates that discarded bits by
46391 // inserting an AND mask between the new MOVMSK and the CMP.
46392 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46393 SDValue BC = peekThroughBitcasts(Vec);
46394 MVT BCVT = BC.getSimpleValueType();
46395 unsigned BCNumElts = BCVT.getVectorNumElements();
46396 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46397 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46398 BCNumEltBits > NumEltBits &&
46399 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46400 SDLoc DL(EFLAGS);
46401 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46402 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46403 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46404 DAG.getConstant(CmpMask, DL, MVT::i32));
46405 }
46406 }
46407
46408 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46409 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46410 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46411 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
46412 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46413 SmallVector<SDValue> Ops;
46414 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46415 Ops.size() == 2) {
46416 SDLoc DL(EFLAGS);
46417 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46418 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46419 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46420 DAG.getBitcast(SubVT, Ops[0]),
46421 DAG.getBitcast(SubVT, Ops[1]));
46422 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46423 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46424 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46425 DAG.getConstant(CmpMask, DL, MVT::i32));
46426 }
46427 }
46428
46429 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46430 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46431 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
46432 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
46433 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46434 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46435 SDValue BC = peekThroughBitcasts(Vec);
46436 // Ensure MOVMSK was testing every signbit of BC.
46437 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46438 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46439 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
46440 BC.getOperand(0), BC.getOperand(1));
46441 V = DAG.getBitcast(TestVT, V);
46442 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46443 }
46444 // Check for 256-bit split vector cases.
46445 if (BC.getOpcode() == ISD::AND &&
46446 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46447 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46448 SDValue LHS = BC.getOperand(0);
46449 SDValue RHS = BC.getOperand(1);
46450 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
46451 LHS.getOperand(0), LHS.getOperand(1));
46452 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
46453 RHS.getOperand(0), RHS.getOperand(1));
46454 LHS = DAG.getBitcast(TestVT, LHS);
46455 RHS = DAG.getBitcast(TestVT, RHS);
46456 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46457 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46458 }
46459 }
46460 }
46461
46462 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46463 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46464 // sign bits prior to the comparison with zero unless we know that
46465 // the vXi16 splats the sign bit down to the lower i8 half.
46466 // TODO: Handle all_of patterns.
46467 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46468 SDValue VecOp0 = Vec.getOperand(0);
46469 SDValue VecOp1 = Vec.getOperand(1);
46470 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46471 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46472 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46473 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46474 SDLoc DL(EFLAGS);
46475 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46476 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46477 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46478 if (!SignExt0) {
46479 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46480 DAG.getConstant(0xAAAA, DL, MVT::i16));
46481 }
46482 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46483 DAG.getConstant(0, DL, MVT::i16));
46484 }
46485 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46486 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46487 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46488 (IsAnyOf || (SignExt0 && SignExt1))) {
46489 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46490 SDLoc DL(EFLAGS);
46491 SDValue Result = peekThroughBitcasts(Src);
46492 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46493 Result.getValueType().getVectorNumElements() <= NumElts) {
46494 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
46495 Result.getOperand(0), Result.getOperand(1));
46496 V = DAG.getBitcast(MVT::v4i64, V);
46497 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46498 }
46499 Result = DAG.getBitcast(MVT::v32i8, Result);
46500 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46501 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46502 if (!SignExt0 || !SignExt1) {
46503 assert(IsAnyOf &&
46504 "Only perform v16i16 signmasks for any_of patterns");
46505 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46506 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46507 }
46508 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46509 DAG.getConstant(CmpMask, DL, MVT::i32));
46510 }
46511 }
46512 }
46513
46514 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46515 // Since we peek through a bitcast, we need to be careful if the base vector
46516 // type has smaller elements than the MOVMSK type. In that case, even if
46517 // all the elements are demanded by the shuffle mask, only the "high"
46518 // elements which have highbits that align with highbits in the MOVMSK vec
46519 // elements are actually demanded. Simplification of spurious operations on
46520 // the "low" elements takes place during other simplifications.
46521 //
46522 // For example:
46523 // MOVMSK64(BITCAST(SHUF32 X, (1,0,3,2))): all elements are demanded, yet the
46524 // result can change because the swap reorders which element supplies each high bit.
46525 //
46526 // To address this, we check that we can scale the shuffle mask to MOVMSK
46527 // element width (this will ensure "high" elements match). It's slightly
46528 // overly conservative, but fine for an edge case fold.
46529 SmallVector<int, 32> ShuffleMask, ScaledMaskUnused;
46530 SmallVector<SDValue, 2> ShuffleInputs;
46531 if (NumElts <= CmpBits &&
46532 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46533 ShuffleMask, DAG) &&
46534 ShuffleInputs.size() == 1 && isCompletePermute(ShuffleMask) &&
46535 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits() &&
46536 scaleShuffleElements(ShuffleMask, NumElts, ScaledMaskUnused)) {
46537 SDLoc DL(EFLAGS);
46538 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46539 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46540 Result =
46541 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46542 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, EFLAGS.getOperand(1));
46543 }
46544
46545 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
46546 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
46547 // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V)
46548 // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V)
46549 // iff every element is referenced.
46550 if (NumElts <= CmpBits && Subtarget.hasAVX() &&
46551 !Subtarget.preferMovmskOverVTest() && IsOneUse &&
46552 (NumEltBits == 32 || NumEltBits == 64)) {
46553 SDLoc DL(EFLAGS);
46554 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
46555 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
46556 MVT IntVT = FloatVT.changeVectorElementTypeToInteger();
46557 SDValue LHS = Vec;
46558 SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT);
46559 CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46560 return DAG.getNode(X86ISD::TESTP, DL, MVT::i32,
46561 DAG.getBitcast(FloatVT, LHS),
46562 DAG.getBitcast(FloatVT, RHS));
46563 }
46564
46565 return SDValue();
46566}
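// Editorial note (not part of the original source): a rough illustration of the
// MOVMSK -> TESTP rewrite above, assuming AVX encodings. The any_of pattern
//   movmskps %xmm0, %eax ; test %eax, %eax ; jne ...
// can instead be emitted as
//   vtestps %xmm0, %xmm0 ; jne ...
// avoiding the vector-to-GPR transfer; for the all_of pattern the branch is
// remapped from ZF (COND_E/COND_NE) to CF (COND_B/COND_AE), matching the CC
// adjustment performed in the code above.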
46567
46568/// Optimize an EFLAGS definition used according to the condition code \p CC
46569/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46570/// uses of chain values.
46571 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46572 SelectionDAG &DAG,
46573 const X86Subtarget &Subtarget) {
46574 if (CC == X86::COND_B)
46575 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46576 return Flags;
46577
46578 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46579 return R;
46580
46581 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46582 return R;
46583
46584 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46585 return R;
46586
46587 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46588}
46589
46590/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46591 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46592 TargetLowering::DAGCombinerInfo &DCI,
46593 const X86Subtarget &Subtarget) {
46594 SDLoc DL(N);
46595
46596 SDValue FalseOp = N->getOperand(0);
46597 SDValue TrueOp = N->getOperand(1);
46598 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46599 SDValue Cond = N->getOperand(3);
46600
46601 // cmov X, X, ?, ? --> X
46602 if (TrueOp == FalseOp)
46603 return TrueOp;
46604
46605 // Try to simplify the EFLAGS and condition code operands.
46607 // We can't always do this as FCMOV only supports a subset of X86 condition codes.
46607 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46608 if (!(FalseOp.getValueType() == MVT::f80 ||
46609 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46610 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46611 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46612 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46613 Flags};
46614 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46615 }
46616 }
46617
46618 // If this is a select between two integer constants, try to do some
46619 // optimizations. Note that the operands are ordered the opposite of SELECT
46620 // operands.
46621 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46622 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46623 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46624 // larger than FalseC (the false value).
46625 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46626 CC = X86::GetOppositeBranchCondition(CC);
46627 std::swap(TrueC, FalseC);
46628 std::swap(TrueOp, FalseOp);
46629 }
46630
46631 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46632 // This is efficient for any integer data type (including i8/i16) and
46633 // shift amount.
46634 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46635 Cond = getSETCC(CC, Cond, DL, DAG);
46636
46637 // Zero extend the condition if needed.
46638 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46639
46640 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46641 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46642 DAG.getConstant(ShAmt, DL, MVT::i8));
46643 return Cond;
46644 }
46645
46646 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
46647 // for any integer data type, including i8/i16.
46648 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46649 Cond = getSETCC(CC, Cond, DL, DAG);
46650
46651 // Zero extend the condition if needed.
46652 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46653 FalseC->getValueType(0), Cond);
46654 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46655 SDValue(FalseC, 0));
46656 return Cond;
46657 }
46658
46659 // Optimize cases that will turn into an LEA instruction. This requires
46660 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
46661 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46662 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46663 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46664 "Implicit constant truncation");
46665
46666 bool isFastMultiplier = false;
46667 if (Diff.ult(10)) {
46668 switch (Diff.getZExtValue()) {
46669 default: break;
46670 case 1: // result = add base, cond
46671 case 2: // result = lea base( , cond*2)
46672 case 3: // result = lea base(cond, cond*2)
46673 case 4: // result = lea base( , cond*4)
46674 case 5: // result = lea base(cond, cond*4)
46675 case 8: // result = lea base( , cond*8)
46676 case 9: // result = lea base(cond, cond*8)
46677 isFastMultiplier = true;
46678 break;
46679 }
46680 }
46681
46682 if (isFastMultiplier) {
46683 Cond = getSETCC(CC, Cond, DL, DAG);
46684 // Zero extend the condition if needed.
46685 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46686 Cond);
46687 // Scale the condition by the difference.
46688 if (Diff != 1)
46689 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46690 DAG.getConstant(Diff, DL, Cond.getValueType()));
46691
46692 // Add the base if non-zero.
46693 if (FalseC->getAPIntValue() != 0)
46694 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46695 SDValue(FalseC, 0));
46696 return Cond;
46697 }
46698 }
46699 }
46700 }
46701
46702 // Handle these cases:
46703 // (select (x != c), e, c) -> (select (x != c), e, x),
46704 // (select (x == c), c, e) -> (select (x == c), x, e)
46705 // where the c is an integer constant, and the "select" is the combination
46706 // of CMOV and CMP.
46707 //
46708 // The rationale for this change is that a conditional-move from a constant
46709 // needs two instructions, whereas a conditional-move from a register needs
46710 // only one instruction.
46711 //
46712 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
46713 // some instruction-combining opportunities. This opt needs to be
46714 // postponed as late as possible.
46715 //
46716 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
46717 // the DCI.xxxx conditions are provided to postpone the optimization as
46718 // late as possible.
46719
46720 ConstantSDNode *CmpAgainst = nullptr;
46721 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
46722 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
46723 !isa<ConstantSDNode>(Cond.getOperand(0))) {
46724
46725 if (CC == X86::COND_NE &&
46726 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
46727 CC = X86::COND_E;
46728 std::swap(TrueOp, FalseOp);
46729 }
46730
46731 if (CC == X86::COND_E &&
46732 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
46733 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
46734 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
46735 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46736 }
46737 }
46738 }
46739
46740 // Transform:
46741 //
46742 // (cmov 1 T (uge T 2))
46743 //
46744 // to:
46745 //
46746 // (adc T 0 (sub T 1))
46747 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
46748 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
46749 SDValue Cond0 = Cond.getOperand(0);
46750 if (Cond0.getOpcode() == ISD::TRUNCATE)
46751 Cond0 = Cond0.getOperand(0);
46752 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
46753 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
46754 EVT CondVT = Cond->getValueType(0);
46755 EVT OuterVT = N->getValueType(0);
46756 // Subtract 1 and generate a carry.
46757 SDValue NewSub =
46758 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
46759 DAG.getConstant(1, DL, CondVT));
46760 SDValue EFLAGS(NewSub.getNode(), 1);
46761 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
46762 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
46763 }
46764 }
46765
46766 // Fold and/or of setcc's to double CMOV:
46767 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
46768 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
46769 //
46770 // This combine lets us generate:
46771 // cmovcc1 (jcc1 if we don't have CMOV)
46772 // cmovcc2 (same)
46773 // instead of:
46774 // setcc1
46775 // setcc2
46776 // and/or
46777 // cmovne (jne if we don't have CMOV)
46778 // When we can't use the CMOV instruction, it might increase branch
46779 // mispredicts.
46780 // When we can use CMOV, or when there is no mispredict, this improves
46781 // throughput and reduces register pressure.
46782 //
46783 if (CC == X86::COND_NE) {
46784 SDValue Flags;
46785 X86::CondCode CC0, CC1;
46786 bool isAndSetCC;
46787 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
46788 if (isAndSetCC) {
46789 std::swap(FalseOp, TrueOp);
46790 CC0 = X86::GetOppositeBranchCondition(CC0);
46791 CC1 = X86::GetOppositeBranchCondition(CC1);
46792 }
46793
46794 SDValue LOps[] = {FalseOp, TrueOp,
46795 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
46796 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
46797 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
46798 Flags};
46799 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46800 return CMOV;
46801 }
46802 }
46803
46804 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
46805 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
46806 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
46807 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
46808 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
46809 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
46810 SDValue Add = TrueOp;
46811 SDValue Const = FalseOp;
46812 // Canonicalize the condition code for easier matching and output.
46813 if (CC == X86::COND_E)
46814 std::swap(Add, Const);
46815
46816 // We might have replaced the constant in the cmov with the LHS of the
46817 // compare. If so change it to the RHS of the compare.
46818 if (Const == Cond.getOperand(0))
46819 Const = Cond.getOperand(1);
46820
46821 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
46822 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
46823 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
46824 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
46825 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
46826 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
46827 EVT VT = N->getValueType(0);
46828 // This should constant fold.
46829 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
46830 SDValue CMov =
46831 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
46832 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
46833 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
46834 }
46835 }
46836
46837 return SDValue();
46838}
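// Editorial sketch (not part of the original source): the constant-select folds
// above checked with plain integer arithmetic, where the zero-extended setcc
// result is 0 or 1:
static_assert((1 << 3) == 8 && (0 << 3) == 0,
              "C ? 8 : 0  ->  zext(setcc(C)) << 3");
static_assert((1 * 9) + 4 == 13 && (0 * 9) + 4 == 4,
              "C ? 13 : 4 ->  zext(setcc(C)) * 9 + 4 (LEA-friendly diff of 9)");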
46839
46840 /// Different mul shrinking modes.
46841 enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
46842
46843 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
46844 EVT VT = N->getOperand(0).getValueType();
46845 if (VT.getScalarSizeInBits() != 32)
46846 return false;
46847
46848 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
46849 unsigned SignBits[2] = {1, 1};
46850 bool IsPositive[2] = {false, false};
46851 for (unsigned i = 0; i < 2; i++) {
46852 SDValue Opd = N->getOperand(i);
46853
46854 SignBits[i] = DAG.ComputeNumSignBits(Opd);
46855 IsPositive[i] = DAG.SignBitIsZero(Opd);
46856 }
46857
46858 bool AllPositive = IsPositive[0] && IsPositive[1];
46859 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
46860 // When ranges are from -128 ~ 127, use MULS8 mode.
46861 if (MinSignBits >= 25)
46862 Mode = ShrinkMode::MULS8;
46863 // When ranges are from 0 ~ 255, use MULU8 mode.
46864 else if (AllPositive && MinSignBits >= 24)
46865 Mode = ShrinkMode::MULU8;
46866 // When ranges are from -32768 ~ 32767, use MULS16 mode.
46867 else if (MinSignBits >= 17)
46868 Mode = ShrinkMode::MULS16;
46869 // When ranges are from 0 ~ 65535, use MULU16 mode.
46870 else if (AllPositive && MinSignBits >= 16)
46871 Mode = ShrinkMode::MULU16;
46872 else
46873 return false;
46874 return true;
46875}
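// Editorial note (not part of the original source): the sign-bit thresholds
// above follow from the i32 element width. A value known to lie in [-128, 127]
// repeats its sign bit through the top 32 - 8 + 1 = 25 bits (hence MULS8 needs
// >= 25), a value in [0, 255] has its top 24 bits clear (MULU8 needs positive
// and >= 24), and the 16-bit cases need >= 17 and >= 16 respectively by the
// same argument.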
46876
46877/// When the operands of vector mul are extended from smaller size values,
46878 /// like i8 and i16, the type of mul may be shrunk to generate more
46879/// efficient code. Two typical patterns are handled:
46880/// Pattern1:
46881/// %2 = sext/zext <N x i8> %1 to <N x i32>
46882/// %4 = sext/zext <N x i8> %3 to <N x i32>
46883 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46884/// %5 = mul <N x i32> %2, %4
46885///
46886/// Pattern2:
46887/// %2 = zext/sext <N x i16> %1 to <N x i32>
46888/// %4 = zext/sext <N x i16> %3 to <N x i32>
46889/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
46890/// %5 = mul <N x i32> %2, %4
46891///
46892/// There are four mul shrinking modes:
46893/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
46894 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
46895/// generate pmullw+sext32 for it (MULS8 mode).
46896/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
46897/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
46898/// generate pmullw+zext32 for it (MULU8 mode).
46899/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
46900/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
46901/// generate pmullw+pmulhw for it (MULS16 mode).
46902/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
46903/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
46904/// generate pmullw+pmulhuw for it (MULU16 mode).
46905 static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
46906 const X86Subtarget &Subtarget) {
46907 // Check for legality
46908 // pmullw/pmulhw are not supported by SSE.
46909 if (!Subtarget.hasSSE2())
46910 return SDValue();
46911
46912 // Check for profitability
46913 // pmulld is supported since SSE41. It is better to use pmulld
46914 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
46915 // the expansion.
46916 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
46917 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
46918 return SDValue();
46919
46920 ShrinkMode Mode;
46921 if (!canReduceVMulWidth(N, DAG, Mode))
46922 return SDValue();
46923
46924 SDValue N0 = N->getOperand(0);
46925 SDValue N1 = N->getOperand(1);
46926 EVT VT = N->getOperand(0).getValueType();
46927 unsigned NumElts = VT.getVectorNumElements();
46928 if ((NumElts % 2) != 0)
46929 return SDValue();
46930
46931 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
46932
46933 // Shrink the operands of mul.
46934 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
46935 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
46936
46937 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
46938 // lower part is needed.
46939 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
46940 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
46941 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
46942 : ISD::SIGN_EXTEND,
46943 DL, VT, MulLo);
46944
46945 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
46946 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
46947 // the higher part is also needed.
46948 SDValue MulHi =
46949 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
46950 ReducedVT, NewN0, NewN1);
46951
46952 // Repack the lower part and higher part result of mul into a wider
46953 // result.
46954 // Generate shuffle functioning as punpcklwd.
46955 SmallVector<int, 16> ShuffleMask(NumElts);
46956 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46957 ShuffleMask[2 * i] = i;
46958 ShuffleMask[2 * i + 1] = i + NumElts;
46959 }
46960 SDValue ResLo =
46961 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46962 ResLo = DAG.getBitcast(ResVT, ResLo);
46963 // Generate shuffle functioning as punpckhwd.
46964 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
46965 ShuffleMask[2 * i] = i + NumElts / 2;
46966 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
46967 }
46968 SDValue ResHi =
46969 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
46970 ResHi = DAG.getBitcast(ResVT, ResHi);
46971 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
46972}
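// Editorial note (not part of the original source): for NumElts == 8 the two
// shuffle masks built above are {0,8,1,9,2,10,3,11} and {4,12,5,13,6,14,7,15},
// i.e. they interleave the lanes of MulLo and MulHi exactly like
// punpcklwd/punpckhwd, so each pair of i16 lanes reassembles one full i32
// product before the bitcast and concat.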
46973
46974 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
46975 EVT VT, const SDLoc &DL) {
46976
46977 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
46978 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46979 DAG.getConstant(Mult, DL, VT));
46980 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
46981 DAG.getConstant(Shift, DL, MVT::i8));
46982 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46983 N->getOperand(0));
46984 return Result;
46985 };
46986
46987 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
46988 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
46989 DAG.getConstant(Mul1, DL, VT));
46990 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
46991 DAG.getConstant(Mul2, DL, VT));
46992 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
46993 N->getOperand(0));
46994 return Result;
46995 };
46996
46997 switch (MulAmt) {
46998 default:
46999 break;
47000 case 11:
47001 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47002 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47003 case 21:
47004 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47005 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47006 case 41:
47007 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47008 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47009 case 22:
47010 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47011 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47012 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47013 case 19:
47014 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47015 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47016 case 37:
47017 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47018 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47019 case 73:
47020 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47021 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47022 case 13:
47023 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47024 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47025 case 23:
47026 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47027 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47028 case 26:
47029 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47030 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47031 case 28:
47032 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47033 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47034 case 29:
47035 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47036 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47037 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47038 }
47039
47040 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
47041 // by a single LEA.
47042 // First check if this is a sum of two powers of 2 because that's easy. Then
47043 // count the trailing zeros to locate the smaller power of 2.
47044 // TODO: We can do this even without LEA at a cost of two shifts and an add.
47045 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47046 unsigned ScaleShift = llvm::countr_zero(MulAmt);
47047 if (ScaleShift >= 1 && ScaleShift < 4) {
47048 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47049 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47050 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47051 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47052 DAG.getConstant(ScaleShift, DL, MVT::i8));
47053 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47054 }
47055 }
47056
47057 return SDValue();
47058}
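// Editorial sketch (not part of the original source): two of the decompositions
// above checked with plain constant arithmetic (x == 7):
static_assert((((7 * 9) * 3) + 7) + 7 == 7 * 29,
              "mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)");
static_assert((7 << 3) + (7 << 2) == 7 * 12,
              "power-of-2 sum trick: 12 = 8 + 4 => (x << 3) + (x << 2)");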
47059
47060 // If the upper 17 bits of one operand's elements are zero and the upper 17
47061 // bits of the other operand's elements are all copies of its sign bit, then we
47062 // can use PMADDWD, which is always at least as quick as PMULLD, except on KNL.
47063 static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL,
47064 SelectionDAG &DAG,
47065 const X86Subtarget &Subtarget) {
47066 if (!Subtarget.hasSSE2())
47067 return SDValue();
47068
47069 if (Subtarget.isPMADDWDSlow())
47070 return SDValue();
47071
47072 EVT VT = N->getValueType(0);
47073
47074 // Only support vXi32 vectors.
47075 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47076 return SDValue();
47077
47078 // Make sure the type is legal or can split/widen to a legal type.
47079 // With AVX512 but without BWI, we would need to split v32i16.
47080 unsigned NumElts = VT.getVectorNumElements();
47081 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47082 return SDValue();
47083
47084 // With AVX512 but without BWI, we would need to split v32i16.
47085 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47086 return SDValue();
47087
47088 SDValue N0 = N->getOperand(0);
47089 SDValue N1 = N->getOperand(1);
47090
47091 // If we are zero/sign extending two steps without SSE4.1, it's better to
47092 // reduce the vmul width instead.
47093 if (!Subtarget.hasSSE41() &&
47094 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47095 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47096 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47097 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47098 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47099 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47100 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47101 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47102 return SDValue();
47103
47104 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47105 // the vmul width instead.
47106 if (!Subtarget.hasSSE41() &&
47107 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47108 N0.getOperand(0).getValueSizeInBits() > 128) &&
47109 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47110 N1.getOperand(0).getValueSizeInBits() > 128))
47111 return SDValue();
47112
47113 // Sign bits must extend down to the lowest i16.
47114 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47115 DAG.ComputeMaxSignificantBits(N0) > 16)
47116 return SDValue();
47117
47118 // At least one of the elements must be zero in the upper 17 bits, or can be
47119 // safely made zero without altering the final result.
47120 auto GetZeroableOp = [&](SDValue Op) {
47121 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47122 if (DAG.MaskedValueIsZero(Op, Mask17))
47123 return Op;
47124 // Mask off upper 16-bits of sign-extended constants.
47125 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47126 return DAG.getNode(ISD::AND, DL, VT, Op, DAG.getConstant(0xFFFF, DL, VT));
47127 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47128 SDValue Src = Op.getOperand(0);
47129 // Convert sext(vXi16) to zext(vXi16).
47130 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47131 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47132 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47133 // which will expand the extension.
47134 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47135 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47136 Src = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, Src);
47137 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Src);
47138 }
47139 }
47140 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47141 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47142 N->isOnlyUserOf(Op.getNode())) {
47143 SDValue Src = Op.getOperand(0);
47144 if (Src.getScalarValueSizeInBits() == 16)
47145 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Src);
47146 }
47147 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47148 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47149 N->isOnlyUserOf(Op.getNode())) {
47150 return DAG.getNode(X86ISD::VSRLI, DL, VT, Op.getOperand(0),
47151 Op.getOperand(1));
47152 }
47153 return SDValue();
47154 };
47155 SDValue ZeroN0 = GetZeroableOp(N0);
47156 SDValue ZeroN1 = GetZeroableOp(N1);
47157 if (!ZeroN0 && !ZeroN1)
47158 return SDValue();
47159 N0 = ZeroN0 ? ZeroN0 : N0;
47160 N1 = ZeroN1 ? ZeroN1 : N1;
47161
47162 // Use SplitOpsAndApply to handle AVX splitting.
47163 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47164 ArrayRef<SDValue> Ops) {
47165 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47166 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47167 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47168 DAG.getBitcast(OpVT, Ops[0]),
47169 DAG.getBitcast(OpVT, Ops[1]));
47170 };
47171 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMADDWDBuilder);
47172}
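// Editorial note (not part of the original source): why the rewrite above is
// exact. PMADDWD computes sext(a.lo16)*sext(b.lo16) + sext(a.hi16)*sext(b.hi16)
// per i32 lane. The checks above guarantee both inputs are sign-extended i16
// values and at least one has its upper 17 bits clear, so the hi16*hi16 term
// vanishes (one factor is zero) and the lo16*lo16 term already equals the full
// 32-bit product.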
47173
47174 static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
47175 const X86Subtarget &Subtarget) {
47176 if (!Subtarget.hasSSE2())
47177 return SDValue();
47178
47179 EVT VT = N->getValueType(0);
47180
47181 // Only support vXi64 vectors.
47182 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47183 VT.getVectorNumElements() < 2 ||
47184 !isPowerOf2_32(VT.getVectorNumElements()))
47185 return SDValue();
47186
47187 SDValue N0 = N->getOperand(0);
47188 SDValue N1 = N->getOperand(1);
47189
47190 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47191 // 32-bits. We can lower with this if the sign bits stretch that far.
47192 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47193 DAG.ComputeNumSignBits(N1) > 32) {
47194 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47195 ArrayRef<SDValue> Ops) {
47196 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47197 };
47198 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULDQBuilder,
47199 /*CheckBWI*/ false);
47200 }
47201
47202 // If the upper bits are zero we can use a single pmuludq.
47203 APInt Mask = APInt::getHighBitsSet(64, 32);
47204 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47205 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47206 ArrayRef<SDValue> Ops) {
47207 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47208 };
47209 return SplitOpsAndApply(DAG, Subtarget, DL, VT, {N0, N1}, PMULUDQBuilder,
47210 /*CheckBWI*/ false);
47211 }
47212
47213 return SDValue();
47214}
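// Editorial note (not part of the original source): PMULUDQ/PMULDQ multiply
// only the low 32 bits of each 64-bit lane (unsigned and signed respectively)
// and produce a full 64-bit result, which is why the code above requires either
// the upper 32 bits to be known zero or more than 32 sign bits before lowering
// the vXi64 multiply to a single instruction per lane.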
47215
47216 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47217 TargetLowering::DAGCombinerInfo &DCI,
47218 const X86Subtarget &Subtarget) {
47219 EVT VT = N->getValueType(0);
47220 SDLoc DL(N);
47221
47222 if (SDValue V = combineMulToPMADDWD(N, DL, DAG, Subtarget))
47223 return V;
47224
47225 if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
47226 return V;
47227
47228 if (DCI.isBeforeLegalize() && VT.isVector())
47229 return reduceVMULWidth(N, DL, DAG, Subtarget);
47230
47231 // Optimize a single multiply with constant into two operations in order to
47232 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47233 if (!MulConstantOptimization)
47234 return SDValue();
47235
47236 // An imul is usually smaller than the alternative sequence.
47237 if (DAG.getMachineFunction().getFunction().hasMinSize())
47238 return SDValue();
47239
47240 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47241 return SDValue();
47242
47243 if (VT != MVT::i64 && VT != MVT::i32 &&
47244 (!VT.isVector() || !VT.isSimple() || !VT.isInteger()))
47245 return SDValue();
47246
47247 ConstantSDNode *CNode = isConstOrConstSplat(
47248 N->getOperand(1), /*AllowUndefs*/ true, /*AllowTrunc*/ false);
47249 const APInt *C = nullptr;
47250 if (!CNode) {
47251 if (VT.isVector())
47252 if (auto *RawC = getTargetConstantFromNode(N->getOperand(1)))
47253 if (auto *SplatC = RawC->getSplatValue())
47254 C = &(SplatC->getUniqueInteger());
47255
47256 if (!C || C->getBitWidth() != VT.getScalarSizeInBits())
47257 return SDValue();
47258 } else {
47259 C = &(CNode->getAPIntValue());
47260 }
47261
47262 if (isPowerOf2_64(C->getZExtValue()))
47263 return SDValue();
47264
47265 int64_t SignMulAmt = C->getSExtValue();
47266 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47267 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47268
47269 SDValue NewMul = SDValue();
47270 if (VT == MVT::i64 || VT == MVT::i32) {
47271 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47272 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47273 DAG.getConstant(AbsMulAmt, DL, VT));
47274 if (SignMulAmt < 0)
47275 NewMul = DAG.getNegative(NewMul, DL, VT);
47276
47277 return NewMul;
47278 }
47279
47280 uint64_t MulAmt1 = 0;
47281 uint64_t MulAmt2 = 0;
47282 if ((AbsMulAmt % 9) == 0) {
47283 MulAmt1 = 9;
47284 MulAmt2 = AbsMulAmt / 9;
47285 } else if ((AbsMulAmt % 5) == 0) {
47286 MulAmt1 = 5;
47287 MulAmt2 = AbsMulAmt / 5;
47288 } else if ((AbsMulAmt % 3) == 0) {
47289 MulAmt1 = 3;
47290 MulAmt2 = AbsMulAmt / 3;
47291 }
47292
47293 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47294 if (MulAmt2 &&
47295 (isPowerOf2_64(MulAmt2) ||
47296 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47297
47298 if (isPowerOf2_64(MulAmt2) && !(SignMulAmt >= 0 && N->hasOneUse() &&
47299 N->use_begin()->getOpcode() == ISD::ADD))
47300 // If the second multiplier is pow2, issue it first. We want the multiply
47301 // by 3, 5, or 9 to be folded into the addressing mode unless the lone
47302 // use is an add. Only do this for positive multiply amounts since the
47303 // negate would prevent it from being used as an address mode anyway.
47304 std::swap(MulAmt1, MulAmt2);
47305
47306 if (isPowerOf2_64(MulAmt1))
47307 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47308 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47309 else
47310 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47311 DAG.getConstant(MulAmt1, DL, VT));
47312
47313 if (isPowerOf2_64(MulAmt2))
47314 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47315 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47316 else
47317 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47318 DAG.getConstant(MulAmt2, DL, VT));
47319
47320 // Negate the result.
47321 if (SignMulAmt < 0)
47322 NewMul = DAG.getNegative(NewMul, DL, VT);
47323 } else if (!Subtarget.slowLEA())
47324 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47325 }
47326 if (!NewMul) {
47327 EVT ShiftVT = VT.isVector() ? VT : MVT::i8;
47328 assert(C->getZExtValue() != 0 &&
47329 C->getZExtValue() != maxUIntN(VT.getScalarSizeInBits()) &&
47330 "Both cases that could cause potential overflows should have "
47331 "already been handled.");
47332 if (isPowerOf2_64(AbsMulAmt - 1)) {
47333 // (mul x, 2^N + 1) => (add (shl x, N), x)
47334 NewMul = DAG.getNode(
47335 ISD::ADD, DL, VT, N->getOperand(0),
47336 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47337 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, ShiftVT)));
47338 if (SignMulAmt < 0)
47339 NewMul = DAG.getNegative(NewMul, DL, VT);
47340 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47341 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47342 NewMul =
47343 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47344 DAG.getConstant(Log2_64(AbsMulAmt + 1), DL, ShiftVT));
47345 // To negate, reverse the operands of the subtract.
47346 if (SignMulAmt < 0)
47347 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47348 else
47349 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47350 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2) &&
47351 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47352 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47353 NewMul =
47354 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47355 DAG.getConstant(Log2_64(AbsMulAmt - 2), DL, ShiftVT));
47356 NewMul = DAG.getNode(
47357 ISD::ADD, DL, VT, NewMul,
47358 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47359 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2) &&
47360 (!VT.isVector() || Subtarget.fastImmVectorShift())) {
47361 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47362 NewMul =
47363 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47364 DAG.getConstant(Log2_64(AbsMulAmt + 2), DL, ShiftVT));
47365 NewMul = DAG.getNode(
47366 ISD::SUB, DL, VT, NewMul,
47367 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47368 } else if (SignMulAmt >= 0 && VT.isVector() &&
47369 Subtarget.fastImmVectorShift()) {
47370 uint64_t AbsMulAmtLowBit = AbsMulAmt & (-AbsMulAmt);
47371 uint64_t ShiftAmt1;
47372 std::optional<unsigned> Opc;
47373 if (isPowerOf2_64(AbsMulAmt - AbsMulAmtLowBit)) {
47374 ShiftAmt1 = AbsMulAmt - AbsMulAmtLowBit;
47375 Opc = ISD::ADD;
47376 } else if (isPowerOf2_64(AbsMulAmt + AbsMulAmtLowBit)) {
47377 ShiftAmt1 = AbsMulAmt + AbsMulAmtLowBit;
47378 Opc = ISD::SUB;
47379 }
47380
47381 if (Opc) {
47382 SDValue Shift1 =
47383 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47384 DAG.getConstant(Log2_64(ShiftAmt1), DL, ShiftVT));
47385 SDValue Shift2 =
47386 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47387 DAG.getConstant(Log2_64(AbsMulAmtLowBit), DL, ShiftVT));
47388 NewMul = DAG.getNode(*Opc, DL, VT, Shift1, Shift2);
47389 }
47390 }
47391 }
47392
47393 return NewMul;
47394}
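// Editorial sketch (not part of the original source): constant checks for a few
// of the shift/add decompositions chosen above (x == 7):
static_assert((7 << 4) + 7 == 7 * 17, "2^N + 1: 17 => (x << 4) + x");
static_assert((7 << 5) - 7 == 7 * 31, "2^N - 1: 31 => (x << 5) - x");
static_assert((7 << 5) - (7 + 7) == 7 * 30, "2^N - 2: 30 => (x << 5) - (x + x)");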
47395
47396// Try to form a MULHU or MULHS node by looking for
47397// (srl (mul ext, ext), 16)
47398// TODO: This is X86 specific because we want to be able to handle wide types
47399// before type legalization. But we can only do it if the vector will be
47400// legalized via widening/splitting. Type legalization can't handle promotion
47401// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47402// combiner.
47403 static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47404 const X86Subtarget &Subtarget) {
47405 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47406 "SRL or SRA node is required here!");
47407 SDLoc DL(N);
47408
47409 if (!Subtarget.hasSSE2())
47410 return SDValue();
47411
47412 // The operation feeding into the shift must be a multiply.
47413 SDValue ShiftOperand = N->getOperand(0);
47414 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47415 return SDValue();
47416
47417 // Input type should be at least vXi32.
47418 EVT VT = N->getValueType(0);
47419 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47420 return SDValue();
47421
47422 // Need a shift by 16.
47423 APInt ShiftAmt;
47424 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47425 ShiftAmt != 16)
47426 return SDValue();
47427
47428 SDValue LHS = ShiftOperand.getOperand(0);
47429 SDValue RHS = ShiftOperand.getOperand(1);
47430
47431 unsigned ExtOpc = LHS.getOpcode();
47432 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47433 RHS.getOpcode() != ExtOpc)
47434 return SDValue();
47435
47436 // Peek through the extends.
47437 LHS = LHS.getOperand(0);
47438 RHS = RHS.getOperand(0);
47439
47440 // Ensure the input types match.
47441 EVT MulVT = LHS.getValueType();
47442 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47443 return SDValue();
47444
47445 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47446 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47447
47448 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47449 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47450}
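// Editorial sketch (not part of the original source): the shift-by-16 of a
// widened product is exactly a "multiply high" on the narrow inputs, e.g. for
// unsigned 16-bit operands:
static_assert(((0xFFFFULL * 0xFFFFULL) >> 16) == 0xFFFEULL,
              "high 16 bits of the 32-bit product of two u16 values == MULHU");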
47451
47452 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47453 SDValue N0 = N->getOperand(0);
47454 SDValue N1 = N->getOperand(1);
47455 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47456 EVT VT = N0.getValueType();
47457
47458 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47459 // since the result of setcc_c is all zeros or all ones.
47460 if (VT.isInteger() && !VT.isVector() &&
47461 N1C && N0.getOpcode() == ISD::AND &&
47462 N0.getOperand(1).getOpcode() == ISD::Constant) {
47463 SDValue N00 = N0.getOperand(0);
47464 APInt Mask = N0.getConstantOperandAPInt(1);
47465 Mask <<= N1C->getAPIntValue();
47466 bool MaskOK = false;
47467 // We can handle cases concerning bit-widening nodes containing setcc_c if
47468 // we carefully interrogate the mask to make sure the transform is
47469 // semantics-preserving.
47470 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47471 // of the underlying setcc_c operation if the setcc_c was zero extended.
47472 // Consider the following example:
47473 // zext(setcc_c) -> i32 0x0000FFFF
47474 // c1 -> i32 0x0000FFFF
47475 // c2 -> i32 0x00000001
47476 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47477 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47478 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47479 MaskOK = true;
47480 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47481 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47482 MaskOK = true;
47483 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47484 N00.getOpcode() == ISD::ANY_EXTEND) &&
47485 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47486 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47487 }
47488 if (MaskOK && Mask != 0) {
47489 SDLoc DL(N);
47490 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47491 }
47492 }
47493
47494 return SDValue();
47495}
47496
47497 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47498 const X86Subtarget &Subtarget) {
47499 SDValue N0 = N->getOperand(0);
47500 SDValue N1 = N->getOperand(1);
47501 EVT VT = N0.getValueType();
47502 unsigned Size = VT.getSizeInBits();
47503
47504 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47505 return V;
47506
47507 APInt ShiftAmt;
47508 if (supportedVectorVarShift(VT, Subtarget, ISD::SRA) &&
47509 N1.getOpcode() == ISD::UMIN &&
47510 ISD::isConstantSplatVector(N1.getOperand(1).getNode(), ShiftAmt) &&
47511 ShiftAmt == VT.getScalarSizeInBits() - 1) {
47512 SDValue ShrAmtVal = N1.getOperand(0);
47513 SDLoc DL(N);
47514 return DAG.getNode(X86ISD::VSRAV, DL, N->getVTList(), N0, ShrAmtVal);
47515 }
47516
47517 // fold (SRA (SHL X, ShlConst), SraConst)
47518 // into (SHL (sext_in_reg X), ShlConst - SraConst)
47519 // or (sext_in_reg X)
47520 // or (SRA (sext_in_reg X), SraConst - ShlConst)
47521 // depending on relation between SraConst and ShlConst.
47522 // We only do this if (Size - ShlConst) is equal to 8, 16 or 32. That allows
47523 // us to do the sext_in_reg from the corresponding bit.
47524
47525 // sexts in X86 are MOVs. The MOVs have the same code size
47526 // as the SHIFTs above (only a SHIFT by 1 has lower code size).
47527 // However the MOVs have 2 advantages over a SHIFT:
47528 // 1. MOVs can write to a register that differs from the source.
47529 // 2. MOVs accept memory operands.
47530
47531 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47532 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47533 N0.getOperand(1).getOpcode() != ISD::Constant)
47534 return SDValue();
47535
47536 SDValue N00 = N0.getOperand(0);
47537 SDValue N01 = N0.getOperand(1);
47538 APInt ShlConst = N01->getAsAPIntVal();
47539 APInt SraConst = N1->getAsAPIntVal();
47540 EVT CVT = N1.getValueType();
47541
47542 if (CVT != N01.getValueType())
47543 return SDValue();
47544 if (SraConst.isNegative())
47545 return SDValue();
47546
47547 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47548 unsigned ShiftSize = SVT.getSizeInBits();
47549 // Only deal with (Size - ShlConst) being equal to 8, 16 or 32.
47550 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47551 continue;
47552 SDLoc DL(N);
47553 SDValue NN =
47554 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47555 if (SraConst.eq(ShlConst))
47556 return NN;
47557 if (SraConst.ult(ShlConst))
47558 return DAG.getNode(ISD::SHL, DL, VT, NN,
47559 DAG.getConstant(ShlConst - SraConst, DL, CVT));
47560 return DAG.getNode(ISD::SRA, DL, VT, NN,
47561 DAG.getConstant(SraConst - ShlConst, DL, CVT));
47562 }
47563 return SDValue();
47564}
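// Editorial note (not part of the original source): a concrete instance of the
// fold above. With i32, ShlConst == 24 and SraConst == 26 we have
// Size - ShlConst == 8, so
//   (sra (shl x, 24), 26)  ->  (sra (sext_in_reg x, i8), 2)
// both sides sign-extend bits x[7:0] and then arithmetically shift right by the
// remaining 2 bits.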
47565
47566 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47567 TargetLowering::DAGCombinerInfo &DCI,
47568 const X86Subtarget &Subtarget) {
47569 SDValue N0 = N->getOperand(0);
47570 SDValue N1 = N->getOperand(1);
47571 EVT VT = N0.getValueType();
47572
47573 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47574 return V;
47575
47576 // Only do this on the last DAG combine as it can interfere with other
47577 // combines.
47578 if (!DCI.isAfterLegalizeDAG())
47579 return SDValue();
47580
47581 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47582 // TODO: This is a generic DAG combine that became an x86-only combine to
47583 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47584 // and-not ('andn').
47585 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47586 return SDValue();
47587
47588 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47589 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47590 if (!ShiftC || !AndC)
47591 return SDValue();
47592
47593 // If we can shrink the constant mask below 8 bits or 32 bits, then this
47594 // transform should reduce code size. It may also enable secondary transforms
47595 // from improved known-bits analysis or instruction selection.
47596 APInt MaskVal = AndC->getAPIntValue();
47597
47598 // If this can be matched by a zero extend, don't optimize.
47599 if (MaskVal.isMask()) {
47600 unsigned TO = MaskVal.countr_one();
47601 if (TO >= 8 && isPowerOf2_32(TO))
47602 return SDValue();
47603 }
47604
47605 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47606 unsigned OldMaskSize = MaskVal.getSignificantBits();
47607 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
47608 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47609 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47610 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47611 SDLoc DL(N);
47612 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47613 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47614 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47615 }
47616 return SDValue();
47617}
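// Editorial sketch (not part of the original source): the mask-shrinking
// rewrite above in constant form (for any X):
//   srl (and X, 0xFF00), 8  -->  and (srl X, 8), 0xFF
// 0xFF00 is not a low-bit mask (so the zero-extend bail-out does not apply) and
// the new mask fits in 8 bits, which is the code-size win the heuristic is after.
static_assert((0xFF00u >> 8) == 0xFFu, "AndC >> ShiftC gives the new mask");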
47618
47619 static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47620 const X86Subtarget &Subtarget) {
47621 unsigned Opcode = N->getOpcode();
47622 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47623
47624 SDLoc DL(N);
47625 EVT VT = N->getValueType(0);
47626 SDValue N0 = N->getOperand(0);
47627 SDValue N1 = N->getOperand(1);
47628 EVT SrcVT = N0.getValueType();
47629
47630 SDValue BC0 =
47631 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47632 SDValue BC1 =
47633 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47634
47635 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47636 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
47637 // truncation trees that help us avoid lane crossing shuffles.
47638 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47639 // TODO: We don't handle vXf64 shuffles yet.
47640 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47641 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47642 SmallVector<SDValue> ShuffleOps;
47643 SmallVector<int> ShuffleMask, ScaledMask;
47644 SDValue Vec = peekThroughBitcasts(BCSrc);
47645 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47646 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47647 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47648 // shuffle to a v4X64 width - we can probably relax this in the future.
47649 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47650 ShuffleOps[0].getValueType().is256BitVector() &&
47651 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47652 SDValue Lo, Hi;
47653 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47654 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47655 Lo = DAG.getBitcast(SrcVT, Lo);
47656 Hi = DAG.getBitcast(SrcVT, Hi);
47657 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47658 Res = DAG.getBitcast(ShufVT, Res);
47659 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47660 return DAG.getBitcast(VT, Res);
47661 }
47662 }
47663 }
47664 }
47665
47666 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47667 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47668 // If either/both ops are a shuffle that can scale to v2x64,
47669 // then see if we can perform this as a v4x32 post shuffle.
47670 SmallVector<SDValue> Ops0, Ops1;
47671 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47672 bool IsShuf0 =
47673 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47674 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47675 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47676 bool IsShuf1 =
47677 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47678 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47679 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47680 if (IsShuf0 || IsShuf1) {
47681 if (!IsShuf0) {
47682 Ops0.assign({BC0});
47683 ScaledMask0.assign({0, 1});
47684 }
47685 if (!IsShuf1) {
47686 Ops1.assign({BC1});
47687 ScaledMask1.assign({0, 1});
47688 }
47689
47690 SDValue LHS, RHS;
47691 int PostShuffle[4] = {-1, -1, -1, -1};
47692 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47693 if (M < 0)
47694 return true;
47695 Idx = M % 2;
47696 SDValue Src = Ops[M / 2];
47697 if (!LHS || LHS == Src) {
47698 LHS = Src;
47699 return true;
47700 }
47701 if (!RHS || RHS == Src) {
47702 Idx += 2;
47703 RHS = Src;
47704 return true;
47705 }
47706 return false;
47707 };
47708 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47709 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47710 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47711 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47712 LHS = DAG.getBitcast(SrcVT, LHS);
47713 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47714 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47715 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47716 Res = DAG.getBitcast(ShufVT, Res);
47717 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47718 return DAG.getBitcast(VT, Res);
47719 }
47720 }
47721 }
47722
47723 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47724 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47725 SmallVector<int> Mask0, Mask1;
47726 SmallVector<SDValue> Ops0, Ops1;
47727 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47728 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47729 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47730 !Ops0.empty() && !Ops1.empty() &&
47731 all_of(Ops0,
47732 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47733 all_of(Ops1,
47734 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47735 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47736 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47737 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47738 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47739 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47740 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47741 if ((Op00 == Op11) && (Op01 == Op10)) {
47742 std::swap(Op10, Op11);
47743 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47744 }
47745 if ((Op00 == Op10) && (Op01 == Op11)) {
47746 const int Map[4] = {0, 2, 1, 3};
47747 SmallVector<int, 4> ShuffleMask(
47748 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47749 Map[ScaledMask1[1]]});
47750 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47751 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47752 DAG.getBitcast(SrcVT, Op01));
47753 Res = DAG.getBitcast(ShufVT, Res);
47754 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47755 return DAG.getBitcast(VT, Res);
47756 }
47757 }
47758 }
47759
47760 return SDValue();
47761}
47762
47763 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
47764 TargetLowering::DAGCombinerInfo &DCI,
47765 const X86Subtarget &Subtarget) {
47766 unsigned Opcode = N->getOpcode();
47767 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
47768 "Unexpected pack opcode");
47769
47770 EVT VT = N->getValueType(0);
47771 SDValue N0 = N->getOperand(0);
47772 SDValue N1 = N->getOperand(1);
47773 unsigned NumDstElts = VT.getVectorNumElements();
47774 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
47775 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
47776 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
47777 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
47778 "Unexpected PACKSS/PACKUS input type");
47779
47780 bool IsSigned = (X86ISD::PACKSS == Opcode);
47781
47782 // Constant Folding.
47783 APInt UndefElts0, UndefElts1;
47784 SmallVector<APInt, 32> EltBits0, EltBits1;
47785 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
47786 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
47787 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0,
47788 /*AllowWholeUndefs*/ true,
47789 /*AllowPartialUndefs*/ true) &&
47790 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1,
47791 /*AllowWholeUndefs*/ true,
47792 /*AllowPartialUndefs*/ true)) {
47793 unsigned NumLanes = VT.getSizeInBits() / 128;
47794 unsigned NumSrcElts = NumDstElts / 2;
47795 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
47796 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
47797
47798 APInt Undefs(NumDstElts, 0);
47799 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
47800 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
47801 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
47802 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
47803 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
47804 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
47805
47806 if (UndefElts[SrcIdx]) {
47807 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
47808 continue;
47809 }
47810
47811 APInt &Val = EltBits[SrcIdx];
47812 if (IsSigned) {
47813 // PACKSS: Truncate signed value with signed saturation.
47814 // Source values less than dst minint are saturated to minint.
47815 // Source values greater than dst maxint are saturated to maxint.
47816 Val = Val.truncSSat(DstBitsPerElt);
47817 } else {
47818 // PACKUS: Truncate signed value with unsigned saturation.
47819 // Source values less than zero are saturated to zero.
47820 // Source values greater than dst maxuint are saturated to maxuint.
47821 // NOTE: This is different from APInt::truncUSat.
47822 if (Val.isIntN(DstBitsPerElt))
47823 Val = Val.trunc(DstBitsPerElt);
47824 else if (Val.isNegative())
47825 Val = APInt::getZero(DstBitsPerElt);
47826 else
47827 Val = APInt::getAllOnes(DstBitsPerElt);
47828 }
47829 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
47830 }
47831 }
47832
47833 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
47834 }
47835
47836 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
47837 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47838 return V;
47839
47840 // Try to fold PACKSS(NOT(X),NOT(Y)) -> NOT(PACKSS(X,Y)).
47841 // Currently limit this to allsignbits cases only.
47842 if (IsSigned &&
47843 (N0.isUndef() || DAG.ComputeNumSignBits(N0) == SrcBitsPerElt) &&
47844 (N1.isUndef() || DAG.ComputeNumSignBits(N1) == SrcBitsPerElt)) {
47845 SDValue Not0 = N0.isUndef() ? N0 : IsNOT(N0, DAG);
47846 SDValue Not1 = N1.isUndef() ? N1 : IsNOT(N1, DAG);
47847 if (Not0 && Not1) {
47848 SDLoc DL(N);
47849 MVT SrcVT = N0.getSimpleValueType();
47850 SDValue Pack =
47851 DAG.getNode(X86ISD::PACKSS, DL, VT, DAG.getBitcast(SrcVT, Not0),
47852 DAG.getBitcast(SrcVT, Not1));
47853 return DAG.getNOT(DL, Pack, VT);
47854 }
47855 }
47856
47857 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
47858 // truncate to create a larger truncate.
47859 if (Subtarget.hasAVX512() &&
47860 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
47861 N0.getOperand(0).getValueType() == MVT::v8i32) {
47862 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
47863 (!IsSigned &&
47864 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
47865 if (Subtarget.hasVLX())
47866 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
47867
47868 // Widen input to v16i32 so we can truncate that.
47869 SDLoc dl(N);
47870 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
47871 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
47872 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
47873 }
47874 }
47875
47876 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
47877 if (VT.is128BitVector()) {
47878 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47879 SDValue Src0, Src1;
47880 if (N0.getOpcode() == ExtOpc &&
47881 N0.getOperand(0).getValueType().is64BitVector() &&
47882 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47883 Src0 = N0.getOperand(0);
47884 }
47885 if (N1.getOpcode() == ExtOpc &&
47886 N1.getOperand(0).getValueType().is64BitVector() &&
47887 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
47888 Src1 = N1.getOperand(0);
47889 }
47890 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
47891 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
47892 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
47893 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
47894 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
47895 }
47896
47897 // Try again with pack(*_extend_vector_inreg, undef).
47898 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
47899 : ISD::ZERO_EXTEND_VECTOR_INREG;
47900 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
47901 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
47902 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
47903 DAG);
47904 }
47905
47906 // Attempt to combine as shuffle.
47907 SDValue Op(N, 0);
47908 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
47909 return Res;
47910
47911 return SDValue();
47912}
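// Editorial sketch (not part of the original source): the constant-folding
// saturation rules above, spelled out for a single i16 -> i8 element:
//   PACKSS:  300 -> 127,  -200 -> -128 (signed saturation)
//   PACKUS:  300 -> 255,  -200 -> 0    (unsigned saturation of a signed value)
static_assert((300 > 127 ? 127 : 300) == 127 && (-200 < -128 ? -128 : -200) == -128,
              "PACKSS clamps to [-128, 127]");
static_assert((300 > 255 ? 255 : 300) == 255 && (-200 < 0 ? 0 : -200) == 0,
              "PACKUS clamps to [0, 255]");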
47913
47914 static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
47915 TargetLowering::DAGCombinerInfo &DCI,
47916 const X86Subtarget &Subtarget) {
47917 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
47918 X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
47919 "Unexpected horizontal add/sub opcode");
47920
47921 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
47922 MVT VT = N->getSimpleValueType(0);
47923 SDValue LHS = N->getOperand(0);
47924 SDValue RHS = N->getOperand(1);
47925
47926 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
47927 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
47928 LHS.getOpcode() == RHS.getOpcode() &&
47929 LHS.getValueType() == RHS.getValueType() &&
47930 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
47931 SDValue LHS0 = LHS.getOperand(0);
47932 SDValue LHS1 = LHS.getOperand(1);
47933 SDValue RHS0 = RHS.getOperand(0);
47934 SDValue RHS1 = RHS.getOperand(1);
47935 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
47936 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
47937 SDLoc DL(N);
47938 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
47939 LHS0.isUndef() ? LHS1 : LHS0,
47940 RHS0.isUndef() ? RHS1 : RHS0);
47941 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
47942 Res = DAG.getBitcast(ShufVT, Res);
47943 SDValue NewLHS =
47944 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47945 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
47946 SDValue NewRHS =
47947 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
47948 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
47949 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
47950 DAG.getBitcast(VT, NewRHS));
47951 }
47952 }
47953 }
47954
47955 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
47956 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
47957 return V;
47958
47959 return SDValue();
47960}
47961
47962 static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
47963 TargetLowering::DAGCombinerInfo &DCI,
47964 const X86Subtarget &Subtarget) {
47965 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
47966 X86ISD::VSRL == N->getOpcode()) &&
47967 "Unexpected shift opcode");
47968 EVT VT = N->getValueType(0);
47969 SDValue N0 = N->getOperand(0);
47970 SDValue N1 = N->getOperand(1);
47971
47972 // Shift zero -> zero.
47973 if (ISD::isBuildVectorAllZeros(N0.getNode()))
47974 return DAG.getConstant(0, SDLoc(N), VT);
47975
47976 // Detect constant shift amounts.
47977 APInt UndefElts;
47978 SmallVector<APInt, 32> EltBits;
47979 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits,
47980 /*AllowWholeUndefs*/ true,
47981 /*AllowPartialUndefs*/ false)) {
47982 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
47983 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
47984 EltBits[0].getZExtValue(), DAG);
47985 }
47986
47987 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47988 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
47989 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
47990 return SDValue(N, 0);
47991
47992 return SDValue();
47993}
47994
47995 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
47996 TargetLowering::DAGCombinerInfo &DCI,
47997 const X86Subtarget &Subtarget) {
47998 unsigned Opcode = N->getOpcode();
47999 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48000 X86ISD::VSRLI == Opcode) &&
48001 "Unexpected shift opcode");
48002 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48003 EVT VT = N->getValueType(0);
48004 SDValue N0 = N->getOperand(0);
48005 SDValue N1 = N->getOperand(1);
48006 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48007 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48008 "Unexpected value type");
48009 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
48010
48011 // (shift undef, X) -> 0
48012 if (N0.isUndef())
48013 return DAG.getConstant(0, SDLoc(N), VT);
48014
48015 // Out of range logical bit shifts are guaranteed to be zero.
48016 // Out of range arithmetic bit shifts splat the sign bit.
48017 unsigned ShiftVal = N->getConstantOperandVal(1);
48018 if (ShiftVal >= NumBitsPerElt) {
48019 if (LogicalShift)
48020 return DAG.getConstant(0, SDLoc(N), VT);
48021 ShiftVal = NumBitsPerElt - 1;
48022 }
48023
48024 // (shift X, 0) -> X
48025 if (!ShiftVal)
48026 return N0;
48027
48028 // (shift 0, C) -> 0
48029 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48030 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48031 // result are all zeros, not undef.
48032 return DAG.getConstant(0, SDLoc(N), VT);
48033
48034 // (VSRAI -1, C) -> -1
48035 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48036 // N0 is all ones or undef. We guarantee that the bits shifted into the
48037 // result are all ones, not undef.
48038 return DAG.getConstant(-1, SDLoc(N), VT);
48039
48040 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48041 unsigned NewShiftVal = Amt0 + Amt1;
48042 if (NewShiftVal >= NumBitsPerElt) {
48043 // Out of range logical bit shifts are guaranteed to be zero.
48044 // Out of range arithmetic bit shifts splat the sign bit.
48045 if (LogicalShift)
48046 return DAG.getConstant(0, SDLoc(N), VT);
48047 NewShiftVal = NumBitsPerElt - 1;
48048 }
48049 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48050 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48051 };
48052
48053 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48054 if (Opcode == N0.getOpcode())
48055 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48056
48057 // (shl (add X, X), C) -> (shl X, (C + 1))
48058 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48059 N0.getOperand(0) == N0.getOperand(1))
48060 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48061
48062 // We can decode 'whole byte' logical bit shifts as shuffles.
48063 if (LogicalShift && (ShiftVal % 8) == 0) {
48064 SDValue Op(N, 0);
48065 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48066 return Res;
48067 }
48068
48069 // Attempt to detect an expanded vXi64 SIGN_EXTEND_INREG vXi1 pattern, and
48070 // convert to a splatted v2Xi32 SIGN_EXTEND_INREG pattern:
48071 // psrad(pshufd(psllq(X,63),1,1,3,3),31) ->
48072 // pshufd(psrad(pslld(X,31),31),0,0,2,2).
48073 if (Opcode == X86ISD::VSRAI && NumBitsPerElt == 32 && ShiftVal == 31 &&
48074 N0.getOpcode() == X86ISD::PSHUFD &&
48075 N0.getConstantOperandVal(1) == getV4X86ShuffleImm({1, 1, 3, 3}) &&
48076 N0->hasOneUse()) {
48077 SDValue BC = peekThroughOneUseBitcasts(N0.getOperand(0));
48078 if (BC.getOpcode() == X86ISD::VSHLI &&
48079 BC.getScalarValueSizeInBits() == 64 &&
48080 BC.getConstantOperandVal(1) == 63) {
48081 SDLoc DL(N);
48082 SDValue Src = BC.getOperand(0);
48083 Src = DAG.getBitcast(VT, Src);
48084 Src = DAG.getNode(X86ISD::PSHUFD, DL, VT, Src,
48085 getV4X86ShuffleImm8ForMask({0, 0, 2, 2}, DL, DAG));
48086 Src = DAG.getNode(X86ISD::VSHLI, DL, VT, Src, N1);
48087 Src = DAG.getNode(X86ISD::VSRAI, DL, VT, Src, N1);
48088 return Src;
48089 }
48090 }
48091
48092 auto TryConstantFold = [&](SDValue V) {
48093 APInt UndefElts;
48094 SmallVector<APInt, 32> EltBits;
48095 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits,
48096 /*AllowWholeUndefs*/ true,
48097 /*AllowPartialUndefs*/ true))
48098 return SDValue();
48099 assert(EltBits.size() == VT.getVectorNumElements() &&
48100 "Unexpected shift value type");
48101 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48102 // created an undef input due to no input bits being demanded, but the user
48103 // still expects 0 in the other bits.
48104 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48105 APInt &Elt = EltBits[i];
48106 if (UndefElts[i])
48107 Elt = 0;
48108 else if (X86ISD::VSHLI == Opcode)
48109 Elt <<= ShiftVal;
48110 else if (X86ISD::VSRAI == Opcode)
48111 Elt.ashrInPlace(ShiftVal);
48112 else
48113 Elt.lshrInPlace(ShiftVal);
48114 }
48115 // Reset undef elements since they were zeroed above.
48116 UndefElts = 0;
48117 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48118 };
48119
48120 // Constant Folding.
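// For example, VSRLI(<i32 8, 12, -1, undef>, 2) constant folds to
// <i32 2, 3, 0x3FFFFFFF, 0>, with the undef lane forced to zero as noted above.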
48121 if (N->isOnlyUserOf(N0.getNode())) {
48122 if (SDValue C = TryConstantFold(N0))
48123 return C;
48124
48125 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
48126 // Don't break NOT patterns.
48127 SDValue BC = peekThroughOneUseBitcasts(N0);
48128 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
48129 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
48130 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
48131 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
48132 SDLoc DL(N);
48133 SDValue LHS = DAG.getNode(Opcode, DL, VT,
48134 DAG.getBitcast(VT, BC.getOperand(0)), N1);
48135 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
48136 }
48137 }
48138 }
48139
48140 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48141 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48142 DCI))
48143 return SDValue(N, 0);
48144
48145 return SDValue();
48146}
48147
48148 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48149 TargetLowering::DAGCombinerInfo &DCI,
48150 const X86Subtarget &Subtarget) {
48151 EVT VT = N->getValueType(0);
48152 unsigned Opcode = N->getOpcode();
48153 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48154 (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48155 Opcode == ISD::INSERT_VECTOR_ELT) &&
48156 "Unexpected vector insertion");
48157
48158 SDValue Vec = N->getOperand(0);
48159 SDValue Scl = N->getOperand(1);
48160 SDValue Idx = N->getOperand(2);
48161
48162 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48163 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
48164 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
48165
48166 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48167 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48169 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48170 APInt::getAllOnes(NumBitsPerElt), DCI))
48171 return SDValue(N, 0);
48172 }
48173
48174 // Attempt to combine insertion patterns to a shuffle.
48175 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48176 SDValue Op(N, 0);
48177 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48178 return Res;
48179 }
48180
48181 return SDValue();
48182}
48183
48184/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48185/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48186/// OR -> CMPNEQSS.
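// An ordered FP equality test typically reaches here as
// AND(SETCC_E(FCMP X,Y), SETCC_NP(FCMP X,Y)); matching both setccs against the
// same FCMP lets us emit a single CMPEQSS/CMPEQSD mask and test its low bit.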
48187 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48188 TargetLowering::DAGCombinerInfo &DCI,
48189 const X86Subtarget &Subtarget) {
48190 unsigned opcode;
48191
48192 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48193 // we're requiring SSE2 for both.
48194 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48195 SDValue N0 = N->getOperand(0);
48196 SDValue N1 = N->getOperand(1);
48197 SDValue CMP0 = N0.getOperand(1);
48198 SDValue CMP1 = N1.getOperand(1);
48199 SDLoc DL(N);
48200
48201 // The SETCCs should both refer to the same CMP.
48202 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48203 return SDValue();
48204
48205 SDValue CMP00 = CMP0->getOperand(0);
48206 SDValue CMP01 = CMP0->getOperand(1);
48207 EVT VT = CMP00.getValueType();
48208
48209 if (VT == MVT::f32 || VT == MVT::f64 ||
48210 (VT == MVT::f16 && Subtarget.hasFP16())) {
48211 bool ExpectingFlags = false;
48212 // Check for any users that want flags:
48213 for (const SDNode *U : N->uses()) {
48214 if (ExpectingFlags)
48215 break;
48216
48217 switch (U->getOpcode()) {
48218 default:
48219 case ISD::BR_CC:
48220 case ISD::BRCOND:
48221 case ISD::SELECT:
48222 ExpectingFlags = true;
48223 break;
48224 case ISD::CopyToReg:
48225 case ISD::SIGN_EXTEND:
48226 case ISD::ZERO_EXTEND:
48227 case ISD::ANY_EXTEND:
48228 break;
48229 }
48230 }
48231
48232 if (!ExpectingFlags) {
48233 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48234 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48235
48236 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48237 X86::CondCode tmp = cc0;
48238 cc0 = cc1;
48239 cc1 = tmp;
48240 }
48241
48242 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48243 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48244 // FIXME: need symbolic constants for these magic numbers.
48245 // See X86ATTInstPrinter.cpp:printSSECC().
48246 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48247 if (Subtarget.hasAVX512()) {
48248 SDValue FSetCC =
48249 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48250 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48251 // Need to fill with zeros to ensure the bitcast will produce zeroes
48252 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48253 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48254 DAG.getConstant(0, DL, MVT::v16i1),
48255 FSetCC, DAG.getIntPtrConstant(0, DL));
48256 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48257 N->getSimpleValueType(0));
48258 }
48259 SDValue OnesOrZeroesF =
48260 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48261 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48262
48263 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48264 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48265
48266 if (is64BitFP && !Subtarget.is64Bit()) {
48267 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48268 // 64-bit integer, since that's not a legal type. Since
48269 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48270 // bits, but can do this little dance to extract the lowest 32 bits
48271 // and work with those going forward.
48272 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48273 OnesOrZeroesF);
48274 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48275 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48276 Vector32, DAG.getIntPtrConstant(0, DL));
48277 IntVT = MVT::i32;
48278 }
48279
48280 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48281 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48282 DAG.getConstant(1, DL, IntVT));
48283 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48284 ANDed);
48285 return OneBitOfTruth;
48286 }
48287 }
48288 }
48289 }
48290 return SDValue();
48291}
48292
48293/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48294 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48295 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48296
48297 MVT VT = N->getSimpleValueType(0);
48298 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48299 return SDValue();
48300
48301 SDValue X, Y;
48302 SDValue N0 = N->getOperand(0);
48303 SDValue N1 = N->getOperand(1);
48304
48305 if (SDValue Not = IsNOT(N0, DAG)) {
48306 X = Not;
48307 Y = N1;
48308 } else if (SDValue Not = IsNOT(N1, DAG)) {
48309 X = Not;
48310 Y = N0;
48311 } else
48312 return SDValue();
48313
48314 X = DAG.getBitcast(VT, X);
48315 Y = DAG.getBitcast(VT, Y);
48316 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48317}
48318
48319/// Try to fold:
48320/// and (vector_shuffle<Z,...,Z>
48321/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48322/// ->
48323/// andnp (vector_shuffle<Z,...,Z>
48324/// (insert_vector_elt undef, X, Z), undef), Y
48325 static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48326 const X86Subtarget &Subtarget) {
48327 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48328
48329 EVT VT = N->getValueType(0);
48330 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite the
48331 // original value and require extra moves.
48332 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48333 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48334 return SDValue();
48335
48336 auto GetNot = [&DAG](SDValue V) {
48337 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48338 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48339 // end-users are ISD::AND, including cases such as
48340 // (and(extract_vector_element(SVN), Y)).
48341 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48342 !SVN->getOperand(1).isUndef()) {
48343 return SDValue();
48344 }
48345 SDValue IVEN = SVN->getOperand(0);
48346 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48347 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48348 return SDValue();
48349 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48350 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48351 return SDValue();
48352 SDValue Src = IVEN.getOperand(1);
48353 if (SDValue Not = IsNOT(Src, DAG)) {
48354 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48355 SDValue NotIVEN =
48356 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48357 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48358 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48359 SVN->getOperand(1), SVN->getMask());
48360 }
48361 return SDValue();
48362 };
48363
48364 SDValue X, Y;
48365 SDValue N0 = N->getOperand(0);
48366 SDValue N1 = N->getOperand(1);
48367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48368
48369 if (SDValue Not = GetNot(N0)) {
48370 X = Not;
48371 Y = N1;
48372 } else if (SDValue Not = GetNot(N1)) {
48373 X = Not;
48374 Y = N0;
48375 } else
48376 return SDValue();
48377
48378 X = DAG.getBitcast(VT, X);
48379 Y = DAG.getBitcast(VT, Y);
48380 SDLoc DL(N);
48381
48382 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48383 // AVX2.
48384 if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
48385 TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
48386 SDValue LoX, HiX;
48387 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48388 SDValue LoY, HiY;
48389 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48390 EVT SplitVT = LoX.getValueType();
48391 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48392 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48393 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48394 }
48395
48396 if (TLI.isTypeLegal(VT))
48397 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48398
48399 return SDValue();
48400}
48401
48402// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48403// logical operations, like in the example below.
48404// or (and (truncate x, truncate y)),
48405// (xor (truncate z, build_vector (constants)))
48406// Given a target type \p VT, we generate
48407// or (and x, y), (xor z, zext(build_vector (constants)))
48408 // given x, y and z are of type \p VT. We can do so if each operand is either
48409 // a truncate from VT, recursively promotable, or (for the right-hand operand)
48410 // a foldable vector of constants.
48411 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
48412 SelectionDAG &DAG, unsigned Depth) {
48413 // Limit recursion to avoid excessive compile times.
48414 if (Depth >= SelectionDAG::MaxRecursionDepth)
48415 return SDValue();
48416
48417 if (!ISD::isBitwiseLogicOp(N.getOpcode()))
48418 return SDValue();
48419
48420 SDValue N0 = N.getOperand(0);
48421 SDValue N1 = N.getOperand(1);
48422
48423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48424 if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
48425 return SDValue();
48426
48427 if (SDValue NN0 = PromoteMaskArithmetic(N0, DL, VT, DAG, Depth + 1))
48428 N0 = NN0;
48429 else {
48430 // The left side has to be a trunc.
48431 if (N0.getOpcode() != ISD::TRUNCATE)
48432 return SDValue();
48433
48434 // The type of the truncated inputs.
48435 if (N0.getOperand(0).getValueType() != VT)
48436 return SDValue();
48437
48438 N0 = N0.getOperand(0);
48439 }
48440
48441 if (SDValue NN1 = PromoteMaskArithmetic(N1, DL, VT, DAG, Depth + 1))
48442 N1 = NN1;
48443 else {
48444 // The right side has to be a 'trunc' or a (foldable) constant.
48445 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48446 N1.getOperand(0).getValueType() == VT;
48447 if (RHSTrunc)
48448 N1 = N1.getOperand(0);
48449 else if (SDValue Cst =
48450 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
48451 N1 = Cst;
48452 else
48453 return SDValue();
48454 }
48455
48456 return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
48457}
48458
48459// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48460// register. In most cases we actually compare or select YMM-sized registers
48461// and mixing the two types creates horrible code. This method optimizes
48462// some of the transition sequences.
48463// Even with AVX-512 this is still useful for removing casts around logical
48464// operations on vXi1 mask types.
48465 static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
48466 SelectionDAG &DAG,
48467 const X86Subtarget &Subtarget) {
48468 EVT VT = N.getValueType();
48469 assert(VT.isVector() && "Expected vector type");
48470 assert((N.getOpcode() == ISD::ANY_EXTEND ||
48471 N.getOpcode() == ISD::ZERO_EXTEND ||
48472 N.getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48473
48474 SDValue Narrow = N.getOperand(0);
48475 EVT NarrowVT = Narrow.getValueType();
48476
48477 // Generate the wide operation.
48478 SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, 0);
48479 if (!Op)
48480 return SDValue();
48481 switch (N.getOpcode()) {
48482 default: llvm_unreachable("Unexpected opcode");
48483 case ISD::ANY_EXTEND:
48484 return Op;
48485 case ISD::ZERO_EXTEND:
48486 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48487 case ISD::SIGN_EXTEND:
48488 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48489 Op, DAG.getValueType(NarrowVT));
48490 }
48491}
48492
48493static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48494 unsigned FPOpcode;
48495 switch (Opcode) {
48496 // clang-format off
48497 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48498 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48499 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48500 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48501 // clang-format on
48502 }
48503 return FPOpcode;
48504}
48505
48506/// If both input operands of a logic op are being cast from floating-point
48507/// types or FP compares, try to convert this into a floating-point logic node
48508/// to avoid unnecessary moves from SSE to integer registers.
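// For example, (xor (bitcast f32 A to i32), (bitcast f32 B to i32)) can be
// rewritten as (bitcast (FXOR A, B) to i32), keeping the value in an SSE register.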
48509 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48510 TargetLowering::DAGCombinerInfo &DCI,
48511 const X86Subtarget &Subtarget) {
48512 EVT VT = N->getValueType(0);
48513 SDValue N0 = N->getOperand(0);
48514 SDValue N1 = N->getOperand(1);
48515 SDLoc DL(N);
48516
48517 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48518 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48519 return SDValue();
48520
48521 SDValue N00 = N0.getOperand(0);
48522 SDValue N10 = N1.getOperand(0);
48523 EVT N00Type = N00.getValueType();
48524 EVT N10Type = N10.getValueType();
48525
48526 // Ensure that both types are the same and are legal scalar fp types.
48527 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48528 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48529 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48530 return SDValue();
48531
48532 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48533 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48534 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48535 return DAG.getBitcast(VT, FPLogic);
48536 }
48537
48538 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48539 !N1.hasOneUse())
48540 return SDValue();
48541
48542 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48543 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48544
48545 // The vector ISA for FP predicates is incomplete before AVX, so converting
48546 // COMIS* to CMPS* may not be a win before AVX.
48547 if (!Subtarget.hasAVX() &&
48548 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48549 return SDValue();
48550
48551 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48552 // and vector logic:
48553 // logic (setcc N00, N01), (setcc N10, N11) -->
48554 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48555 unsigned NumElts = 128 / N00Type.getSizeInBits();
48556 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48557 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48558 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48559 SDValue N01 = N0.getOperand(1);
48560 SDValue N11 = N1.getOperand(1);
48561 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48562 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48563 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48564 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48565 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48566 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48567 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48568 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48569}
48570
48571// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48572// to reduce XMM->GPR traffic.
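// For example, AND(MOVMSK(v4f32 X), MOVMSK(v4f32 Y)) becomes
// MOVMSK(FAND(X, Y)), keeping the logic in the vector domain.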
48573 static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48574 unsigned Opc = N->getOpcode();
48575 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48576 "Unexpected bit opcode");
48577
48578 SDValue N0 = N->getOperand(0);
48579 SDValue N1 = N->getOperand(1);
48580
48581 // Both operands must be single use MOVMSK.
48582 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48583 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48584 return SDValue();
48585
48586 SDValue Vec0 = N0.getOperand(0);
48587 SDValue Vec1 = N1.getOperand(0);
48588 EVT VecVT0 = Vec0.getValueType();
48589 EVT VecVT1 = Vec1.getValueType();
48590
48591 // Both MOVMSK operands must be from vectors of the same size and same element
48592 // size, but it's OK if one is floating-point and the other is integer.
48593 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48594 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48595 return SDValue();
48596
48597 SDLoc DL(N);
48598 unsigned VecOpc =
48599 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48600 SDValue Result =
48601 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48602 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48603}
48604
48605// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48606// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48607// handles in InstCombine.
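// For example, OR(VSHLI(X, 5), VSHLI(Y, 5)) becomes VSHLI(OR(X, Y), 5),
// saving one shift when both operands use the same immediate amount.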
48608 static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48609 unsigned Opc = N->getOpcode();
48610 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48611 "Unexpected bit opcode");
48612
48613 SDValue N0 = N->getOperand(0);
48614 SDValue N1 = N->getOperand(1);
48615 EVT VT = N->getValueType(0);
48616
48617 // Both operands must be single use.
48618 if (!N0.hasOneUse() || !N1.hasOneUse())
48619 return SDValue();
48620
48621 // Search for matching shifts.
48622 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48623 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48624 
48625 unsigned BCOpc = BC0.getOpcode();
48626 EVT BCVT = BC0.getValueType();
48627 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48628 return SDValue();
48629
48630 switch (BCOpc) {
48631 case X86ISD::VSHLI:
48632 case X86ISD::VSRLI:
48633 case X86ISD::VSRAI: {
48634 if (BC0.getOperand(1) != BC1.getOperand(1))
48635 return SDValue();
48636
48637 SDLoc DL(N);
48638 SDValue BitOp =
48639 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48640 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48641 return DAG.getBitcast(VT, Shift);
48642 }
48643 }
48644
48645 return SDValue();
48646}
48647
48648// Attempt to fold:
48649// BITOP(PACKSS(X,Z),PACKSS(Y,W)) --> PACKSS(BITOP(X,Y),BITOP(Z,W)).
48650 // TODO: Add PACKUS handling.
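// For example, AND(PACKSS(X,Z), PACKSS(Y,W)) becomes PACKSS(AND(X,Y), AND(Z,W));
// this is only safe here because every pack input is known to be all-sign-bits,
// so the saturation never changes the result.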
48651 static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG) {
48652 unsigned Opc = N->getOpcode();
48653 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48654 "Unexpected bit opcode");
48655
48656 SDValue N0 = N->getOperand(0);
48657 SDValue N1 = N->getOperand(1);
48658 EVT VT = N->getValueType(0);
48659
48660 // Both operands must be single use.
48661 if (!N0.hasOneUse() || !N1.hasOneUse())
48662 return SDValue();
48663
48664 // Search for matching packs.
48665 N0 = peekThroughOneUseBitcasts(N0);
48666 N1 = peekThroughOneUseBitcasts(N1);
48667 
48668 if (N0.getOpcode() != X86ISD::PACKSS || N1.getOpcode() != X86ISD::PACKSS)
48669 return SDValue();
48670
48671 MVT DstVT = N0.getSimpleValueType();
48672 if (DstVT != N1.getSimpleValueType())
48673 return SDValue();
48674
48675 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
48676 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
48677
48678 // Limit to allsignbits packing.
48679 if (DAG.ComputeNumSignBits(N0.getOperand(0)) != NumSrcBits ||
48680 DAG.ComputeNumSignBits(N0.getOperand(1)) != NumSrcBits ||
48681 DAG.ComputeNumSignBits(N1.getOperand(0)) != NumSrcBits ||
48682 DAG.ComputeNumSignBits(N1.getOperand(1)) != NumSrcBits)
48683 return SDValue();
48684
48685 SDLoc DL(N);
48686 SDValue LHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(0), N1.getOperand(0));
48687 SDValue RHS = DAG.getNode(Opc, DL, SrcVT, N0.getOperand(1), N1.getOperand(1));
48688 return DAG.getBitcast(VT, DAG.getNode(X86ISD::PACKSS, DL, DstVT, LHS, RHS));
48689}
48690
48691 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
48692 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48693 /// with a shift-right to eliminate loading the vector constant mask value.
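// For example, with v4i32 inputs: AND(X, <1,1,1,1>), where each element of X is
// known to be all-ones or all-zeros, becomes VSRLI(X, 31), avoiding the load of
// the <1,1,1,1> constant.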
48694 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48695 const X86Subtarget &Subtarget) {
48696 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48697 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48698 EVT VT = Op0.getValueType();
48699 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48700 return SDValue();
48701
48702 // Try to convert an "is positive" signbit masking operation into arithmetic
48703 // shift and "andn". This saves a materialization of a -1 vector constant.
48704 // The "is negative" variant should be handled more generally because it only
48705 // requires "and" rather than "andn":
48706 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48707 //
48708 // This is limited to the original type to avoid producing even more bitcasts.
48709 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48710 // will be profitable.
48711 if (N->getValueType(0) == VT &&
48712 supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) {
48713 SDValue X, Y;
48714 if (Op1.getOpcode() == X86ISD::PCMPGT &&
48715 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
48716 X = Op1.getOperand(0);
48717 Y = Op0;
48718 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
48719 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
48720 X = Op0.getOperand(0);
48721 Y = Op1;
48722 }
48723 if (X && Y) {
48724 SDLoc DL(N);
48725 SDValue Sra =
48726 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48727 VT.getScalarSizeInBits() - 1, DAG);
48728 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48729 }
48730 }
48731
48732 APInt SplatVal;
48733 if (!X86::isConstantSplat(Op1, SplatVal, false) || !SplatVal.isMask())
48734 return SDValue();
48735
48736 // Don't prevent creation of ANDN.
48737 if (isBitwiseNot(Op0))
48738 return SDValue();
48739
48740 if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL))
48741 return SDValue();
48742
48743 unsigned EltBitWidth = VT.getScalarSizeInBits();
48744 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48745 return SDValue();
48746
48747 SDLoc DL(N);
48748 unsigned ShiftVal = SplatVal.countr_one();
48749 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48750 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48751 return DAG.getBitcast(N->getValueType(0), Shift);
48752}
48753
48754// Get the index node from the lowered DAG of a GEP IR instruction with one
48755// indexing dimension.
48756 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48757 if (Ld->isIndexed())
48758 return SDValue();
48759
48760 SDValue Base = Ld->getBasePtr();
48761
48762 if (Base.getOpcode() != ISD::ADD)
48763 return SDValue();
48764
48765 SDValue ShiftedIndex = Base.getOperand(0);
48766
48767 if (ShiftedIndex.getOpcode() != ISD::SHL)
48768 return SDValue();
48769
48770 return ShiftedIndex.getOperand(0);
48771
48772}
48773
48774static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48775 return Subtarget.hasBMI2() &&
48776 (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
48777}
48778
48779 // This function recognizes cases where the X86 BZHI instruction can replace an
48780 // 'and-load' sequence.
48781 // When an integer value is loaded from an array of constants defined as
48782 // follows:
48783 //
48784 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48785 //
48786 // and a bitwise AND is then applied to the loaded value and another input,
48787 // the result is equivalent to performing BZHI (zero high bits) on that input,
48788 // using the same index as the load.
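// For example, 'array[idx] & x' with the table above becomes
// (and (srl 0xFFFFFFFF, (sub 32, idx)), x), which is then selected as a single
// BZHI.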
48789 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48790 const X86Subtarget &Subtarget) {
48791 MVT VT = Node->getSimpleValueType(0);
48792 SDLoc dl(Node);
48793
48794 // Check if subtarget has BZHI instruction for the node's type
48795 if (!hasBZHI(Subtarget, VT))
48796 return SDValue();
48797
48798 // Try matching the pattern for both operands.
48799 for (unsigned i = 0; i < 2; i++) {
48800 SDValue N = Node->getOperand(i);
48801 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48802
48803 // Bail out if the operand is not a load instruction.
48804 if (!Ld)
48805 return SDValue();
48806
48807 const Value *MemOp = Ld->getMemOperand()->getValue();
48808
48809 if (!MemOp)
48810 return SDValue();
48811
48812 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48813 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48814 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48815
48816 Constant *Init = GV->getInitializer();
48817 Type *Ty = Init->getType();
48818 if (!isa<ConstantDataArray>(Init) ||
48819 !Ty->getArrayElementType()->isIntegerTy() ||
48820 Ty->getArrayElementType()->getScalarSizeInBits() !=
48821 VT.getSizeInBits() ||
48822 Ty->getArrayNumElements() >
48823 Ty->getArrayElementType()->getScalarSizeInBits())
48824 continue;
48825
48826 // Check if the array's constant elements are suitable for our case.
48827 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48828 bool ConstantsMatch = true;
48829 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48830 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48831 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48832 ConstantsMatch = false;
48833 break;
48834 }
48835 }
48836 if (!ConstantsMatch)
48837 continue;
48838
48839 // Do the transformation (for a 32-bit type):
48840 // (and (load arr[idx]), inp) ->
48841 // (and (srl 0xFFFFFFFF, (sub 32, idx)), inp),
48842 // which will then be selected as a single BZHI instruction.
48843 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48844 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48845
48846 // Get the Node which indexes into the array.
48847 SDValue Index = getIndexFromUnindexedLoad(Ld);
48848 if (!Index)
48849 return SDValue();
48850 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48851
48852 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48853 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48854
48855 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48856 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48857
48858 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48859 }
48860 }
48861 }
48862 }
48863 return SDValue();
48864}
48865
48866 // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef))), C)
48867 // where C is a mask containing the same number of bits as the setcc and
48868 // where the setcc will freely zero the upper bits of the k-register. We can
48869 // replace the undef in the concat with 0s and remove the AND. This mainly
48870 // helps with v2i1/v4i1 setcc being cast to scalar.
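// For example, (i8 (bitcast (v8i1 concat(v2i1 setcc, undef, undef, undef)))) & 3
// becomes (i8 (bitcast (v8i1 concat(v2i1 setcc, zero, zero, zero)))) and the
// masking AND is removed.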
48871 static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
48872 const X86Subtarget &Subtarget) {
48873 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
48874
48875 EVT VT = N->getValueType(0);
48876
48877 // Make sure this is an AND with constant. We will check the value of the
48878 // constant later.
48879 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
48880 if (!C1)
48881 return SDValue();
48882
48883 // This is implied by the ConstantSDNode.
48884 assert(!VT.isVector() && "Expected scalar VT!");
48885
48886 SDValue Src = N->getOperand(0);
48887 if (!Src.hasOneUse())
48888 return SDValue();
48889
48890 // (Optionally) peek through any_extend().
48891 if (Src.getOpcode() == ISD::ANY_EXTEND) {
48892 if (!Src.getOperand(0).hasOneUse())
48893 return SDValue();
48894 Src = Src.getOperand(0);
48895 }
48896
48897 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
48898 return SDValue();
48899
48900 Src = Src.getOperand(0);
48901 EVT SrcVT = Src.getValueType();
48902
48903 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48904 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
48905 !TLI.isTypeLegal(SrcVT))
48906 return SDValue();
48907
48908 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
48909 return SDValue();
48910
48911 // We only care about the first subvector of the concat, we expect the
48912 // other subvectors to be ignored due to the AND if we make the change.
48913 SDValue SubVec = Src.getOperand(0);
48914 EVT SubVecVT = SubVec.getValueType();
48915
48916 // The RHS of the AND should be a mask with as many bits as SubVec.
48917 if (!TLI.isTypeLegal(SubVecVT) ||
48918 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
48919 return SDValue();
48920
48921 // First subvector should be a setcc with a legal result type or a
48922 // AND containing at least one setcc with a legal result type.
48923 auto IsLegalSetCC = [&](SDValue V) {
48924 if (V.getOpcode() != ISD::SETCC)
48925 return false;
48926 EVT SetccVT = V.getOperand(0).getValueType();
48927 if (!TLI.isTypeLegal(SetccVT) ||
48928 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
48929 return false;
48930 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
48931 return false;
48932 return true;
48933 };
48934 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
48935 (IsLegalSetCC(SubVec.getOperand(0)) ||
48936 IsLegalSetCC(SubVec.getOperand(1))))))
48937 return SDValue();
48938
48939 // We passed all the checks. Rebuild the concat_vectors with zeroes
48940 // and cast it back to VT.
48941 SDLoc dl(N);
48942 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
48943 DAG.getConstant(0, dl, SubVecVT));
48944 Ops[0] = SubVec;
48945 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
48946 Ops);
48947 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
48948 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
48949}
48950
48951static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
48952 SDValue OpMustEq, SDValue Op, unsigned Depth) {
48953 // We don't want to go crazy with the recursion here. This isn't a super
48954 // important optimization.
48955 static constexpr unsigned kMaxDepth = 2;
48956
48957 // Only do this re-ordering if op has one use.
48958 if (!Op.hasOneUse())
48959 return SDValue();
48960
48961 SDLoc DL(Op);
48962 // If we hit another associative op, recurse further.
48963 if (Op.getOpcode() == Opc) {
48964 // Done recursing.
48965 if (Depth++ >= kMaxDepth)
48966 return SDValue();
48967
48968 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
48969 if (SDValue R =
48970 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
48971 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
48972 Op.getOperand(1 - OpIdx));
48973
48974 } else if (Op.getOpcode() == ISD::SUB) {
48975 if (Opc == ISD::AND) {
48976 // BLSI: (and x, (sub 0, x))
48977 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
48978 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48979 }
48980 // Opc must be ISD::AND or ISD::XOR
48981 // BLSR: (and x, (sub x, 1))
48982 // BLSMSK: (xor x, (sub x, 1))
48983 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48984 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48985
48986 } else if (Op.getOpcode() == ISD::ADD) {
48987 // Opc must be ISD::AND or ISD::XOR
48988 // BLSR: (and x, (add x, -1))
48989 // BLSMSK: (xor x, (add x, -1))
48990 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
48991 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
48992 }
48993 return SDValue();
48994}
48995
48996 static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
48997 const X86Subtarget &Subtarget) {
48998 EVT VT = N->getValueType(0);
48999 // Make sure this node is a candidate for BMI instructions.
49000 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49001 (VT != MVT::i32 && VT != MVT::i64))
49002 return SDValue();
49003
49004 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49005
49006 // Try and match LHS and RHS.
49007 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49008 if (SDValue OpMatch =
49009 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49010 N->getOperand(1 - OpIdx), 0))
49011 return OpMatch;
49012 return SDValue();
49013}
49014
49015 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49016 TargetLowering::DAGCombinerInfo &DCI,
49017 const X86Subtarget &Subtarget) {
49018 SDValue N0 = N->getOperand(0);
49019 SDValue N1 = N->getOperand(1);
49020 EVT VT = N->getValueType(0);
49021 SDLoc dl(N);
49022 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49023
49024 // If this is SSE1 only convert to FAND to avoid scalarization.
49025 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49026 return DAG.getBitcast(MVT::v4i32,
49027 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49028 DAG.getBitcast(MVT::v4f32, N0),
49029 DAG.getBitcast(MVT::v4f32, N1)));
49030 }
49031
49032 // Use a 32-bit and+zext if upper bits known zero.
49033 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49034 APInt HiMask = APInt::getHighBitsSet(64, 32);
49035 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49036 DAG.MaskedValueIsZero(N0, HiMask)) {
49037 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49038 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49039 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49040 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49041 }
49042 }
49043
49044 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49045 // TODO: Support multiple SrcOps.
49046 if (VT == MVT::i1) {
49047 SmallVector<SDValue, 8> SrcOps;
49048 SmallVector<APInt, 2> SrcPartials;
49049 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49050 SrcOps.size() == 1) {
49051 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49052 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49053 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49054 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49055 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49056 if (Mask) {
49057 assert(SrcPartials[0].getBitWidth() == NumElts &&
49058 "Unexpected partial reduction mask");
49059 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49060 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49061 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49062 }
49063 }
49064 }
49065
49066 // InstCombine converts:
49067 // `(-x << C0) & C1`
49068 // to
49069 // `(x * (Pow2_Ceil(C1) - (1 << C0))) & C1`
49070 // This saves an IR instruction, but on x86 the neg/shift version is
49071 // preferable, so undo the transform.
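// For example, for i8, InstCombine rewrites (-x << 3) & 0xF8 as (x * 0xF8) & 0xF8;
// the code below recovers the original neg+shl form.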
49072
49073 if (N0.getOpcode() == ISD::MUL && N0.hasOneUse()) {
49074 // TODO: We don't actually need a splat for this, we just need the checks to
49075 // hold for each element.
49076 ConstantSDNode *N1C = isConstOrConstSplat(N1, /*AllowUndefs*/ true,
49077 /*AllowTruncation*/ false);
49078 ConstantSDNode *N01C =
49079 isConstOrConstSplat(N0.getOperand(1), /*AllowUndefs*/ true,
49080 /*AllowTruncation*/ false);
49081 if (N1C && N01C) {
49082 const APInt &MulC = N01C->getAPIntValue();
49083 const APInt &AndC = N1C->getAPIntValue();
49084 APInt MulCLowBit = MulC & (-MulC);
49085 if (MulC.uge(AndC) && !MulC.isPowerOf2() &&
49086 (MulCLowBit + MulC).isPowerOf2()) {
49087 SDValue Neg = DAG.getNegative(N0.getOperand(0), dl, VT);
49088 int32_t MulCLowBitLog = MulCLowBit.exactLogBase2();
49089 assert(MulCLowBitLog != -1 &&
49090 "Isolated lowbit is somehow not a power of 2!");
49091 SDValue Shift = DAG.getNode(ISD::SHL, dl, VT, Neg,
49092 DAG.getConstant(MulCLowBitLog, dl, VT));
49093 return DAG.getNode(ISD::AND, dl, VT, Shift, N1);
49094 }
49095 }
49096 }
49097
49098 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49099 return V;
49100
49101 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49102 return R;
49103
49104 if (SDValue R = combineBitOpWithShift(N, DAG))
49105 return R;
49106
49107 if (SDValue R = combineBitOpWithPACK(N, DAG))
49108 return R;
49109
49110 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49111 return FPLogic;
49112
49113 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49114 return R;
49115
49116 if (DCI.isBeforeLegalizeOps())
49117 return SDValue();
49118
49119 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49120 return R;
49121
49122 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49123 return R;
49124
49125 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49126 return ShiftRight;
49127
49128 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49129 return R;
49130
49131 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49132 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49133 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
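// For example, AND(MUL(X, C1), <-1,0,-1,0>) becomes
// MUL(X, AND(C1, <-1,0,-1,0>)), folding the mask into the constant multiplicand.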
49134 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49135 unsigned Opc0 = N0.getOpcode();
49136 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49137 getTargetConstantFromNode(N0.getOperand(1)) &&
49138 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49139 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49140 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49141 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49142 }
49143 }
49144
49145 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant;
49146 // this avoids a slow variable shift (moving the shift amount to ECX etc.).
49147 if (isOneConstant(N1) && N0->hasOneUse()) {
49148 SDValue Src = N0;
49149 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49150 Src.getOpcode() == ISD::TRUNCATE) &&
49151 Src.getOperand(0)->hasOneUse())
49152 Src = Src.getOperand(0);
49153 bool ContainsNOT = false;
49154 X86::CondCode X86CC = X86::COND_B;
49155 // Peek through AND(NOT(SRL(X,Y)),1).
49156 if (isBitwiseNot(Src)) {
49157 Src = Src.getOperand(0);
49158 X86CC = X86::COND_AE;
49159 ContainsNOT = true;
49160 }
49161 if (Src.getOpcode() == ISD::SRL &&
49162 !isa<ConstantSDNode>(Src.getOperand(1))) {
49163 SDValue BitNo = Src.getOperand(1);
49164 Src = Src.getOperand(0);
49165 // Peek through AND(SRL(NOT(X),Y),1).
49166 if (isBitwiseNot(Src)) {
49167 Src = Src.getOperand(0);
49168 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49169 ContainsNOT = true;
49170 }
49171 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49172 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49173 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49174 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49175 }
49176 }
49177
49178 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49179 // Attempt to recursively combine a bitmask AND with shuffles.
49180 SDValue Op(N, 0);
49181 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49182 return Res;
49183
49184 // If either operand is a constant mask, then only the elements that aren't
49185 // zero are actually demanded by the other operand.
49186 auto GetDemandedMasks = [&](SDValue Op) {
49187 APInt UndefElts;
49188 SmallVector<APInt> EltBits;
49189 int NumElts = VT.getVectorNumElements();
49190 int EltSizeInBits = VT.getScalarSizeInBits();
49191 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49192 APInt DemandedElts = APInt::getAllOnes(NumElts);
49193 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49194 EltBits)) {
49195 DemandedBits.clearAllBits();
49196 DemandedElts.clearAllBits();
49197 for (int I = 0; I != NumElts; ++I) {
49198 if (UndefElts[I]) {
49199 // We can't assume an undef src element gives an undef dst - the
49200 // other src might be zero.
49201 DemandedBits.setAllBits();
49202 DemandedElts.setBit(I);
49203 } else if (!EltBits[I].isZero()) {
49204 DemandedBits |= EltBits[I];
49205 DemandedElts.setBit(I);
49206 }
49207 }
49208 }
49209 return std::make_pair(DemandedBits, DemandedElts);
49210 };
49211 APInt Bits0, Elts0;
49212 APInt Bits1, Elts1;
49213 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49214 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49215
49216 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49217 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49218 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49219 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49220 if (N->getOpcode() != ISD::DELETED_NODE)
49221 DCI.AddToWorklist(N);
49222 return SDValue(N, 0);
49223 }
49224
49225 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49226 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49227 if (NewN0 || NewN1)
49228 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49229 NewN1 ? NewN1 : N1);
49230 }
49231
49232 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49233 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49234 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49235 isa<ConstantSDNode>(N0.getOperand(1)) && N0->hasOneUse()) {
49236 SDValue BitMask = N1;
49237 SDValue SrcVec = N0.getOperand(0);
49238 EVT SrcVecVT = SrcVec.getValueType();
49239
49240 // Check that the constant bitmask masks whole bytes.
49241 APInt UndefElts;
49242 SmallVector<APInt, 64> EltBits;
49243 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49244 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49245 llvm::all_of(EltBits, [](const APInt &M) {
49246 return M.isZero() || M.isAllOnes();
49247 })) {
49248 unsigned NumElts = SrcVecVT.getVectorNumElements();
49249 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49250 unsigned Idx = N0.getConstantOperandVal(1);
49251
49252 // Create a root shuffle mask from the byte mask and the extracted index.
49253 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49254 for (unsigned i = 0; i != Scale; ++i) {
49255 if (UndefElts[i])
49256 continue;
49257 int VecIdx = Scale * Idx + i;
49258 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49259 }
49260
49261 if (SDValue Shuffle = combineX86ShufflesRecursively(
49262 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49263 X86::MaxShuffleCombineDepth,
49264 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49265 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49266 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49267 N0.getOperand(1));
49268 }
49269 }
49270
49271 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
49272 return R;
49273
49274 return SDValue();
49275}
49276
49277// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
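// For example, with constant byte masks C and ~C this becomes a single
// VPTERNLOG (immediate 0xCA, i.e. A?B:C) under AVX512, or PCMOV under XOP.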
49278 static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49279 const X86Subtarget &Subtarget) {
49280 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49281
49282 MVT VT = N->getSimpleValueType(0);
49283 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49284 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49285 return SDValue();
49286
49287 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49288 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49289 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49290 return SDValue();
49291
49292 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49293 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49294 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49295 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49296 return SDValue();
49297
49298 // Attempt to extract constant byte masks.
49299 APInt UndefElts0, UndefElts1;
49300 SmallVector<APInt, 32> EltBits0, EltBits1;
49301 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49302 /*AllowWholeUndefs*/ false,
49303 /*AllowPartialUndefs*/ false))
49304 return SDValue();
49305 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49306 /*AllowWholeUndefs*/ false,
49307 /*AllowPartialUndefs*/ false))
49308 return SDValue();
49309
49310 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49311 // TODO - add UNDEF elts support.
49312 if (UndefElts0[i] || UndefElts1[i])
49313 return SDValue();
49314 if (EltBits0[i] != ~EltBits1[i])
49315 return SDValue();
49316 }
49317
49318 SDLoc DL(N);
49319
49320 if (useVPTERNLOG(Subtarget, VT)) {
49321 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49322 // VPTERNLOG is only available for vXi32/vXi64 vector types.
49323 MVT OpSVT = EltSizeInBits <= 32 ? MVT::i32 : MVT::i64;
49324 MVT OpVT =
49325 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49326 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49327 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49328 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49329 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49330 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49331 DAG, Subtarget);
49332 return DAG.getBitcast(VT, Res);
49333 }
49334
49335 SDValue X = N->getOperand(0);
49336 SDValue Y =
49337 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49338 DAG.getBitcast(VT, N1.getOperand(0)));
49339 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49340}
49341
49342// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49343static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49344 if (N->getOpcode() != ISD::OR)
49345 return false;
49346
49347 SDValue N0 = N->getOperand(0);
49348 SDValue N1 = N->getOperand(1);
49349
49350 // Canonicalize AND to LHS.
49351 if (N1.getOpcode() == ISD::AND)
49352 std::swap(N0, N1);
49353
49354 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49355 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49356 return false;
49357
49358 Mask = N1.getOperand(0);
49359 X = N1.getOperand(1);
49360
49361 // Check to see if the mask appeared in both the AND and ANDNP.
49362 if (N0.getOperand(0) == Mask)
49363 Y = N0.getOperand(1);
49364 else if (N0.getOperand(1) == Mask)
49365 Y = N0.getOperand(0);
49366 else
49367 return false;
49368
49369 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
49370 // ANDNP combine allows other combines to happen that prevent matching.
49371 return true;
49372}
49373
49374// Try to fold:
49375// (or (and (m, y), (pandn m, x)))
49376// into:
49377// (vselect m, x, y)
49378// As a special case, try to fold:
49379// (or (and (m, (sub 0, x)), (pandn m, x)))
49380// into:
49381// (sub (xor X, M), M)
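// The second form is a conditional negate: selecting between x and -x with an
// all-ones/all-zeros mask m is the same as (sub (xor x, m), m).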
49382 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49383 const X86Subtarget &Subtarget) {
49384 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49385
49386 EVT VT = N->getValueType(0);
49387 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49388 (VT.is256BitVector() && Subtarget.hasInt256())))
49389 return SDValue();
49390
49391 SDValue X, Y, Mask;
49392 if (!matchLogicBlend(N, X, Y, Mask))
49393 return SDValue();
49394
49395 // Validate that X, Y, and Mask are bitcasts, and see through them.
49396 Mask = peekThroughBitcasts(Mask);
49397 X = peekThroughBitcasts(X);
49398 Y = peekThroughBitcasts(Y);
49399 
49400 EVT MaskVT = Mask.getValueType();
49401 unsigned EltBits = MaskVT.getScalarSizeInBits();
49402
49403 // TODO: Attempt to handle floating point cases as well?
49404 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49405 return SDValue();
49406
49407 SDLoc DL(N);
49408
49409 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49410 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49411 DAG, Subtarget))
49412 return Res;
49413
49414 // PBLENDVB is only available on SSE 4.1.
49415 if (!Subtarget.hasSSE41())
49416 return SDValue();
49417
49418 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49419 if (Subtarget.hasVLX())
49420 return SDValue();
49421
49422 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49423
49424 X = DAG.getBitcast(BlendVT, X);
49425 Y = DAG.getBitcast(BlendVT, Y);
49426 Mask = DAG.getBitcast(BlendVT, Mask);
49427 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49428 return DAG.getBitcast(VT, Mask);
49429}
49430
49431// Helper function for combineOrCmpEqZeroToCtlzSrl
49432// Transforms:
49433// seteq(cmp x, 0)
49434// into:
49435// srl(ctlz x), log2(bitsize(x))
49436// Input pattern is checked by caller.
49437 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49438 SDValue Cmp = Op.getOperand(1);
49439 EVT VT = Cmp.getOperand(0).getValueType();
49440 unsigned Log2b = Log2_32(VT.getSizeInBits());
49441 SDLoc dl(Op);
49442 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49443 // The result of the shift is true or false, and on X86, the 32-bit
49444 // encoding of shr and lzcnt is more desirable.
49445 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49446 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49447 DAG.getConstant(Log2b, dl, MVT::i8));
49448 return Scc;
49449}
49450
49451// Try to transform:
49452// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49453// into:
49454 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
49455// Will also attempt to match more generic cases, eg:
49456// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49457// Only applies if the target supports the FastLZCNT feature.
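// For example, for i32 x and y, (x == 0) | (y == 0) becomes
// (lzcnt(x) | lzcnt(y)) >> 5: bit 5 of a 32-bit lzcnt result is set only when
// the input is zero.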
49458 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49459 TargetLowering::DAGCombinerInfo &DCI,
49460 const X86Subtarget &Subtarget) {
49461 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49462 return SDValue();
49463
49464 auto isORCandidate = [](SDValue N) {
49465 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49466 };
49467
49468 // Check that the zero extend extends to 32 bits or more. The code generated
49469 // by srl(ctlz) for 16-bit or smaller variants of the pattern would require
49470 // extra instructions to clear the upper bits.
49471 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49472 !isORCandidate(N->getOperand(0)))
49473 return SDValue();
49474
49475 // Check the node matches: setcc(eq, cmp 0)
49476 auto isSetCCCandidate = [](SDValue N) {
49477 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49478 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49479 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49480 isNullConstant(N->getOperand(1).getOperand(1)) &&
49481 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49482 };
49483
49484 SDNode *OR = N->getOperand(0).getNode();
49485 SDValue LHS = OR->getOperand(0);
49486 SDValue RHS = OR->getOperand(1);
49487
49488 // Save nodes matching or(or, setcc(eq, cmp 0)).
49489 SmallVector<SDNode *, 2> ORNodes;
49490 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49491 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49492 ORNodes.push_back(OR);
49493 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49494 LHS = OR->getOperand(0);
49495 RHS = OR->getOperand(1);
49496 }
49497
49498 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49499 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49500 !isORCandidate(SDValue(OR, 0)))
49501 return SDValue();
49502
49503 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49504 // to
49505 // or(srl(ctlz),srl(ctlz)).
49506 // The dag combiner can then fold it into:
49507 // srl(or(ctlz, ctlz)).
49508 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49509 SDValue Ret, NewRHS;
49510 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49511 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49512
49513 if (!Ret)
49514 return SDValue();
49515
49516 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49517 while (!ORNodes.empty()) {
49518 OR = ORNodes.pop_back_val();
49519 LHS = OR->getOperand(0);
49520 RHS = OR->getOperand(1);
49521 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49522 if (RHS->getOpcode() == ISD::OR)
49523 std::swap(LHS, RHS);
49524 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49525 if (!NewRHS)
49526 return SDValue();
49527 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49528 }
49529
49530 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49531}
49532
49533 static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49534 SDValue And1_L, SDValue And1_R,
49535 const SDLoc &DL, SelectionDAG &DAG) {
49536 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49537 return SDValue();
49538 SDValue NotOp = And0_L->getOperand(0);
49539 if (NotOp == And1_R)
49540 std::swap(And1_R, And1_L);
49541 if (NotOp != And1_L)
49542 return SDValue();
49543
49544 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49545 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49546 EVT VT = And1_L->getValueType(0);
49547 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49548 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49549 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49550 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49551 return Xor1;
49552}
49553
49554/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49555 /// equivalent `((x ^ y) & m) ^ y` pattern.
49556/// This is typically a better representation for targets without a fused
49557/// "and-not" operation. This function is intended to be called from a
49558/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
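/// For example (an illustrative check), with m = 0x0F, x = 0xAB, y = 0xCD:
///   (m & x) | (~m & y) = 0x0B | 0xC0 = 0xCB
///   ((x ^ y) & m) ^ y  = (0x66 & 0x0F) ^ 0xCD = 0xCB
/// so both forms agree while the second needs no AND-NOT.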
49559 static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49560 // Note that masked-merge variants using XOR or ADD expressions are
49561 // normalized to OR by InstCombine so we only check for OR.
49562 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49563 SDValue N0 = Node->getOperand(0);
49564 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49565 return SDValue();
49566 SDValue N1 = Node->getOperand(1);
49567 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49568 return SDValue();
49569
49570 SDLoc DL(Node);
49571 SDValue N00 = N0->getOperand(0);
49572 SDValue N01 = N0->getOperand(1);
49573 SDValue N10 = N1->getOperand(0);
49574 SDValue N11 = N1->getOperand(1);
49575 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49576 return Result;
49577 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49578 return Result;
49579 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49580 return Result;
49581 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49582 return Result;
49583 return SDValue();
49584}
49585
49586/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49587/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49588/// with CMP+{ADC, SBB}.
49589/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
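/// For example (an illustrative sketch), for unsigned a, b and an i32 value x:
///   x + (a < b)  -->  cmp a, b ; adc x, 0
///   x - (a < b)  -->  cmp a, b ; sbb x, 0
/// so the boolean never has to be materialized with setcc/movzx.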
49590static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
49591 SDValue X, SDValue Y,
49592 SelectionDAG &DAG,
49593 bool ZeroSecondOpOnly = false) {
49594 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
49595 return SDValue();
49596
49597 // Look through a one-use zext.
49598 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
49599 Y = Y.getOperand(0);
49600
49601 X86::CondCode CC = X86::COND_INVALID;
49602 SDValue EFLAGS;
49603 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
49604 CC = (X86::CondCode)Y.getConstantOperandVal(0);
49605 EFLAGS = Y.getOperand(1);
49606 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
49607 Y.hasOneUse()) {
49608 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
49609 }
49610
49611 if (!EFLAGS)
49612 return SDValue();
49613
49614 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49615 // the general case below.
49616 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
49617 if (ConstantX && !ZeroSecondOpOnly) {
49618 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
49619 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
49620 // This is a complicated way to get -1 or 0 from the carry flag:
49621 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49622 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
49623 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49624 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49625 EFLAGS);
49626 }
49627
49628 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
49629 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
49630 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
49631 EFLAGS.getValueType().isInteger() &&
49632 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49633 // Swap the operands of a SUB, and we have the same pattern as above.
49634 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
49635 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
49636 SDValue NewSub = DAG.getNode(
49637 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49638 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49639 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
49640 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49641 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49642 NewEFLAGS);
49643 }
49644 }
49645 }
49646
49647 if (CC == X86::COND_B) {
49648 // X + SETB Z --> adc X, 0
49649 // X - SETB Z --> sbb X, 0
49650 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49651 DAG.getVTList(VT, MVT::i32), X,
49652 DAG.getConstant(0, DL, VT), EFLAGS);
49653 }
49654
49655 if (ZeroSecondOpOnly)
49656 return SDValue();
49657
49658 if (CC == X86::COND_A) {
49659 // Try to convert COND_A into COND_B in an attempt to facilitate
49660 // materializing "setb reg".
49661 //
49662 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
49663 // cannot take an immediate as its first operand.
49664 //
49665 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49666 EFLAGS.getValueType().isInteger() &&
49667 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49668 SDValue NewSub =
49669 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49670 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49671 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49672 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
49673 DAG.getVTList(VT, MVT::i32), X,
49674 DAG.getConstant(0, DL, VT), NewEFLAGS);
49675 }
49676 }
49677
49678 if (CC == X86::COND_AE) {
49679 // X + SETAE --> sbb X, -1
49680 // X - SETAE --> adc X, -1
49681 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49682 DAG.getVTList(VT, MVT::i32), X,
49683 DAG.getConstant(-1, DL, VT), EFLAGS);
49684 }
49685
49686 if (CC == X86::COND_BE) {
49687 // X + SETBE --> sbb X, -1
49688 // X - SETBE --> adc X, -1
49689 // Try to convert COND_BE into COND_AE in an attempt to facilitate
49690 // materializing "setae reg".
49691 //
49692 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
49693 // cannot take an immediate as its first operand.
49694 //
49695 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
49696 EFLAGS.getValueType().isInteger() &&
49697 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
49698 SDValue NewSub =
49699 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
49700 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
49701 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
49702 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
49703 DAG.getVTList(VT, MVT::i32), X,
49704 DAG.getConstant(-1, DL, VT), NewEFLAGS);
49705 }
49706 }
49707
49708 if (CC != X86::COND_E && CC != X86::COND_NE)
49709 return SDValue();
49710
49711 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
49712 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
49713 !EFLAGS.getOperand(0).getValueType().isInteger())
49714 return SDValue();
49715
49716 SDValue Z = EFLAGS.getOperand(0);
49717 EVT ZVT = Z.getValueType();
49718
49719 // If X is -1 or 0, then we have an opportunity to avoid constants required in
49720 // the general case below.
49721 if (ConstantX) {
49722 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
49723 // fake operands:
49724 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
49725 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
49726 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
49727 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
49728 SDValue Zero = DAG.getConstant(0, DL, ZVT);
49729 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49730 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
49731 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49732 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49733 SDValue(Neg.getNode(), 1));
49734 }
49735
49736 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
49737 // with fake operands:
49738 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
49739 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
49740 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
49741 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
49742 SDValue One = DAG.getConstant(1, DL, ZVT);
49743 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49744 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49745 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
49746 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
49747 Cmp1.getValue(1));
49748 }
49749 }
49750
49751 // (cmp Z, 1) sets the carry flag if Z is 0.
49752 SDValue One = DAG.getConstant(1, DL, ZVT);
49753 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
49754 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
49755
49756 // Add the flags type for ADC/SBB nodes.
49757 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
49758
49759 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
49760 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
49761 if (CC == X86::COND_NE)
49762 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
49763 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
49764
49765 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
49766 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
49767 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
49768 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
49769}
49770
49771/// If this is an add or subtract where one operand is produced by a cmp+setcc,
49772/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
49773/// with CMP+{ADC, SBB}.
49774 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
49775 bool IsSub = N->getOpcode() == ISD::SUB;
49776 SDValue X = N->getOperand(0);
49777 SDValue Y = N->getOperand(1);
49778 EVT VT = N->getValueType(0);
49779 SDLoc DL(N);
49780
49781 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
49782 return ADCOrSBB;
49783
49784 // Commute and try again (negate the result for subtracts).
49785 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
49786 if (IsSub)
49787 ADCOrSBB = DAG.getNegative(ADCOrSBB, DL, VT);
49788 return ADCOrSBB;
49789 }
49790
49791 return SDValue();
49792}
49793
49794 static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
49795 SelectionDAG &DAG) {
49796 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
49797 "Unexpected opcode");
49798
49799 // Delegate to combineAddOrSubToADCOrSBB if we have:
49800 //
49801 // (xor/or (zero_extend (setcc)) imm)
49802 //
49803 // where imm is odd if and only if we have xor, in which case the XOR/OR are
49804 // equivalent to a SUB/ADD, respectively.
49805 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
49806 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
49807 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
49808 bool IsSub = N->getOpcode() == ISD::XOR;
49809 bool N1COdd = N1C->getZExtValue() & 1;
49810 if (IsSub ? N1COdd : !N1COdd) {
49811 SDLoc DL(N);
49812 EVT VT = N->getValueType(0);
49813 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
49814 return R;
49815 }
49816 }
49817 }
49818
49819 // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2)
49820 if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ &&
49821 N0.getOperand(0).getOpcode() == ISD::AND &&
49822 ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
49823 ISD::isBuildVectorAllOnes(N1.getNode())) {
49824 MVT VT = N->getSimpleValueType(0);
49825 APInt UndefElts;
49826 SmallVector<APInt> EltBits;
49827 if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1),
49828 VT.getScalarSizeInBits(), UndefElts,
49829 EltBits)) {
49830 bool IsPow2OrUndef = true;
49831 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
49832 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
49833
49834 if (IsPow2OrUndef)
49835 return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0),
49836 N0.getOperand(0).getOperand(1));
49837 }
49838 }
49839
49840 return SDValue();
49841}
49842
49843 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49844 TargetLowering::DAGCombinerInfo &DCI,
49845 const X86Subtarget &Subtarget) {
49846 SDValue N0 = N->getOperand(0);
49847 SDValue N1 = N->getOperand(1);
49848 EVT VT = N->getValueType(0);
49849 SDLoc dl(N);
49850 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49851
49852 // If this is SSE1 only convert to FOR to avoid scalarization.
49853 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49854 return DAG.getBitcast(MVT::v4i32,
49855 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49856 DAG.getBitcast(MVT::v4f32, N0),
49857 DAG.getBitcast(MVT::v4f32, N1)));
49858 }
49859
49860 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49861 // TODO: Support multiple SrcOps.
49862 if (VT == MVT::i1) {
49863 SmallVector<SDValue, 2> SrcOps;
49864 SmallVector<APInt, 2> SrcPartials;
49865 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49866 SrcOps.size() == 1) {
49867 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49868 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49869 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49870 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49871 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49872 if (Mask) {
49873 assert(SrcPartials[0].getBitWidth() == NumElts &&
49874 "Unexpected partial reduction mask");
49875 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49876 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49877 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49878 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49879 }
49880 }
49881 }
49882
49883 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49884 return R;
49885
49886 if (SDValue R = combineBitOpWithShift(N, DAG))
49887 return R;
49888
49889 if (SDValue R = combineBitOpWithPACK(N, DAG))
49890 return R;
49891
49892 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49893 return FPLogic;
49894
49895 if (DCI.isBeforeLegalizeOps())
49896 return SDValue();
49897
49898 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49899 return R;
49900
49901 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49902 return R;
49903
49904 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49905 return R;
49906
49907 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
49908 if ((VT == MVT::i32 || VT == MVT::i64) &&
49909 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49910 isNullConstant(N0.getOperand(0))) {
49911 SDValue Cond = N0.getOperand(1);
49912 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49913 Cond = Cond.getOperand(0);
49914
49915 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49916 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49917 uint64_t Val = CN->getZExtValue();
49918 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49919 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49920 CCode = X86::GetOppositeBranchCondition(CCode);
49921 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49922
49923 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49924 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49925 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49926 return R;
49927 }
49928 }
49929 }
49930 }
49931
49932 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49933 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49934 // iff the upper elements of the non-shifted arg are zero.
49935 // KUNPCK requires 16+ bool vector elements.
49936 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49937 unsigned NumElts = VT.getVectorNumElements();
49938 unsigned HalfElts = NumElts / 2;
49939 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49940 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49941 N1.getConstantOperandAPInt(1) == HalfElts &&
49942 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49943 return DAG.getNode(
49944 ISD::CONCAT_VECTORS, dl, VT,
49945 extractSubVector(N0, 0, DAG, dl, HalfElts),
49946 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49947 }
49948 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49949 N0.getConstantOperandAPInt(1) == HalfElts &&
49950 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49951 return DAG.getNode(
49952 ISD::CONCAT_VECTORS, dl, VT,
49953 extractSubVector(N1, 0, DAG, dl, HalfElts),
49954 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49955 }
49956 }
49957
49958 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49959 // Attempt to recursively combine an OR of shuffles.
49960 SDValue Op(N, 0);
49961 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49962 return Res;
49963
49964 // If either operand is a constant mask, then only the elements that aren't
49965 // allones are actually demanded by the other operand.
49966 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49967 APInt UndefElts;
49968 SmallVector<APInt> EltBits;
49969 int NumElts = VT.getVectorNumElements();
49970 int EltSizeInBits = VT.getScalarSizeInBits();
49971 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49972 return false;
49973
49974 APInt DemandedElts = APInt::getZero(NumElts);
49975 for (int I = 0; I != NumElts; ++I)
49976 if (!EltBits[I].isAllOnes())
49977 DemandedElts.setBit(I);
49978
49979 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49980 };
49981 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49982 if (N->getOpcode() != ISD::DELETED_NODE)
49983 DCI.AddToWorklist(N);
49984 return SDValue(N, 0);
49985 }
49986 }
49987
49988 // We should fold "masked merge" patterns when `andn` is not available.
49989 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49990 if (SDValue R = foldMaskedMerge(N, DAG))
49991 return R;
49992
49993 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
49994 return R;
49995
49996 return SDValue();
49997}
49998
49999/// Try to turn tests against the signbit in the form of:
50000/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50001/// into:
50002/// SETGT(X, -1)
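/// For example (an illustrative sketch), for an i32 value X truncated to i8:
///   trunc(X >> 31) ^ 1  ==  (X >= 0)  ==  setgt(X, -1)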
50003 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50004 // This is only worth doing if the output type is i8 or i1.
50005 EVT ResultType = N->getValueType(0);
50006 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50007 return SDValue();
50008
50009 SDValue N0 = N->getOperand(0);
50010 SDValue N1 = N->getOperand(1);
50011
50012 // We should be performing an xor against a truncated shift.
50013 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50014 return SDValue();
50015
50016 // Make sure we are performing an xor against one.
50017 if (!isOneConstant(N1))
50018 return SDValue();
50019
50020 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50021 SDValue Shift = N0.getOperand(0);
50022 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50023 return SDValue();
50024
50025 // Make sure we are truncating from one of i16, i32 or i64.
50026 EVT ShiftTy = Shift.getValueType();
50027 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50028 return SDValue();
50029
50030 // Make sure the shift amount extracts the sign bit.
50031 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50032 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50033 return SDValue();
50034
50035 // Create a greater-than comparison against -1.
50036 // N.B. Using SETGE against 0 works but we want a canonical looking
50037 // comparison, using SETGT matches up with what TranslateX86CC does.
50038 SDLoc DL(N);
50039 SDValue ShiftOp = Shift.getOperand(0);
50040 EVT ShiftOpTy = ShiftOp.getValueType();
50041 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50042 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50043 *DAG.getContext(), ResultType);
50044 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50045 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50046 if (SetCCResultType != ResultType)
50047 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50048 return Cond;
50049}
50050
50051/// Turn vector tests of the signbit in the form of:
50052/// xor (sra X, elt_size(X)-1), -1
50053/// into:
50054/// pcmpgt X, -1
50055///
50056/// This should be called before type legalization because the pattern may not
50057/// persist after that.
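/// For example (an illustrative sketch), for a v4i32 value X:
///   xor(sra(X, 31), -1)  -->  pcmpgt(X, -1)
/// i.e. a lane becomes all-ones exactly when its sign bit is clear.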
50058 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50059 const X86Subtarget &Subtarget) {
50060 EVT VT = N->getValueType(0);
50061 if (!VT.isSimple())
50062 return SDValue();
50063
50064 switch (VT.getSimpleVT().SimpleTy) {
50065 // clang-format off
50066 default: return SDValue();
50067 case MVT::v16i8:
50068 case MVT::v8i16:
50069 case MVT::v4i32:
50070 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50071 case MVT::v32i8:
50072 case MVT::v16i16:
50073 case MVT::v8i32:
50074 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50075 // clang-format on
50076 }
50077
50078 // There must be a shift right algebraic before the xor, and the xor must be a
50079 // 'not' operation.
50080 SDValue Shift = N->getOperand(0);
50081 SDValue Ones = N->getOperand(1);
50082 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50083 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50084 return SDValue();
50085
50086 // The shift should be smearing the sign bit across each vector element.
50087 auto *ShiftAmt =
50088 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
50089 if (!ShiftAmt ||
50090 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
50091 return SDValue();
50092
50093 // Create a greater-than comparison against -1. We don't use the more obvious
50094 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
50095 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
50096}
50097
50098/// Detect patterns of truncation with unsigned saturation:
50099///
50100/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
50101/// Return the source value x to be truncated or SDValue() if the pattern was
50102/// not matched.
50103///
50104/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
50105/// where C1 >= 0 and C2 is unsigned max of destination type.
50106///
50107/// (truncate (smax (smin (x, C2), C1)) to dest_type)
50108/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
50109///
50110/// These two patterns are equivalent to:
50111/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
50112/// So return the smax(x, C1) value to be truncated or SDValue() if the
50113/// pattern was not matched.
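/// For example (an illustrative sketch), when truncating v8i32 to v8i16:
///   trunc(umin(x, 65535))           --> returns x
///   trunc(smin(smax(x, 0), 65535))  --> returns smax(x, 0)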
50114 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50115 const SDLoc &DL) {
50116 EVT InVT = In.getValueType();
50117
50118 // Saturation with truncation. We truncate from InVT to VT.
50120 "Unexpected types for truncate operation");
50121
50122 // Match min/max and return limit value as a parameter.
50123 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
50124 if (V.getOpcode() == Opcode &&
50125 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
50126 return V.getOperand(0);
50127 return SDValue();
50128 };
50129
50130 APInt C1, C2;
50131 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
50132 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
50133 // to the element size of the destination type.
50134 if (C2.isMask(VT.getScalarSizeInBits()))
50135 return UMin;
50136
50137 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
50138 if (MatchMinMax(SMin, ISD::SMAX, C1))
50139 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
50140 return SMin;
50141
50142 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
50143 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
50144 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
50145 C2.uge(C1)) {
50146 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
50147 }
50148
50149 return SDValue();
50150}
50151
50152/// Detect patterns of truncation with signed saturation:
50153/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
50154/// signed_max_of_dest_type)) to dest_type)
50155/// or:
50156/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
50157/// signed_min_of_dest_type)) to dest_type).
50158/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
50159/// Return the source value to be truncated or SDValue() if the pattern was not
50160/// matched.
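/// For example (an illustrative sketch), when truncating v8i32 to v8i16:
///   trunc(smin(smax(x, -32768), 32767))  --> returns x
/// and with MatchPackUS the accepted clamp range is [0, 65535] instead.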
50161static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
50162 unsigned NumDstBits = VT.getScalarSizeInBits();
50163 unsigned NumSrcBits = In.getScalarValueSizeInBits();
50164 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
50165
50166 auto MatchMinMax = [](SDValue V, unsigned Opcode,
50167 const APInt &Limit) -> SDValue {
50168 APInt C;
50169 if (V.getOpcode() == Opcode &&
50170 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
50171 return V.getOperand(0);
50172 return SDValue();
50173 };
50174
50175 APInt SignedMax, SignedMin;
50176 if (MatchPackUS) {
50177 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
50178 SignedMin = APInt(NumSrcBits, 0);
50179 } else {
50180 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
50181 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
50182 }
50183
50184 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
50185 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
50186 return SMax;
50187
50188 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
50189 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
50190 return SMin;
50191
50192 return SDValue();
50193}
50194
50195 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
50196 SelectionDAG &DAG,
50197 const X86Subtarget &Subtarget) {
50198 if (!Subtarget.hasSSE2() || !VT.isVector())
50199 return SDValue();
50200
50201 EVT SVT = VT.getVectorElementType();
50202 EVT InVT = In.getValueType();
50203 EVT InSVT = InVT.getVectorElementType();
50204
50205 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
50206 // split across two registers, we can use a packusdw+perm to clamp to 0-65535
50207 // and concatenate at the same time. Then we can use a final vpmovuswb to
50208 // clip to 0-255.
50209 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
50210 InVT == MVT::v16i32 && VT == MVT::v16i8) {
50211 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50212 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
50213 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
50214 DL, DAG, Subtarget);
50215 assert(Mid && "Failed to pack!");
50216 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
50217 }
50218 }
50219
50220 // vXi32 truncate instructions are available with AVX512F.
50221 // vXi16 truncate instructions are only available with AVX512BW.
50222 // For 256-bit or smaller vectors, we require VLX.
50223 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
50224 // If the result type is 256 bits or larger and we have disabled 512-bit
50225 // registers, we should go ahead and use the pack instructions if possible.
50226 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
50227 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
50228 (InVT.getSizeInBits() > 128) &&
50229 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
50230 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
50231
50232 if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
50233 isPowerOf2_32(VT.getVectorNumElements()) &&
50234 (SVT == MVT::i8 || SVT == MVT::i16) &&
50235 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
50236 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
50237 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
50238 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50239 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50240 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50241 DAG, Subtarget);
50242 assert(Mid && "Failed to pack!");
50243 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50244 Subtarget);
50245 assert(V && "Failed to pack!");
50246 return V;
50247 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50248 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50249 Subtarget);
50250 }
50251 if (SDValue SSatVal = detectSSatPattern(In, VT))
50252 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50253 Subtarget);
50254 }
50255
50256 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50257 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50258 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50259 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50260 unsigned TruncOpc = 0;
50261 SDValue SatVal;
50262 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50263 SatVal = SSatVal;
50264 TruncOpc = X86ISD::VTRUNCS;
50265 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50266 SatVal = USatVal;
50267 TruncOpc = X86ISD::VTRUNCUS;
50268 }
50269 if (SatVal) {
50270 unsigned ResElts = VT.getVectorNumElements();
50271 // If the input type is less than 512 bits and we don't have VLX, we need
50272 // to widen to 512 bits.
50273 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50274 unsigned NumConcats = 512 / InVT.getSizeInBits();
50275 ResElts *= NumConcats;
50276 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50277 ConcatOps[0] = SatVal;
50278 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50279 NumConcats * InVT.getVectorNumElements());
50280 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50281 }
50282 // Widen the result if it's narrower than 128 bits.
50283 if (ResElts * SVT.getSizeInBits() < 128)
50284 ResElts = 128 / SVT.getSizeInBits();
50285 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50286 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50287 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50288 DAG.getIntPtrConstant(0, DL));
50289 }
50290 }
50291
50292 return SDValue();
50293}
50294
50295/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50296 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50297/// ISD::AVGCEILU (AVG) instruction.
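/// For example (an illustrative sketch), for two unsigned vXi8 vectors a and b:
///   trunc((zext(a) + zext(b) + 1) >> 1)  -->  AVGCEILU(a, b)   (i.e. PAVGB)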
50298 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50299 const X86Subtarget &Subtarget,
50300 const SDLoc &DL) {
50301 if (!VT.isVector())
50302 return SDValue();
50303 EVT InVT = In.getValueType();
50304 unsigned NumElems = VT.getVectorNumElements();
50305
50306 EVT ScalarVT = VT.getVectorElementType();
50307 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50308 return SDValue();
50309
50310 // InScalarVT is the intermediate type in the AVG pattern and it should be greater
50311 // than the original input type (i8/i16).
50312 EVT InScalarVT = InVT.getVectorElementType();
50313 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50314 return SDValue();
50315
50316 if (!Subtarget.hasSSE2())
50317 return SDValue();
50318
50319 // Detect the following pattern:
50320 //
50321 // %1 = zext <N x i8> %a to <N x i32>
50322 // %2 = zext <N x i8> %b to <N x i32>
50323 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50324 // %4 = add nuw nsw <N x i32> %3, %2
50325 // %5 = lshr <N x i32> %4, <i32 1 x N>
50326 // %6 = trunc <N x i32> %5 to <N x i8>
50327 //
50328 // In AVX512, the last instruction can also be a trunc store.
50329 if (In.getOpcode() != ISD::SRL)
50330 return SDValue();
50331
50332 // A lambda checking the given SDValue is a constant vector and each element
50333 // is in the range [Min, Max].
50334 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50335 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50336 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50337 });
50338 };
50339
50340 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50341 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50342 return MaxActiveBits <= ScalarVT.getSizeInBits();
50343 };
50344
50345 // Check if each element of the vector is right-shifted by one.
50346 SDValue LHS = In.getOperand(0);
50347 SDValue RHS = In.getOperand(1);
50348 if (!IsConstVectorInRange(RHS, 1, 1))
50349 return SDValue();
50350 if (LHS.getOpcode() != ISD::ADD)
50351 return SDValue();
50352
50353 // Detect a pattern of a + b + 1 where the order doesn't matter.
50354 SDValue Operands[3];
50355 Operands[0] = LHS.getOperand(0);
50356 Operands[1] = LHS.getOperand(1);
50357
50358 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50359 ArrayRef<SDValue> Ops) {
50360 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50361 };
50362
50363 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50364 for (SDValue &Op : Ops)
50365 if (Op.getValueType() != VT)
50366 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50367 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50368 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50369 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50370 if (NumElemsPow2 != NumElems) {
50371 for (SDValue &Op : Ops) {
50372 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50373 for (unsigned i = 0; i != NumElems; ++i) {
50374 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50375 EltsOfOp[i] =
50376 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50377 }
50378 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50379 }
50380 }
50381 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50382 if (NumElemsPow2 == NumElems)
50383 return Res;
50384 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50385 DAG.getIntPtrConstant(0, DL));
50386 };
50387
50388 // Take care of the case when one of the operands is a constant vector whose
50389 // element is in the range [1, 256] (or [1, 65536] for i16 elements).
50390 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50391 IsZExtLike(Operands[0])) {
50392 // The pattern is detected. Subtract one from the constant vector, then
50393 // demote it and emit the ISD::AVGCEILU node.
50394 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50395 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50396 return AVGSplitter({Operands[0], Operands[1]});
50397 }
50398
50399 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50400 // Match the or case only if its 'add-like' - can be replaced by an add.
50401 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50402 if (ISD::ADD == V.getOpcode()) {
50403 Op0 = V.getOperand(0);
50404 Op1 = V.getOperand(1);
50405 return true;
50406 }
50407 if (ISD::ZERO_EXTEND != V.getOpcode())
50408 return false;
50409 V = V.getOperand(0);
50410 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50411 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50412 return false;
50413 Op0 = V.getOperand(0);
50414 Op1 = V.getOperand(1);
50415 return true;
50416 };
50417
50418 SDValue Op0, Op1;
50419 if (FindAddLike(Operands[0], Op0, Op1))
50420 std::swap(Operands[0], Operands[1]);
50421 else if (!FindAddLike(Operands[1], Op0, Op1))
50422 return SDValue();
50423 Operands[2] = Op0;
50424 Operands[1] = Op1;
50425
50426 // Now we have three operands of two additions. Check that one of them is a
50427 // constant vector with ones, and the other two can be promoted from i8/i16.
50428 for (SDValue &Op : Operands) {
50429 if (!IsConstVectorInRange(Op, 1, 1))
50430 continue;
50431 std::swap(Op, Operands[2]);
50432
50433 // Check if Operands[0] and Operands[1] are results of type promotion.
50434 for (int j = 0; j < 2; ++j)
50435 if (Operands[j].getValueType() != VT)
50436 if (!IsZExtLike(Operands[j]))
50437 return SDValue();
50438
50439 // The pattern is detected, emit ISD::AVGCEILU node(s).
50440 return AVGSplitter({Operands[0], Operands[1]});
50441 }
50442
50443 return SDValue();
50444}
50445
50446 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50447 TargetLowering::DAGCombinerInfo &DCI,
50448 const X86Subtarget &Subtarget) {
50449 LoadSDNode *Ld = cast<LoadSDNode>(N);
50450 EVT RegVT = Ld->getValueType(0);
50451 EVT MemVT = Ld->getMemoryVT();
50452 SDLoc dl(Ld);
50453 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50454
50455 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50456 // into two 16-byte operations. Also split non-temporal aligned loads on
50457 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50458 ISD::LoadExtType Ext = Ld->getExtensionType();
50459 unsigned Fast;
50460 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50461 Ext == ISD::NON_EXTLOAD &&
50462 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50463 Ld->getAlign() >= Align(16)) ||
50464 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50465 *Ld->getMemOperand(), &Fast) &&
50466 !Fast))) {
50467 unsigned NumElems = RegVT.getVectorNumElements();
50468 if (NumElems < 2)
50469 return SDValue();
50470
50471 unsigned HalfOffset = 16;
50472 SDValue Ptr1 = Ld->getBasePtr();
50473 SDValue Ptr2 =
50474 DAG.getMemBasePlusOffset(Ptr1, TypeSize::getFixed(HalfOffset), dl);
50475 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50476 NumElems / 2);
50477 SDValue Load1 =
50478 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50479 Ld->getOriginalAlign(),
50480 Ld->getMemOperand()->getFlags());
50481 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50482 Ld->getPointerInfo().getWithOffset(HalfOffset),
50483 Ld->getOriginalAlign(),
50484 Ld->getMemOperand()->getFlags());
50485 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50486 Load1.getValue(1), Load2.getValue(1));
50487
50488 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50489 return DCI.CombineTo(N, NewVec, TF, true);
50490 }
50491
50492 // Bool vector load - attempt to cast to an integer, as we have good
50493 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50494 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50495 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50496 unsigned NumElts = RegVT.getVectorNumElements();
50497 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50498 if (TLI.isTypeLegal(IntVT)) {
50499 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50500 Ld->getPointerInfo(),
50501 Ld->getOriginalAlign(),
50502 Ld->getMemOperand()->getFlags());
50503 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50504 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50505 }
50506 }
50507
50508 // If we also load/broadcast this to a wider type, then just extract the
50509 // lowest subvector.
50510 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50511 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50512 SDValue Ptr = Ld->getBasePtr();
50513 SDValue Chain = Ld->getChain();
50514 for (SDNode *User : Chain->uses()) {
50515 auto *UserLd = dyn_cast<MemSDNode>(User);
50516 if (User != N && UserLd &&
50517 (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD ||
50518 User->getOpcode() == X86ISD::VBROADCAST_LOAD ||
50519 ISD::isNormalLoad(User)) &&
50520 UserLd->getChain() == Chain && !User->hasAnyUseOfValue(1) &&
50521 User->getValueSizeInBits(0).getFixedValue() >
50522 RegVT.getFixedSizeInBits()) {
50523 if (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50524 UserLd->getBasePtr() == Ptr &&
50525 UserLd->getMemoryVT().getSizeInBits() == MemVT.getSizeInBits()) {
50526 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50527 RegVT.getSizeInBits());
50528 Extract = DAG.getBitcast(RegVT, Extract);
50529 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50530 }
50531 auto MatchingBits = [](const APInt &Undefs, const APInt &UserUndefs,
50532 ArrayRef<APInt> Bits, ArrayRef<APInt> UserBits) {
50533 for (unsigned I = 0, E = Undefs.getBitWidth(); I != E; ++I) {
50534 if (Undefs[I])
50535 continue;
50536 if (UserUndefs[I] || Bits[I] != UserBits[I])
50537 return false;
50538 }
50539 return true;
50540 };
50541 // See if we are loading a constant that matches in the lower
50542 // bits of a longer constant (but from a different constant pool ptr).
50543 EVT UserVT = User->getValueType(0);
50544 SDValue UserPtr = UserLd->getBasePtr();
50545 const Constant *LdC = getTargetConstantFromBasePtr(Ptr);
50546 const Constant *UserC = getTargetConstantFromBasePtr(UserPtr);
50547 if (LdC && UserC && UserPtr != Ptr) {
50548 unsigned LdSize = LdC->getType()->getPrimitiveSizeInBits();
50549 unsigned UserSize = UserC->getType()->getPrimitiveSizeInBits();
50550 if (LdSize < UserSize || !ISD::isNormalLoad(User)) {
50551 APInt Undefs, UserUndefs;
50552 SmallVector<APInt> Bits, UserBits;
50553 unsigned NumBits = std::min(RegVT.getScalarSizeInBits(),
50554 UserVT.getScalarSizeInBits());
50555 if (getTargetConstantBitsFromNode(SDValue(N, 0), NumBits, Undefs,
50556 Bits) &&
50557 getTargetConstantBitsFromNode(SDValue(User, 0), NumBits,
50558 UserUndefs, UserBits)) {
50559 if (MatchingBits(Undefs, UserUndefs, Bits, UserBits)) {
50560 SDValue Extract = extractSubVector(
50561 SDValue(User, 0), 0, DAG, SDLoc(N), RegVT.getSizeInBits());
50562 Extract = DAG.getBitcast(RegVT, Extract);
50563 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50564 }
50565 }
50566 }
50567 }
50568 }
50569 }
50570 }
50571
50572 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50573 unsigned AddrSpace = Ld->getAddressSpace();
50574 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50575 AddrSpace == X86AS::PTR32_UPTR) {
50576 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50577 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50578 SDValue Cast =
50579 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50580 return DAG.getExtLoad(Ext, dl, RegVT, Ld->getChain(), Cast,
50581 Ld->getPointerInfo(), MemVT, Ld->getOriginalAlign(),
50582 Ld->getMemOperand()->getFlags());
50583 }
50584 }
50585
50586 return SDValue();
50587}
50588
50589/// If V is a build vector of boolean constants and exactly one of those
50590/// constants is true, return the operand index of that true element.
50591/// Otherwise, return -1.
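/// For example (an illustrative sketch), a v4i1 mask of <0, 0, 1, 0> returns
/// index 2, while <0, 1, 1, 0> or an all-zero mask returns -1.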
50592static int getOneTrueElt(SDValue V) {
50593 // This needs to be a build vector of booleans.
50594 // TODO: Checking for the i1 type matches the IR definition for the mask,
50595 // but the mask check could be loosened to i8 or other types. That might
50596 // also require checking more than 'allOnesValue'; eg, the x86 HW
50597 // instructions only require that the MSB is set for each mask element.
50598 // The ISD::MSTORE comments/definition do not specify how the mask operand
50599 // is formatted.
50600 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50601 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50602 return -1;
50603
50604 int TrueIndex = -1;
50605 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50606 for (unsigned i = 0; i < NumElts; ++i) {
50607 const SDValue &Op = BV->getOperand(i);
50608 if (Op.isUndef())
50609 continue;
50610 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50611 if (!ConstNode)
50612 return -1;
50613 if (ConstNode->getAPIntValue().countr_one() >= 1) {
50614 // If we already found a one, this is too many.
50615 if (TrueIndex >= 0)
50616 return -1;
50617 TrueIndex = i;
50618 }
50619 }
50620 return TrueIndex;
50621}
50622
50623/// Given a masked memory load/store operation, return true if it has one mask
50624/// bit set. If it has one mask bit set, then also return the memory address of
50625/// the scalar element to load/store, the vector index to insert/extract that
50626/// scalar element, and the alignment for the scalar memory access.
50627 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50628 SelectionDAG &DAG, SDValue &Addr,
50629 SDValue &Index, Align &Alignment,
50630 unsigned &Offset) {
50631 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50632 if (TrueMaskElt < 0)
50633 return false;
50634
50635 // Get the address of the one scalar element that is specified by the mask
50636 // using the appropriate offset from the base pointer.
50637 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50638 Offset = 0;
50639 Addr = MaskedOp->getBasePtr();
50640 if (TrueMaskElt != 0) {
50641 Offset = TrueMaskElt * EltVT.getStoreSize();
50642 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::getFixed(Offset),
50643 SDLoc(MaskedOp));
50644 }
50645
50646 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50647 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50648 EltVT.getStoreSize());
50649 return true;
50650}
50651
50652/// If exactly one element of the mask is set for a non-extending masked load,
50653/// it is a scalar load and vector insert.
50654/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50655/// mask have already been optimized in IR, so we don't bother with those here.
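/// For example (an illustrative sketch), a v4i32 masked load with mask
/// <0, 0, 1, 0> becomes a scalar i32 load from base+8 whose result is inserted
/// into the pass-through vector at element 2.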
50656static SDValue
50657 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50658 TargetLowering::DAGCombinerInfo &DCI,
50659 const X86Subtarget &Subtarget) {
50660 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50661 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50662 // However, some target hooks may need to be added to know when the transform
50663 // is profitable. Endianness would also have to be considered.
50664
50665 SDValue Addr, VecIndex;
50666 Align Alignment;
50667 unsigned Offset;
50668 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50669 return SDValue();
50670
50671 // Load the one scalar element that is specified by the mask using the
50672 // appropriate offset from the base pointer.
50673 SDLoc DL(ML);
50674 EVT VT = ML->getValueType(0);
50675 EVT EltVT = VT.getVectorElementType();
50676
50677 EVT CastVT = VT;
50678 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50679 EltVT = MVT::f64;
50680 CastVT = VT.changeVectorElementType(EltVT);
50681 }
50682
50683 SDValue Load =
50684 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50685 ML->getPointerInfo().getWithOffset(Offset),
50686 Alignment, ML->getMemOperand()->getFlags());
50687
50688 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50689
50690 // Insert the loaded element into the appropriate place in the vector.
50691 SDValue Insert =
50692 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50693 Insert = DAG.getBitcast(VT, Insert);
50694 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50695}
50696
50697static SDValue
50698 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50699 TargetLowering::DAGCombinerInfo &DCI) {
50700 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50701 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50702 return SDValue();
50703
50704 SDLoc DL(ML);
50705 EVT VT = ML->getValueType(0);
50706
50707 // If we are loading the first and last elements of a vector, it is safe and
50708 // always faster to load the whole vector. Replace the masked load with a
50709 // vector load and select.
50710 unsigned NumElts = VT.getVectorNumElements();
50711 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50712 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50713 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50714 if (LoadFirstElt && LoadLastElt) {
50715 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50716 ML->getMemOperand());
50717 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50718 ML->getPassThru());
50719 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50720 }
50721
50722 // Convert a masked load with a constant mask into a masked load and a select.
50723 // This allows the select operation to use a faster kind of select instruction
50724 // (for example, vblendvps -> vblendps).
50725
50726 // Don't try this if the pass-through operand is already undefined. That would
50727 // cause an infinite loop because that's what we're about to create.
50728 if (ML->getPassThru().isUndef())
50729 return SDValue();
50730
50731 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50732 return SDValue();
50733
50734 // The new masked load has an undef pass-through operand. The select uses the
50735 // original pass-through operand.
50736 SDValue NewML = DAG.getMaskedLoad(
50737 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50738 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50739 ML->getAddressingMode(), ML->getExtensionType());
50740 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50741 ML->getPassThru());
50742
50743 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50744}
50745
50746 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50747 TargetLowering::DAGCombinerInfo &DCI,
50748 const X86Subtarget &Subtarget) {
50749 auto *Mld = cast<MaskedLoadSDNode>(N);
50750
50751 // TODO: Expanding load with constant mask may be optimized as well.
50752 if (Mld->isExpandingLoad())
50753 return SDValue();
50754
50755 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50756 if (SDValue ScalarLoad =
50757 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50758 return ScalarLoad;
50759
50760 // TODO: Do some AVX512 subsets benefit from this transform?
50761 if (!Subtarget.hasAVX512())
50762 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50763 return Blend;
50764 }
50765
50766 // If the mask value has been legalized to a non-boolean vector, try to
50767 // simplify ops leading up to it. We only demand the MSB of each lane.
50768 SDValue Mask = Mld->getMask();
50769 if (Mask.getScalarValueSizeInBits() != 1) {
50770 EVT VT = Mld->getValueType(0);
50771 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50772 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50773 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50774 if (N->getOpcode() != ISD::DELETED_NODE)
50775 DCI.AddToWorklist(N);
50776 return SDValue(N, 0);
50777 }
50778 if (SDValue NewMask =
50779 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50780 return DAG.getMaskedLoad(
50781 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50782 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50783 Mld->getAddressingMode(), Mld->getExtensionType());
50784 }
50785
50786 return SDValue();
50787}
50788
50789/// If exactly one element of the mask is set for a non-truncating masked store,
50790/// it is a vector extract and scalar store.
50791/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50792/// mask have already been optimized in IR, so we don't bother with those here.
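/// For example (an illustrative sketch), a v4i32 masked store with mask
/// <0, 0, 1, 0> becomes an extract of element 2 followed by a scalar i32 store
/// to base+8.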
50793 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50794 SelectionDAG &DAG,
50795 const X86Subtarget &Subtarget) {
50796 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50797 // However, some target hooks may need to be added to know when the transform
50798 // is profitable. Endianness would also have to be considered.
50799
50800 SDValue Addr, VecIndex;
50801 Align Alignment;
50802 unsigned Offset;
50803 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50804 return SDValue();
50805
50806 // Extract the one scalar element that is actually being stored.
50807 SDLoc DL(MS);
50808 SDValue Value = MS->getValue();
50809 EVT VT = Value.getValueType();
50810 EVT EltVT = VT.getVectorElementType();
50811 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50812 EltVT = MVT::f64;
50813 EVT CastVT = VT.changeVectorElementType(EltVT);
50814 Value = DAG.getBitcast(CastVT, Value);
50815 }
50816 SDValue Extract =
50817 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50818
50819 // Store that element at the appropriate offset from the base pointer.
50820 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50821 MS->getPointerInfo().getWithOffset(Offset),
50822 Alignment, MS->getMemOperand()->getFlags());
50823}
50824
50825 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50826 TargetLowering::DAGCombinerInfo &DCI,
50827 const X86Subtarget &Subtarget) {
50828 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50829 if (Mst->isCompressingStore())
50830 return SDValue();
50831
50832 EVT VT = Mst->getValue().getValueType();
50833 SDLoc dl(Mst);
50834 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50835
50836 if (Mst->isTruncatingStore())
50837 return SDValue();
50838
50839 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50840 return ScalarStore;
50841
50842 // If the mask value has been legalized to a non-boolean vector, try to
50843 // simplify ops leading up to it. We only demand the MSB of each lane.
50844 SDValue Mask = Mst->getMask();
50845 if (Mask.getScalarValueSizeInBits() != 1) {
50846 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50847 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50848 if (N->getOpcode() != ISD::DELETED_NODE)
50849 DCI.AddToWorklist(N);
50850 return SDValue(N, 0);
50851 }
50852 if (SDValue NewMask =
50853 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50854 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50855 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50856 Mst->getMemoryVT(), Mst->getMemOperand(),
50857 Mst->getAddressingMode());
50858 }
50859
50860 SDValue Value = Mst->getValue();
50861 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50862 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50863 Mst->getMemoryVT())) {
50864 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50865 Mst->getBasePtr(), Mst->getOffset(), Mask,
50866 Mst->getMemoryVT(), Mst->getMemOperand(),
50867 Mst->getAddressingMode(), true);
50868 }
50869
50870 return SDValue();
50871}
50872
50873 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50874 TargetLowering::DAGCombinerInfo &DCI,
50875 const X86Subtarget &Subtarget) {
50876 StoreSDNode *St = cast<StoreSDNode>(N);
50877 EVT StVT = St->getMemoryVT();
50878 SDLoc dl(St);
50879 SDValue StoredVal = St->getValue();
50880 EVT VT = StoredVal.getValueType();
50881 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50882
50883 // Convert a store of vXi1 into a store of iX and a bitcast.
50884 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50885 VT.getVectorElementType() == MVT::i1) {
50886
50887 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50888 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50889
50890 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50891 St->getPointerInfo(), St->getOriginalAlign(),
50892 St->getMemOperand()->getFlags());
50893 }
50894
50895 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50896 // This will avoid a copy to k-register.
50897 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50898 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50899 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50900 SDValue Val = StoredVal.getOperand(0);
50901 // We must store zeros to the unused bits.
50902 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50903 return DAG.getStore(St->getChain(), dl, Val,
50904 St->getBasePtr(), St->getPointerInfo(),
50905 St->getOriginalAlign(),
50906 St->getMemOperand()->getFlags());
50907 }
50908
50909 // Widen v2i1/v4i1 stores to v8i1.
50910 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50911 Subtarget.hasAVX512()) {
50912 unsigned NumConcats = 8 / VT.getVectorNumElements();
50913 // We must store zeros to the unused bits.
50914 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50915 Ops[0] = StoredVal;
50916 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50917 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50918 St->getPointerInfo(), St->getOriginalAlign(),
50919 St->getMemOperand()->getFlags());
50920 }
50921
50922 // Turn vXi1 stores of constants into a scalar store.
50923 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50924 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50925 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50926 // If it's a v64i1 store without 64-bit support, we need two stores.
50927 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50928 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50929 StoredVal->ops().slice(0, 32));
50930 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50931 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50932 StoredVal->ops().slice(32, 32));
50933 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50934
50935 SDValue Ptr0 = St->getBasePtr();
50936 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::getFixed(4), dl);
50937
50938 SDValue Ch0 =
50939 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50940 St->getOriginalAlign(),
50941 St->getMemOperand()->getFlags());
50942 SDValue Ch1 =
50943 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50944 St->getPointerInfo().getWithOffset(4),
50945 St->getOriginalAlign(),
50946 St->getMemOperand()->getFlags());
50947 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50948 }
50949
50950 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50951 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50952 St->getPointerInfo(), St->getOriginalAlign(),
50953 St->getMemOperand()->getFlags());
50954 }
50955
50956 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50957 // Sandy Bridge, perform two 16-byte stores.
50958 unsigned Fast;
50959 if (VT.is256BitVector() && StVT == VT &&
50960 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50961 *St->getMemOperand(), &Fast) &&
50962 !Fast) {
50963 unsigned NumElems = VT.getVectorNumElements();
50964 if (NumElems < 2)
50965 return SDValue();
50966
50967 return splitVectorStore(St, DAG);
50968 }
50969
50970 // Split under-aligned vector non-temporal stores.
50971 if (St->isNonTemporal() && StVT == VT &&
50972 St->getAlign().value() < VT.getStoreSize()) {
50973 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50974 // vectors or the legalizer can scalarize it to use MOVNTI.
50975 if (VT.is256BitVector() || VT.is512BitVector()) {
50976 unsigned NumElems = VT.getVectorNumElements();
50977 if (NumElems < 2)
50978 return SDValue();
50979 return splitVectorStore(St, DAG);
50980 }
50981
50982 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50983 // to use MOVNTI.
50984 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50985 MVT NTVT = Subtarget.hasSSE4A()
50986 ? MVT::v2f64
50987 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50988 return scalarizeVectorStore(St, NTVT, DAG);
50989 }
50990 }
50991
50992 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50993 // supported, but AVX512F is, by extending to v16i32 and truncating.
50994 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50995 St->getValue().getOpcode() == ISD::TRUNCATE &&
50996 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50997 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50998 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50999 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51000 St->getValue().getOperand(0));
51001 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51002 MVT::v16i8, St->getMemOperand());
51003 }
51004
51005 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51006 if (!St->isTruncatingStore() &&
51007 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51008 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51009 StoredVal.hasOneUse() &&
51010 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51011 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51012 return EmitTruncSStore(IsSigned, St->getChain(),
51013 dl, StoredVal.getOperand(0), St->getBasePtr(),
51014 VT, St->getMemOperand(), DAG);
51015 }
51016
51017 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51018 if (!St->isTruncatingStore()) {
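    // Matcher for a store of a single truncated lane: an optional one-use
    // TRUNCATE wrapping an EXTRACT_VECTOR_ELT/PEXTRW of element 0 from a
    // one-use vector; returns that source vector so a VTRUNC feeding it can
    // be folded into a vector truncating store below.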
51019 auto IsExtractedElement = [](SDValue V) {
51020 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51021 V = V.getOperand(0);
51022 unsigned Opc = V.getOpcode();
51023 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51024 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51025 V.getOperand(0).hasOneUse())
51026 return V.getOperand(0);
51027 return SDValue();
51028 };
51029 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51030 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51031 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51032 SDValue Src = Trunc.getOperand(0);
51033 MVT DstVT = Trunc.getSimpleValueType();
51034 MVT SrcVT = Src.getSimpleValueType();
51035 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51036 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51037 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
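          // Only profitable when the truncated vector covers exactly the bits
          // being stored, so the extract+scalar store can be replaced by a
          // single vector truncating store of Src.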
51038 if (NumTruncBits == VT.getSizeInBits() &&
51039 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51040 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51041 TruncVT, St->getMemOperand());
51042 }
51043 }
51044 }
51045 }
51046
51047 // Optimize trunc store (of multiple scalars) to shuffle and store.
51048 // First, pack all of the elements in one place. Next, store to memory
51049 // in fewer chunks.
51050 if (St->isTruncatingStore() && VT.isVector()) {
51051 // Check if we can detect an AVG pattern from the truncation. If yes,
51052 // replace the trunc store by a normal store with the result of X86ISD::AVG
51053 // instruction.
51054 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51055 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51056 Subtarget, dl))
51057 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51058 St->getPointerInfo(), St->getOriginalAlign(),
51059 St->getMemOperand()->getFlags());
51060
51061 if (TLI.isTruncStoreLegal(VT, StVT)) {
51062 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51063 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51064 dl, Val, St->getBasePtr(),
51065 St->getMemoryVT(), St->getMemOperand(), DAG);
51066 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51067 DAG, dl))
51068 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51069 dl, Val, St->getBasePtr(),
51070 St->getMemoryVT(), St->getMemOperand(), DAG);
51071 }
51072
51073 return SDValue();
51074 }
51075
51076 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51077 unsigned AddrSpace = St->getAddressSpace();
51078 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51079 AddrSpace == X86AS::PTR32_UPTR) {
51080 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51081 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51082 SDValue Cast =
51083 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51084 return DAG.getTruncStore(
51085 St->getChain(), dl, StoredVal, Cast, St->getPointerInfo(), StVT,
51086 St->getOriginalAlign(), St->getMemOperand()->getFlags(),
51087 St->getAAInfo());
51088 }
51089 }
51090
51091 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51092 // the FP state in cases where an emms may be missing.
51093 // A preferable solution to the general problem is to figure out the right
51094 // places to insert EMMS. This qualifies as a quick hack.
51095
51096 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51097 if (VT.getSizeInBits() != 64)
51098 return SDValue();
51099
51100 const Function &F = DAG.getMachineFunction().getFunction();
51101 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51102 bool F64IsLegal =
51103 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51104
51105 if (!F64IsLegal || Subtarget.is64Bit())
51106 return SDValue();
51107
51108 if (VT == MVT::i64 && isa<LoadSDNode>(St->getValue()) &&
51109 cast<LoadSDNode>(St->getValue())->isSimple() &&
51110 St->getChain().hasOneUse() && St->isSimple()) {
51111 auto *Ld = cast<LoadSDNode>(St->getValue());
51112
51113 if (!ISD::isNormalLoad(Ld))
51114 return SDValue();
51115
51116 // Avoid the transformation if there are multiple uses of the loaded value.
51117 if (!Ld->hasNUsesOfValue(1, 0))
51118 return SDValue();
51119
51120 SDLoc LdDL(Ld);
51121 SDLoc StDL(N);
51122 // Lower to a single movq load/store pair.
51123 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51124 Ld->getBasePtr(), Ld->getMemOperand());
51125
51126 // Make sure new load is placed in same chain order.
51127 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51128 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51129 St->getMemOperand());
51130 }
51131
51132 // This is similar to the above case, but here we handle a scalar 64-bit
51133 // integer store that is extracted from a vector on a 32-bit target.
51134 // If we have SSE2, then we can treat it like a floating-point double
51135 // to get past legalization. The execution dependencies fixup pass will
51136 // choose the optimal machine instruction for the store if this really is
51137 // an integer or v2f32 rather than an f64.
51138 if (VT == MVT::i64 &&
51139 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
51140 SDValue OldExtract = St->getOperand(1);
51141 SDValue ExtOp0 = OldExtract.getOperand(0);
51142 unsigned VecSize = ExtOp0.getValueSizeInBits();
51143 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
51144 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
51145 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
51146 BitCast, OldExtract.getOperand(1));
51147 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
51148 St->getPointerInfo(), St->getOriginalAlign(),
51149 St->getMemOperand()->getFlags());
51150 }
51151
51152 return SDValue();
51153}
51154
51155static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
51156 TargetLowering::DAGCombinerInfo &DCI,
51157 const X86Subtarget &Subtarget) {
51158 auto *St = cast<MemIntrinsicSDNode>(N);
51159
51160 SDValue StoredVal = N->getOperand(1);
51161 MVT VT = StoredVal.getSimpleValueType();
51162 EVT MemVT = St->getMemoryVT();
51163
51164 // Figure out which elements we demand.
51165 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
51166 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
51167
51168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51169 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
51170 if (N->getOpcode() != ISD::DELETED_NODE)
51171 DCI.AddToWorklist(N);
51172 return SDValue(N, 0);
51173 }
51174
51175 return SDValue();
51176}
51177
51178/// Return 'true' if this vector operation is "horizontal"
51179/// and return the operands for the horizontal operation in LHS and RHS. A
51180/// horizontal operation performs the binary operation on successive elements
51181/// of its first operand, then on successive elements of its second operand,
51182/// returning the resulting values in a vector. For example, if
51183/// A = < float a0, float a1, float a2, float a3 >
51184/// and
51185/// B = < float b0, float b1, float b2, float b3 >
51186/// then the result of doing a horizontal operation on A and B is
51187/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
51188/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
51189/// A horizontal-op B, for some already available A and B, and if so then LHS is
51190/// set to A, RHS to B, and the routine returns 'true'.
51191static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
51192 SelectionDAG &DAG, const X86Subtarget &Subtarget,
51193 bool IsCommutative,
51194 SmallVectorImpl<int> &PostShuffleMask) {
51195 // If either operand is undef, bail out. The binop should be simplified.
51196 if (LHS.isUndef() || RHS.isUndef())
51197 return false;
51198
51199 // Look for the following pattern:
51200 // A = < float a0, float a1, float a2, float a3 >
51201 // B = < float b0, float b1, float b2, float b3 >
51202 // and
51203 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
51204 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
51205 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
51206 // which is A horizontal-op B.
51207
51208 MVT VT = LHS.getSimpleValueType();
51209 assert((VT.is128BitVector() || VT.is256BitVector()) &&
51210 "Unsupported vector type for horizontal add/sub");
51211 unsigned NumElts = VT.getVectorNumElements();
51212
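  // Helper: peek through bitcasts and decode Op as a target shuffle, returning
  // up to two source vectors in N0/N1 and a shuffle mask rescaled to NumElts
  // elements. A 256-bit source reached through an EXTRACT_SUBVECTOR at index 0
  // is split in half and only the low part of the rescaled mask is kept.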
51213 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
51214 SmallVectorImpl<int> &ShuffleMask) {
51215 bool UseSubVector = false;
51216 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
51217 Op.getOperand(0).getValueType().is256BitVector() &&
51218 llvm::isNullConstant(Op.getOperand(1))) {
51219 Op = Op.getOperand(0);
51220 UseSubVector = true;
51221 }
51222 SmallVector<SDValue, 2> SrcOps;
51223 SmallVector<int, 16> SrcMask, ScaledMask;
51224 SDValue BC = peekThroughBitcasts(Op);
51225 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
51226 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
51227 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
51228 })) {
51229 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
51230 if (!UseSubVector && SrcOps.size() <= 2 &&
51231 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
51232 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
51233 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
51234 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
51235 }
51236 if (UseSubVector && SrcOps.size() == 1 &&
51237 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
51238 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
51239 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
51240 ShuffleMask.assign(Mask.begin(), Mask.end());
51241 }
51242 }
51243 };
51244
51245 // View LHS in the form
51246 // LHS = VECTOR_SHUFFLE A, B, LMask
51247 // If LHS is not a shuffle, then pretend it is the identity shuffle:
51248 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
51249 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
51250 SDValue A, B;
51251 SmallVector<int, 16> LMask;
51252 GetShuffle(LHS, A, B, LMask);
51253
51254 // Likewise, view RHS in the form
51255 // RHS = VECTOR_SHUFFLE C, D, RMask
51256 SDValue C, D;
51257 SmallVector<int, 16> RMask;
51258 GetShuffle(RHS, C, D, RMask);
51259
51260 // At least one of the operands should be a vector shuffle.
51261 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
51262 if (NumShuffles == 0)
51263 return false;
51264
51265 if (LMask.empty()) {
51266 A = LHS;
51267 for (unsigned i = 0; i != NumElts; ++i)
51268 LMask.push_back(i);
51269 }
51270
51271 if (RMask.empty()) {
51272 C = RHS;
51273 for (unsigned i = 0; i != NumElts; ++i)
51274 RMask.push_back(i);
51275 }
51276
51277 // If we have a unary mask, ensure the other op is set to null.
51278 if (isUndefOrInRange(LMask, 0, NumElts))
51279 B = SDValue();
51280 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
51281 A = SDValue();
51282
51283 if (isUndefOrInRange(RMask, 0, NumElts))
51284 D = SDValue();
51285 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51286 C = SDValue();
51287
51288 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51289 // RHS operands and shuffle mask.
51290 if (A != C) {
51291 std::swap(C, D);
51292 ShuffleVectorSDNode::commuteMask(RMask);
51293 }
51294 // Check that the shuffles are both shuffling the same vectors.
51295 if (!(A == C && B == D))
51296 return false;
51297
51298 PostShuffleMask.clear();
51299 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51300
51301 // LHS and RHS are now:
51302 // LHS = shuffle A, B, LMask
51303 // RHS = shuffle A, B, RMask
51304 // Check that the masks correspond to performing a horizontal operation.
51305 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51306 // so we just repeat the inner loop if this is a 256-bit op.
51307 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51308 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51309 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51310 assert((NumEltsPer128BitChunk % 2 == 0) &&
51311 "Vector type should have an even number of elements in each lane");
51312 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51313 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51314 // Ignore undefined components.
51315 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51316 if (LIdx < 0 || RIdx < 0 ||
51317 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51318 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51319 continue;
51320
51321 // Check that successive odd/even elements are being operated on. If not,
51322 // this is not a horizontal operation.
51323 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51324 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51325 return false;
51326
51327 // Compute the post-shuffle mask index based on where the element
51328 // is stored in the HOP result, and where it needs to be moved to.
51329 int Base = LIdx & ~1u;
51330 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51331 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
51332
51333 // The low half of the 128-bit result must choose from A.
51334 // The high half of the 128-bit result must choose from B,
51335 // unless B is undef. In that case, we are always choosing from A.
51336 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51337 Index += NumEltsPer64BitChunk;
51338 PostShuffleMask[i + j] = Index;
51339 }
51340 }
51341
51342 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51343 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51344
51345 bool IsIdentityPostShuffle =
51346 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51347 if (IsIdentityPostShuffle)
51348 PostShuffleMask.clear();
51349
51350 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51351 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51352 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51353 return false;
51354
51355 // If the source nodes are already used in HorizOps then always accept this.
51356 // Shuffle folding should merge these back together.
51357 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51358 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51359 });
51360 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51361 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51362 });
51363 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51364
51365 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51366 // shuffle the result.
51367 if (!ForceHorizOp &&
51368 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51369 (NumShuffles < 2 || !IsIdentityPostShuffle),
51370 DAG, Subtarget))
51371 return false;
51372
51373 LHS = DAG.getBitcast(VT, NewLHS);
51374 RHS = DAG.getBitcast(VT, NewRHS);
51375 return true;
51376}
51377
51378// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51379static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51380 const X86Subtarget &Subtarget) {
51381 EVT VT = N->getValueType(0);
51382 unsigned Opcode = N->getOpcode();
51383 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51384 SmallVector<int, 8> PostShuffleMask;
51385
51386 switch (Opcode) {
51387 case ISD::FADD:
51388 case ISD::FSUB:
51389 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51390 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51391 SDValue LHS = N->getOperand(0);
51392 SDValue RHS = N->getOperand(1);
51393 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51394 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51395 PostShuffleMask)) {
51396 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51397 if (!PostShuffleMask.empty())
51398 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51399 DAG.getUNDEF(VT), PostShuffleMask);
51400 return HorizBinOp;
51401 }
51402 }
51403 break;
51404 case ISD::ADD:
51405 case ISD::SUB:
51406 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51407 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51408 SDValue LHS = N->getOperand(0);
51409 SDValue RHS = N->getOperand(1);
51410 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51411 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51412 PostShuffleMask)) {
51413 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51414 ArrayRef<SDValue> Ops) {
51415 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51416 };
51417 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51418 {LHS, RHS}, HOpBuilder);
51419 if (!PostShuffleMask.empty())
51420 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51421 DAG.getUNDEF(VT), PostShuffleMask);
51422 return HorizBinOp;
51423 }
51424 }
51425 break;
51426 }
51427
51428 return SDValue();
51429}
51430
51431// Try to combine the following nodes
51432// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51433// <i32 -2147483648[float -0.000000e+00]> 0
51434// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51435// <(load 4 from constant-pool)> t0, t29
51436// [t30: v16i32 = bitcast t27]
51437// t6: v16i32 = xor t7, t27[t30]
51438// t11: v16f32 = bitcast t6
51439// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51440// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51441// t22: v16f32 = bitcast t7
51442// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51443// t24: v32f16 = bitcast t23
51444static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51445 const X86Subtarget &Subtarget) {
51446 EVT VT = N->getValueType(0);
51447 SDValue LHS = N->getOperand(0);
51448 SDValue RHS = N->getOperand(1);
51449 int CombineOpcode =
51450 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51451 auto combineConjugation = [&](SDValue &r) {
51452 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51453 SDValue XOR = LHS.getOperand(0);
51454 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51455 KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1));
51456 if (XORRHS.isConstant()) {
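          // Each 32-bit lane holds one packed f16 complex value (real in the
          // low half, imaginary in the high half), so XORing the lane's top
          // bit flips only the imaginary sign, i.e. it conjugates the operand.
          // Fold that conjugation into the multiply by swapping
          // VFMULC <-> VFCMULC.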
51457 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51458 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51459 if ((XORRHS.getBitWidth() == 32 &&
51460 XORRHS.getConstant() == ConjugationInt32) ||
51461 (XORRHS.getBitWidth() == 64 &&
51462 XORRHS.getConstant() == ConjugationInt64)) {
51463 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51464 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51465 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51466 r = DAG.getBitcast(VT, FCMulC);
51467 return true;
51468 }
51469 }
51470 }
51471 }
51472 return false;
51473 };
51474 SDValue Res;
51475 if (combineConjugation(Res))
51476 return Res;
51477 std::swap(LHS, RHS);
51478 if (combineConjugation(Res))
51479 return Res;
51480 return Res;
51481}
51482
51483// Try to combine the following nodes:
51484// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51485static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51486 const X86Subtarget &Subtarget) {
51487 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51488 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51489 Flags.hasAllowContract();
51490 };
51491
51492 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51493 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51494 Flags.hasNoSignedZeros();
51495 };
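  // Matches a vector whose 32-bit lanes are all 0x80008000, i.e. every packed
  // f16 element is -0.0. Adding -0.0 is an identity, so a VF(C)MADDC with such
  // an accumulator behaves exactly like a complex multiply.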
51496 auto IsVectorAllNegativeZero = [&DAG](SDValue Op) {
51497 APInt AI = APInt(32, 0x80008000, true);
51498 KnownBits Bits = DAG.computeKnownBits(Op);
51499 return Bits.getBitWidth() == 32 && Bits.isConstant() &&
51500 Bits.getConstant() == AI;
51501 };
51502
51503 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51504 !AllowContract(N->getFlags()))
51505 return SDValue();
51506
51507 EVT VT = N->getValueType(0);
51508 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51509 return SDValue();
51510
51511 SDValue LHS = N->getOperand(0);
51512 SDValue RHS = N->getOperand(1);
51513 bool IsConj;
51514 SDValue FAddOp1, MulOp0, MulOp1;
51515 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51516 &IsVectorAllNegativeZero,
51517 &HasNoSignedZero](SDValue N) -> bool {
51518 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51519 return false;
51520 SDValue Op0 = N.getOperand(0);
51521 unsigned Opcode = Op0.getOpcode();
51522 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51523 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51524 MulOp0 = Op0.getOperand(0);
51525 MulOp1 = Op0.getOperand(1);
51526 IsConj = Opcode == X86ISD::VFCMULC;
51527 return true;
51528 }
51529 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51530 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51531 HasNoSignedZero(Op0->getFlags())) ||
51532 IsVectorAllNegativeZero(Op0->getOperand(2)))) {
51533 MulOp0 = Op0.getOperand(0);
51534 MulOp1 = Op0.getOperand(1);
51535 IsConj = Opcode == X86ISD::VFCMADDC;
51536 return true;
51537 }
51538 }
51539 return false;
51540 };
51541
51542 if (GetCFmulFrom(LHS))
51543 FAddOp1 = RHS;
51544 else if (GetCFmulFrom(RHS))
51545 FAddOp1 = LHS;
51546 else
51547 return SDValue();
51548
51549 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51550 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51551 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51552 // FIXME: How do we handle when fast math flags of FADD are different from
51553 // CFMUL's?
51554 SDValue CFmul =
51555 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51556 return DAG.getBitcast(VT, CFmul);
51557}
51558
51559/// Do target-specific dag combines on floating-point adds/subs.
51560static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51561 const X86Subtarget &Subtarget) {
51562 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51563 return HOp;
51564
51565 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51566 return COp;
51567
51568 return SDValue();
51569}
51570
51571static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG,
51572 const X86Subtarget &Subtarget) {
51573 EVT VT = N->getValueType(0);
51574 SDValue Src = N->getOperand(0);
51575 EVT SrcVT = Src.getValueType();
51576 SDLoc DL(N);
51577
51578 if (!Subtarget.hasDQI() || !Subtarget.hasVLX() || VT != MVT::v2i64 ||
51579 SrcVT != MVT::v2f32)
51580 return SDValue();
51581
51582 return DAG.getNode(X86ISD::CVTP2SI, DL, VT,
51583 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, Src,
51584 DAG.getUNDEF(SrcVT)));
51585}
51586
51587/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51588/// the codegen.
51589/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51590/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51591/// anything that is guaranteed to be transformed by DAGCombiner.
51592static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51593 const X86Subtarget &Subtarget,
51594 const SDLoc &DL) {
51595 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51596 SDValue Src = N->getOperand(0);
51597 unsigned SrcOpcode = Src.getOpcode();
51598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51599
51600 EVT VT = N->getValueType(0);
51601 EVT SrcVT = Src.getValueType();
51602
51603 auto IsFreeTruncation = [VT](SDValue Op) {
51604 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51605
51606 // See if this has been extended from a smaller/equal size to
51607 // the truncation size, allowing a truncation to combine with the extend.
51608 unsigned Opcode = Op.getOpcode();
51609 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51610 Opcode == ISD::ZERO_EXTEND) &&
51611 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51612 return true;
51613
51614 // See if this is a single use constant which can be constant folded.
51615 // NOTE: We don't peek through bitcasts here because there is currently
51616 // no support for constant folding truncate+bitcast+vector_of_constants. So
51617 // we'll just end up with a truncate on both operands which will
51618 // get turned back into (truncate (binop)) causing an infinite loop.
51619 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51620 };
51621
51622 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51623 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51624 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51625 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51626 };
51627
51628 // Don't combine if the operation has other uses.
51629 if (!Src.hasOneUse())
51630 return SDValue();
51631
51632 // Only support vector truncation for now.
51633 // TODO: i64 scalar math would benefit as well.
51634 if (!VT.isVector())
51635 return SDValue();
51636
51637 // In most cases its only worth pre-truncating if we're only facing the cost
51638 // of one truncation.
51639 // i.e. if one of the inputs will constant fold or the input is repeated.
51640 switch (SrcOpcode) {
51641 case ISD::MUL:
51642 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51643 // better to truncate if we have the chance.
51644 if (SrcVT.getScalarType() == MVT::i64 &&
51645 TLI.isOperationLegal(SrcOpcode, VT) &&
51646 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51647 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51648 [[fallthrough]];
51649 case ISD::AND:
51650 case ISD::XOR:
51651 case ISD::OR:
51652 case ISD::ADD:
51653 case ISD::SUB: {
51654 SDValue Op0 = Src.getOperand(0);
51655 SDValue Op1 = Src.getOperand(1);
51656 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51657 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51658 return TruncateArithmetic(Op0, Op1);
51659 break;
51660 }
51661 }
51662
51663 return SDValue();
51664}
51665
51666// Try to form a MULHU or MULHS node by looking for
51667// (trunc (srl (mul ext, ext), 16))
51668// TODO: This is X86 specific because we want to be able to handle wide types
51669// before type legalization. But we can only do it if the vector will be
51670// legalized via widening/splitting. Type legalization can't handle promotion
51671// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51672// combiner.
51673static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51674 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51675 // First instruction should be a right shift of a multiply.
51676 if (Src.getOpcode() != ISD::SRL ||
51677 Src.getOperand(0).getOpcode() != ISD::MUL)
51678 return SDValue();
51679
51680 if (!Subtarget.hasSSE2())
51681 return SDValue();
51682
51683 // Only handle vXi16 types that are at least 128-bits unless they will be
51684 // widened.
51685 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51686 return SDValue();
51687
51688 // Input type should be at least vXi32.
51689 EVT InVT = Src.getValueType();
51690 if (InVT.getVectorElementType().getSizeInBits() < 32)
51691 return SDValue();
51692
51693 // Need a shift by 16.
51694 APInt ShiftAmt;
51695 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51696 ShiftAmt != 16)
51697 return SDValue();
51698
51699 SDValue LHS = Src.getOperand(0).getOperand(0);
51700 SDValue RHS = Src.getOperand(0).getOperand(1);
51701
51702 // Count leading sign/zero bits on both inputs - if there are enough then
51703 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51704 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51705 // truncations may actually be free by peeking through to the ext source.
51706 auto IsSext = [&DAG](SDValue V) {
51707 return DAG.ComputeMaxSignificantBits(V) <= 16;
51708 };
51709 auto IsZext = [&DAG](SDValue V) {
51710 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51711 };
51712
51713 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51714 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51715 if (!IsSigned && !IsUnsigned)
51716 return SDValue();
51717
51718 // Check if both inputs are extensions, which will be removed by truncation.
51719 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51720 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51721 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51722 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51723 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51724 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51725
51726 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51727 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51728 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51729 // will have to split anyway.
51730 unsigned InSizeInBits = InVT.getSizeInBits();
51731 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51732 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51733 (InSizeInBits % 16) == 0) {
51734 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51735 InVT.getSizeInBits() / 16);
51736 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51737 DAG.getBitcast(BCVT, RHS));
51738 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51739 }
51740
51741 // Truncate back to source type.
51742 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51743 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51744
51745 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51746 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51747}
51748
51749// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51750// from one vector with signed bytes from another vector, adds together
51751// adjacent pairs of 16-bit products, and saturates the result before
51752// truncating to 16-bits.
51753//
51754// Which looks something like this:
51755// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51756// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51757static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51758 const X86Subtarget &Subtarget,
51759 const SDLoc &DL) {
51760 if (!VT.isVector() || !Subtarget.hasSSSE3())
51761 return SDValue();
51762
51763 unsigned NumElems = VT.getVectorNumElements();
51764 EVT ScalarVT = VT.getVectorElementType();
51765 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51766 return SDValue();
51767
51768 SDValue SSatVal = detectSSatPattern(In, VT);
51769 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51770 return SDValue();
51771
51772 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51773 // of multiplies from even/odd elements.
51774 SDValue N0 = SSatVal.getOperand(0);
51775 SDValue N1 = SSatVal.getOperand(1);
51776
51777 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51778 return SDValue();
51779
51780 SDValue N00 = N0.getOperand(0);
51781 SDValue N01 = N0.getOperand(1);
51782 SDValue N10 = N1.getOperand(0);
51783 SDValue N11 = N1.getOperand(1);
51784
51785 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51786 // Canonicalize zero_extend to LHS.
51787 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51788 std::swap(N00, N01);
51789 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51790 std::swap(N10, N11);
51791
51792 // Ensure we have a zero_extend and a sign_extend.
51793 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51794 N01.getOpcode() != ISD::SIGN_EXTEND ||
51795 N10.getOpcode() != ISD::ZERO_EXTEND ||
51796 N11.getOpcode() != ISD::SIGN_EXTEND)
51797 return SDValue();
51798
51799 // Peek through the extends.
51800 N00 = N00.getOperand(0);
51801 N01 = N01.getOperand(0);
51802 N10 = N10.getOperand(0);
51803 N11 = N11.getOperand(0);
51804
51805 // Ensure the extend is from vXi8.
51806 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51807 N01.getValueType().getVectorElementType() != MVT::i8 ||
51808 N10.getValueType().getVectorElementType() != MVT::i8 ||
51809 N11.getValueType().getVectorElementType() != MVT::i8)
51810 return SDValue();
51811
51812 // All inputs should be build_vectors.
51813 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51814 N01.getOpcode() != ISD::BUILD_VECTOR ||
51815 N10.getOpcode() != ISD::BUILD_VECTOR ||
51816 N11.getOpcode() != ISD::BUILD_VECTOR)
51817 return SDValue();
51818
51819 // N00/N10 are zero extended. N01/N11 are sign extended.
51820
51821 // For each element, we need to ensure we have an odd element from one vector
51822 // multiplied by the odd element of another vector and the even element from
51823 // one of the same vectors being multiplied by the even element from the
51824 // other vector. So we need to make sure for each element i, this operator
51825 // is being performed:
51826 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51827 SDValue ZExtIn, SExtIn;
51828 for (unsigned i = 0; i != NumElems; ++i) {
51829 SDValue N00Elt = N00.getOperand(i);
51830 SDValue N01Elt = N01.getOperand(i);
51831 SDValue N10Elt = N10.getOperand(i);
51832 SDValue N11Elt = N11.getOperand(i);
51833 // TODO: Be more tolerant to undefs.
51834 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51835 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51836 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51837 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51838 return SDValue();
51839 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51840 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51841 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51842 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51843 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51844 return SDValue();
51845 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51846 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51847 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51848 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51849 // Add is commutative so indices can be reordered.
51850 if (IdxN00 > IdxN10) {
51851 std::swap(IdxN00, IdxN10);
51852 std::swap(IdxN01, IdxN11);
51853 }
51854 // N0 indices must be the even element. N1 indices must be the next odd element.
51855 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51856 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51857 return SDValue();
51858 SDValue N00In = N00Elt.getOperand(0);
51859 SDValue N01In = N01Elt.getOperand(0);
51860 SDValue N10In = N10Elt.getOperand(0);
51861 SDValue N11In = N11Elt.getOperand(0);
51862 // First time we find an input capture it.
51863 if (!ZExtIn) {
51864 ZExtIn = N00In;
51865 SExtIn = N01In;
51866 }
51867 if (ZExtIn != N00In || SExtIn != N01In ||
51868 ZExtIn != N10In || SExtIn != N11In)
51869 return SDValue();
51870 }
51871
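  // PMADDUBSW consumes twice as many i8 inputs as it produces i16 results; if
  // the captured inputs are wider than that, use only the low NumElems * 2
  // bytes of each.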
51872 auto ExtractVec = [&DAG, &DL, NumElems](SDValue &Ext) {
51873 EVT ExtVT = Ext.getValueType();
51874 if (ExtVT.getVectorNumElements() != NumElems * 2) {
51875 MVT NVT = MVT::getVectorVT(MVT::i8, NumElems * 2);
51876 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, Ext,
51877 DAG.getIntPtrConstant(0, DL));
51878 }
51879 };
51880 ExtractVec(ZExtIn);
51881 ExtractVec(SExtIn);
51882
51883 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51884 ArrayRef<SDValue> Ops) {
51885 // Shrink by adding truncate nodes and let DAGCombine fold with the
51886 // sources.
51887 EVT InVT = Ops[0].getValueType();
51888 assert(InVT.getScalarType() == MVT::i8 &&
51889 "Unexpected scalar element type");
51890 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51891 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51892 InVT.getVectorNumElements() / 2);
51893 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51894 };
51895 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51896 PMADDBuilder);
51897}
51898
51898
51899static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51900 const X86Subtarget &Subtarget) {
51901 EVT VT = N->getValueType(0);
51902 SDValue Src = N->getOperand(0);
51903 SDLoc DL(N);
51904
51905 // Attempt to pre-truncate inputs to arithmetic ops instead.
51906 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51907 return V;
51908
51909 // Try to detect AVG pattern first.
51910 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51911 return Avg;
51912
51913 // Try to detect PMADD
51914 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51915 return PMAdd;
51916
51917 // Try to combine truncation with signed/unsigned saturation.
51918 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51919 return Val;
51920
51921 // Try to combine PMULHUW/PMULHW for vXi16.
51922 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51923 return V;
51924
51925 // The bitcast source is a direct mmx result.
51926 // Detect bitcasts between i32 and x86mmx.
51927 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51928 SDValue BCSrc = Src.getOperand(0);
51929 if (BCSrc.getValueType() == MVT::x86mmx)
51930 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51931 }
51932
51933 // Try to combine (trunc (vNi64 (lrint x))) to (vNi32 (lrint x)).
51934 if (Src.getOpcode() == ISD::LRINT && VT.getScalarType() == MVT::i32 &&
51935 Src.hasOneUse())
51936 return DAG.getNode(ISD::LRINT, DL, VT, Src.getOperand(0));
51937
51938 return SDValue();
51939}
51940
51941static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51942 TargetLowering::DAGCombinerInfo &DCI) {
51943 EVT VT = N->getValueType(0);
51944 SDValue In = N->getOperand(0);
51945 SDLoc DL(N);
51946
51947 if (SDValue SSatVal = detectSSatPattern(In, VT))
51948 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51949 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51950 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51951
51952 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51953 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51954 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51955 return SDValue(N, 0);
51956
51957 return SDValue();
51958}
51959
51960/// Returns the negated value if the node \p N flips sign of FP value.
51961///
51962/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51963/// or FSUB(0, x)
51964/// AVX512F does not have FXOR, so FNEG is lowered as
51965/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51966/// In this case we go through all bitcasts.
51967/// This also recognizes splat of a negated value and returns the splat of that
51968/// value.
51969static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51970 if (N->getOpcode() == ISD::FNEG)
51971 return N->getOperand(0);
51972
51973 // Don't recurse exponentially.
51974 if (Depth > SelectionDAG::MaxRecursionDepth)
51975 return SDValue();
51976
51977 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51978
51979 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51980 EVT VT = Op->getValueType(0);
51981
51982 // Make sure the element size doesn't change.
51983 if (VT.getScalarSizeInBits() != ScalarSize)
51984 return SDValue();
51985
51986 unsigned Opc = Op.getOpcode();
51987 switch (Opc) {
51988 case ISD::VECTOR_SHUFFLE: {
51989 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51990 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51991 if (!Op.getOperand(1).isUndef())
51992 return SDValue();
51993 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51994 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51995 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51996 cast<ShuffleVectorSDNode>(Op)->getMask());
51997 break;
51998 }
51999 case ISD::INSERT_VECTOR_ELT: {
52000 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
52001 // -V, INDEX).
52002 SDValue InsVector = Op.getOperand(0);
52003 SDValue InsVal = Op.getOperand(1);
52004 if (!InsVector.isUndef())
52005 return SDValue();
52006 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
52007 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
52008 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
52009 NegInsVal, Op.getOperand(2));
52010 break;
52011 }
52012 case ISD::FSUB:
52013 case ISD::XOR:
52014 case X86ISD::FXOR: {
52015 SDValue Op1 = Op.getOperand(1);
52016 SDValue Op0 = Op.getOperand(0);
52017
52018 // For XOR and FXOR, we want to check if constant
52019 // bits of Op1 are sign bit masks. For FSUB, we
52020 // have to check if constant bits of Op0 are sign
52021 // bit masks and hence we swap the operands.
52022 if (Opc == ISD::FSUB)
52023 std::swap(Op0, Op1);
52024
52025 APInt UndefElts;
52026 SmallVector<APInt, 16> EltBits;
52027 // Extract constant bits and see if they are all
52028 // sign bit masks. Ignore the undef elements.
52029 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
52030 /* AllowWholeUndefs */ true,
52031 /* AllowPartialUndefs */ false)) {
52032 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
52033 if (!UndefElts[I] && !EltBits[I].isSignMask())
52034 return SDValue();
52035
52036 // Only allow bitcast from correctly-sized constant.
52037 Op0 = peekThroughBitcasts(Op0);
52038 if (Op0.getScalarValueSizeInBits() == ScalarSize)
52039 return Op0;
52040 }
52041 break;
52042 } // case
52043 } // switch
52044
52045 return SDValue();
52046}
52047
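// Map an FMA-family opcode to the variant that absorbs a negated multiplicand
// (NegMul), a negated accumulator (NegAcc) and/or a negated result (NegRes),
// e.g. FMA becomes FNMADD when only the product is negated.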
52048static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
52049 bool NegRes) {
52050 if (NegMul) {
52051 switch (Opcode) {
52052 // clang-format off
52053 default: llvm_unreachable("Unexpected opcode");
52054 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
52055 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
52056 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
52057 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
52058 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
52059 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
52060 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
52061 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
52062 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
52063 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
52064 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
52065 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
52066 // clang-format on
52067 }
52068 }
52069
52070 if (NegAcc) {
52071 switch (Opcode) {
52072 // clang-format off
52073 default: llvm_unreachable("Unexpected opcode");
52074 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
52075 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
52076 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52077 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
52078 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
52079 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52080 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
52081 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
52082 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52083 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
52084 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
52085 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52086 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
52087 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
52088 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
52089 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
52090 // clang-format on
52091 }
52092 }
52093
52094 if (NegRes) {
52095 switch (Opcode) {
52096 // For accuracy reason, we never combine fneg and fma under strict FP.
52097 // clang-format off
52098 default: llvm_unreachable("Unexpected opcode");
52099 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
52100 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
52101 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
52102 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
52103 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
52104 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
52105 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
52106 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
52107 // clang-format on
52108 }
52109 }
52110
52111 return Opcode;
52112}
52113
52114/// Do target-specific dag combines on floating point negations.
52115static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
52116 TargetLowering::DAGCombinerInfo &DCI,
52117 const X86Subtarget &Subtarget) {
52118 EVT OrigVT = N->getValueType(0);
52119 SDValue Arg = isFNEG(DAG, N);
52120 if (!Arg)
52121 return SDValue();
52122
52123 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52124 EVT VT = Arg.getValueType();
52125 EVT SVT = VT.getScalarType();
52126 SDLoc DL(N);
52127
52128 // Let legalize expand this if it isn't a legal type yet.
52129 if (!TLI.isTypeLegal(VT))
52130 return SDValue();
52131
52132 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52133 // use of a constant by performing (-0 - A*B) instead.
52134 // FIXME: Check rounding control flags as well once it becomes available.
52135 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52136 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52137 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52138 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52139 Arg.getOperand(1), Zero);
52140 return DAG.getBitcast(OrigVT, NewNode);
52141 }
52142
52143 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52144 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52145 if (SDValue NegArg =
52146 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52147 return DAG.getBitcast(OrigVT, NegArg);
52148
52149 return SDValue();
52150}
52151
52152SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52153 bool LegalOperations,
52154 bool ForCodeSize,
52155 NegatibleCost &Cost,
52156 unsigned Depth) const {
52157 // fneg patterns are removable even if they have multiple uses.
52158 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52159 Cost = NegatibleCost::Cheaper;
52160 return DAG.getBitcast(Op.getValueType(), Arg);
52161 }
52162
52163 EVT VT = Op.getValueType();
52164 EVT SVT = VT.getScalarType();
52165 unsigned Opc = Op.getOpcode();
52166 SDNodeFlags Flags = Op.getNode()->getFlags();
52167 switch (Opc) {
52168 case ISD::FMA:
52169 case X86ISD::FMSUB:
52170 case X86ISD::FNMADD:
52171 case X86ISD::FNMSUB:
52172 case X86ISD::FMADD_RND:
52173 case X86ISD::FMSUB_RND:
52174 case X86ISD::FNMADD_RND:
52175 case X86ISD::FNMSUB_RND: {
52176 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52177 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52178 !isOperationLegal(ISD::FMA, VT))
52179 break;
52180
52181 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52182 // if it may have signed zeros.
52183 if (!Flags.hasNoSignedZeros())
52184 break;
52185
52186 // This is always negatible for free but we might be able to remove some
52187 // extra operand negations as well.
52188 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52189 for (int i = 0; i != 3; ++i)
52190 NewOps[i] = getCheaperNegatedExpression(
52191 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52192
52193 bool NegA = !!NewOps[0];
52194 bool NegB = !!NewOps[1];
52195 bool NegC = !!NewOps[2];
52196 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52197
52198 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52199 : NegatibleCost::Neutral;
52200
52201 // Fill in the non-negated ops with the original values.
52202 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52203 if (!NewOps[i])
52204 NewOps[i] = Op.getOperand(i);
52205 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52206 }
52207 case X86ISD::FRCP:
52208 if (SDValue NegOp0 =
52209 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52210 ForCodeSize, Cost, Depth + 1))
52211 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52212 break;
52213 }
52214
52215 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52216 ForCodeSize, Cost, Depth);
52217}
52218
52219static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52220 const X86Subtarget &Subtarget) {
52221 MVT VT = N->getSimpleValueType(0);
52222 // If we have integer vector types available, use the integer opcodes.
52223 if (!VT.isVector() || !Subtarget.hasSSE2())
52224 return SDValue();
52225
52226 SDLoc dl(N);
52227
52228 unsigned IntBits = VT.getScalarSizeInBits();
52229 MVT IntSVT = MVT::getIntegerVT(IntBits);
52230 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52231
52232 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52233 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52234 unsigned IntOpcode;
52235 switch (N->getOpcode()) {
52236 // clang-format off
52237 default: llvm_unreachable("Unexpected FP logic op");
52238 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52239 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52240 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52241 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52242 // clang-format on
52243 }
52244 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52245 return DAG.getBitcast(VT, IntOp);
52246}
52247
52248
52249/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52250static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52251 if (N->getOpcode() != ISD::XOR)
52252 return SDValue();
52253
52254 SDValue LHS = N->getOperand(0);
52255 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52256 return SDValue();
52257
52258 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52259 X86::CondCode(LHS->getConstantOperandVal(0)));
52260 SDLoc DL(N);
52261 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52262}
52263
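// Fold (BitWidth - 1) ^ CTLZ_ZERO_UNDEF(X), and the equivalent SUB form, into
// X86ISD::BSR: BSR returns the index of the most significant set bit, and
// ctlz_zero_undef(X) == BitWidth - 1 - bsr(X).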
52264static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
52265 const X86Subtarget &Subtarget) {
52266 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
52267 "Invalid opcode for combing with CTLZ");
52268 if (Subtarget.hasFastLZCNT())
52269 return SDValue();
52270
52271 EVT VT = N->getValueType(0);
52272 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
52273 (VT != MVT::i64 || !Subtarget.is64Bit()))
52274 return SDValue();
52275
52276 SDValue N0 = N->getOperand(0);
52277 SDValue N1 = N->getOperand(1);
52278
52279 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
52280 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
52281 return SDValue();
52282
52283 SDValue OpCTLZ;
52284 SDValue OpSizeTM1;
52285
52286 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
52287 OpCTLZ = N1;
52288 OpSizeTM1 = N0;
52289 } else if (N->getOpcode() == ISD::SUB) {
52290 return SDValue();
52291 } else {
52292 OpCTLZ = N0;
52293 OpSizeTM1 = N1;
52294 }
52295
52296 if (!OpCTLZ.hasOneUse())
52297 return SDValue();
52298 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
52299 if (!C)
52300 return SDValue();
52301
52302 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
52303 return SDValue();
52304 SDLoc DL(N);
52305 EVT OpVT = VT;
52306 SDValue Op = OpCTLZ.getOperand(0);
52307 if (VT == MVT::i8) {
52308 // Zero extend to i32 since there is no i8 BSR.
52309 OpVT = MVT::i32;
52310 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
52311 }
52312
52313 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
52314 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
52315 if (VT == MVT::i8)
52316 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
52317
52318 return Op;
52319}
52320
52321static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52322 TargetLowering::DAGCombinerInfo &DCI,
52323 const X86Subtarget &Subtarget) {
52324 SDValue N0 = N->getOperand(0);
52325 SDValue N1 = N->getOperand(1);
52326 EVT VT = N->getValueType(0);
52327
52328 // If this is SSE1 only convert to FXOR to avoid scalarization.
52329 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52330 return DAG.getBitcast(MVT::v4i32,
52331 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52332 DAG.getBitcast(MVT::v4f32, N0),
52333 DAG.getBitcast(MVT::v4f32, N1)));
52334 }
52335
52336 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52337 return Cmp;
52338
52339 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52340 return R;
52341
52342 if (SDValue R = combineBitOpWithShift(N, DAG))
52343 return R;
52344
52345 if (SDValue R = combineBitOpWithPACK(N, DAG))
52346 return R;
52347
52348 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52349 return FPLogic;
52350
52351 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
52352 return R;
52353
52354 if (DCI.isBeforeLegalizeOps())
52355 return SDValue();
52356
52357 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52358 return SetCC;
52359
52360 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
52361 return R;
52362
52363 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52364 return RV;
52365
52366 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52368 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52369 N0.getOperand(0).getValueType().isVector() &&
52370 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52371 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52372 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52373 N0.getOperand(0).getValueType()));
52374 }
52375
52376 // Handle AVX512 mask widening.
52377 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52378 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52379 VT.getVectorElementType() == MVT::i1 &&
52380 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52381 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52382 return DAG.getNode(
52383 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52384 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52385 N0.getOperand(2));
52386 }
52387
52388 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52389 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52390 // TODO: Under what circumstances could this be performed in DAGCombine?
52391 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52392 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52393 SDValue TruncExtSrc = N0.getOperand(0);
52394 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52395 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52396 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52397 SDLoc DL(N);
52398 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52399 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52400 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52401 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52402 }
52403 }
52404
52405 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
52406 return R;
52407
52408 return combineFneg(N, DAG, DCI, Subtarget);
52409}
52410
52411static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG,
52412 TargetLowering::DAGCombinerInfo &DCI,
52413 const X86Subtarget &Subtarget) {
52414 SDValue N0 = N->getOperand(0);
52415 EVT VT = N->getValueType(0);
52416
52417 // Convert a (iX bitreverse(bitcast(vXi1 X))) -> (iX bitcast(shuffle(X)))
52418 if (VT.isInteger() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse()) {
52419 SDValue Src = N0.getOperand(0);
52420 EVT SrcVT = Src.getValueType();
52421 if (SrcVT.isVector() && SrcVT.getScalarType() == MVT::i1 &&
52422 (DCI.isBeforeLegalize() ||
52423 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) &&
52424 Subtarget.hasSSSE3()) {
52425 unsigned NumElts = SrcVT.getVectorNumElements();
52426 SmallVector<int, 32> ReverseMask(NumElts);
52427 for (unsigned I = 0; I != NumElts; ++I)
52428 ReverseMask[I] = (NumElts - 1) - I;
52429 SDValue Rev =
52430 DAG.getVectorShuffle(SrcVT, SDLoc(N), Src, Src, ReverseMask);
52431 return DAG.getBitcast(VT, Rev);
52432 }
52433 }
52434
52435 return SDValue();
52436}
52437
52440 const X86Subtarget &Subtarget) {
52441 EVT VT = N->getValueType(0);
52442 unsigned NumBits = VT.getSizeInBits();
52443
52444 // TODO - Constant Folding.
52445
52446 // Simplify the inputs.
52447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52448 APInt DemandedMask(APInt::getAllOnes(NumBits));
52449 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52450 return SDValue(N, 0);
52451
52452 return SDValue();
52453}
52454
52455static bool isNullFPScalarOrVectorConst(SDValue V) {
52456 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52457}
52458
52459/// If a value is a scalar FP zero or a vector FP zero (potentially including
52460/// undefined elements), return a zero constant that may be used to fold away
52461/// that value. In the case of a vector, the returned constant will not contain
52462/// undefined elements even if the input parameter does. This makes it suitable
52463/// to be used as a replacement operand with operations (eg, bitwise-and) where
52464/// an undef should not propagate.
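/// For example, (v4f32 <0.0, undef, 0.0, undef>) is replaced by a fully
/// defined all-zeros v4f32 so that no undef lane can propagate into the user.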
52465static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52466 const X86Subtarget &Subtarget) {
52467 if (!isNullFPScalarOrVectorConst(V))
52468 return SDValue();
52469
52470 if (V.getValueType().isVector())
52471 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52472
52473 return V;
52474}
52475
52476static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52477 const X86Subtarget &Subtarget) {
52478 SDValue N0 = N->getOperand(0);
52479 SDValue N1 = N->getOperand(1);
52480 EVT VT = N->getValueType(0);
52481 SDLoc DL(N);
52482
52483 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52484 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52485 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52486 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52487 return SDValue();
52488
52489 auto isAllOnesConstantFP = [](SDValue V) {
52490 if (V.getSimpleValueType().isVector())
52491 return ISD::isBuildVectorAllOnes(V.getNode());
52492 auto *C = dyn_cast<ConstantFPSDNode>(V);
52493 return C && C->getConstantFPValue()->isAllOnesValue();
52494 };
52495
52496 // fand (fxor X, -1), Y --> fandn X, Y
52497 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52498 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52499
52500 // fand X, (fxor Y, -1) --> fandn Y, X
52501 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52502 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52503
52504 return SDValue();
52505}
52506
52507/// Do target-specific dag combines on X86ISD::FAND nodes.
52508static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52509 const X86Subtarget &Subtarget) {
52510 // FAND(0.0, x) -> 0.0
52511 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52512 return V;
52513
52514 // FAND(x, 0.0) -> 0.0
52515 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52516 return V;
52517
52518 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52519 return V;
52520
52521 return lowerX86FPLogicOp(N, DAG, Subtarget);
52522}
52523
52524/// Do target-specific dag combines on X86ISD::FANDN nodes.
52525static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52526 const X86Subtarget &Subtarget) {
52527 // FANDN(0.0, x) -> x
52528 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52529 return N->getOperand(1);
52530
52531 // FANDN(x, 0.0) -> 0.0
52532 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52533 return V;
52534
52535 return lowerX86FPLogicOp(N, DAG, Subtarget);
52536}
52537
52538/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52539static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52540 TargetLowering::DAGCombinerInfo &DCI,
52541 const X86Subtarget &Subtarget) {
52542 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52543
52544 // F[X]OR(0.0, x) -> x
52545 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52546 return N->getOperand(1);
52547
52548 // F[X]OR(x, 0.0) -> x
52549 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52550 return N->getOperand(0);
52551
52552 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52553 return NewVal;
52554
52555 return lowerX86FPLogicOp(N, DAG, Subtarget);
52556}
52557
52558/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52559static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52560 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52561
52562 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52563 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52564 !DAG.getTarget().Options.NoSignedZerosFPMath)
52565 return SDValue();
52566
52567 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
52568 // into FMINC and FMAXC, which are Commutative operations.
52569 unsigned NewOp = 0;
52570 switch (N->getOpcode()) {
52571 default: llvm_unreachable("unknown opcode");
52572 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52573 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52574 }
52575
52576 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52577 N->getOperand(0), N->getOperand(1));
52578}
52579
52580static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52581 const X86Subtarget &Subtarget) {
52582 EVT VT = N->getValueType(0);
52583 if (Subtarget.useSoftFloat() || isSoftF16(VT, Subtarget))
52584 return SDValue();
52585
52586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52587
52588 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52589 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52590 (Subtarget.hasFP16() && VT == MVT::f16) ||
52591 (VT.isVector() && TLI.isTypeLegal(VT))))
52592 return SDValue();
52593
52594 SDValue Op0 = N->getOperand(0);
52595 SDValue Op1 = N->getOperand(1);
52596 SDLoc DL(N);
52597 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52598
52599 // If we don't have to respect NaN inputs, this is a direct translation to x86
52600 // min/max instructions.
52601 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52602 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52603
52604 // If one of the operands is known non-NaN use the native min/max instructions
52605 // with the non-NaN input as second operand.
52606 if (DAG.isKnownNeverNaN(Op1))
52607 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52608 if (DAG.isKnownNeverNaN(Op0))
52609 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52610
52611 // If we have to respect NaN inputs, this takes at least 3 instructions.
52612 // Favor a library call when operating on a scalar and minimizing code size.
52613 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52614 return SDValue();
52615
52616 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52617 VT);
52618
52619 // There are 4 possibilities involving NaN inputs, and these are the required
52620 // outputs:
52621 // Op1
52622 // Num NaN
52623 // ----------------
52624 // Num | Max | Op0 |
52625 // Op0 ----------------
52626 // NaN | Op1 | NaN |
52627 // ----------------
52628 //
52629 // The SSE FP max/min instructions were not designed for this case, but rather
52630 // to implement:
52631 // Min = Op1 < Op0 ? Op1 : Op0
52632 // Max = Op1 > Op0 ? Op1 : Op0
52633 //
52634 // So they always return Op0 if either input is a NaN. However, we can still
52635 // use those instructions for fmaxnum by selecting away a NaN input.
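// e.g. fmaxnum(Op0 = NaN, Op1 = 2.0): FMAX(Op1, Op0) yields Op0 (NaN), but
// IsOp0Nan then selects Op1, so the final result is 2.0 as fmaxnum requires.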
52636
52637 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52638 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52639 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52640
52641 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52642 // are NaN, the NaN value of Op1 is the result.
52643 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52644}
52645
52646static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52647 TargetLowering::DAGCombinerInfo &DCI) {
52648 EVT VT = N->getValueType(0);
52649 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52650
52651 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52652 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52653 return SDValue(N, 0);
52654
52655 // Convert a full vector load into vzload when not all bits are needed.
52656 SDValue In = N->getOperand(0);
52657 MVT InVT = In.getSimpleValueType();
52658 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52659 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52660 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52661 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52662 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52663 MVT MemVT = MVT::getIntegerVT(NumBits);
52664 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52665 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52666 SDLoc dl(N);
52667 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52668 DAG.getBitcast(InVT, VZLoad));
52669 DCI.CombineTo(N, Convert);
52670 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52671 DCI.recursivelyDeleteUnusedNodes(LN);
52672 return SDValue(N, 0);
52673 }
52674 }
52675
52676 return SDValue();
52677}
52678
52679static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52680 TargetLowering::DAGCombinerInfo &DCI) {
52681 bool IsStrict = N->isTargetStrictFPOpcode();
52682 EVT VT = N->getValueType(0);
52683
52684 // Convert a full vector load into vzload when not all bits are needed.
52685 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52686 MVT InVT = In.getSimpleValueType();
52687 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52688 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52689 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52690 LoadSDNode *LN = cast<LoadSDNode>(In);
52691 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52692 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52693 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52694 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52695 SDLoc dl(N);
52696 if (IsStrict) {
52697 SDValue Convert =
52698 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52699 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52700 DCI.CombineTo(N, Convert, Convert.getValue(1));
52701 } else {
52702 SDValue Convert =
52703 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52704 DCI.CombineTo(N, Convert);
52705 }
52706 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52707 DCI.recursivelyDeleteUnusedNodes(LN);
52708 return SDValue(N, 0);
52709 }
52710 }
52711
52712 return SDValue();
52713}
52714
52715/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52716static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
52717 TargetLowering::DAGCombinerInfo &DCI,
52718 const X86Subtarget &Subtarget) {
52719 SDValue N0 = N->getOperand(0);
52720 SDValue N1 = N->getOperand(1);
52721 MVT VT = N->getSimpleValueType(0);
52722 int NumElts = VT.getVectorNumElements();
52723 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52724 SDLoc DL(N);
52725
52726 // ANDNP(undef, x) -> 0
52727 // ANDNP(x, undef) -> 0
52728 if (N0.isUndef() || N1.isUndef())
52729 return DAG.getConstant(0, DL, VT);
52730
52731 // ANDNP(0, x) -> x
52732 if (ISD::isBuildVectorAllZeros(N0.getNode()))
52733 return N1;
52734
52735 // ANDNP(x, 0) -> 0
52736 if (ISD::isBuildVectorAllZeros(N1.getNode()))
52737 return DAG.getConstant(0, DL, VT);
52738
52739 // ANDNP(x, -1) -> NOT(x) -> XOR(x, -1)
52740 if (ISD::isBuildVectorAllOnes(N1.getNode()))
52741 return DAG.getNOT(DL, N0, VT);
52742
52743 // Turn ANDNP back to AND if input is inverted.
52744 if (SDValue Not = IsNOT(N0, DAG))
52745 return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
52746
52747 // Fold for better commutativity:
52748 // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
52749 if (N1->hasOneUse())
52750 if (SDValue Not = IsNOT(N1, DAG))
52751 return DAG.getNOT(
52752 DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
52753
52754 // Constant Folding
52755 APInt Undefs0, Undefs1;
52756 SmallVector<APInt> EltBits0, EltBits1;
52757 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0,
52758 /*AllowWholeUndefs*/ true,
52759 /*AllowPartialUndefs*/ true)) {
52760 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1,
52761 /*AllowWholeUndefs*/ true,
52762 /*AllowPartialUndefs*/ true)) {
52763 SmallVector<APInt> ResultBits;
52764 for (int I = 0; I != NumElts; ++I)
52765 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52766 return getConstVector(ResultBits, VT, DAG, DL);
52767 }
52768
52769 // Constant fold NOT(N0) to allow us to use AND.
52770 // Ensure this is only performed if we can confirm that the bitcasted source
52771 // has oneuse to prevent an infinite loop with canonicalizeBitSelect.
52772 if (N0->hasOneUse()) {
52773 SDValue BC0 = peekThroughOneUseBitcasts(N0);
52774 if (BC0.getOpcode() != ISD::BITCAST) {
52775 for (APInt &Elt : EltBits0)
52776 Elt = ~Elt;
52777 SDValue Not = getConstVector(EltBits0, VT, DAG, DL);
52778 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52779 }
52780 }
52781 }
52782
52783 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52784 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52785 SDValue Op(N, 0);
52786 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52787 return Res;
52788
52789 // If either operand is a constant mask, then only the elements that aren't
52790 // zero are actually demanded by the other operand.
52791 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52792 APInt UndefElts;
52793 SmallVector<APInt> EltBits;
52794 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52795 APInt DemandedElts = APInt::getAllOnes(NumElts);
52796 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52797 EltBits)) {
52798 DemandedBits.clearAllBits();
52799 DemandedElts.clearAllBits();
52800 for (int I = 0; I != NumElts; ++I) {
52801 if (UndefElts[I]) {
52802 // We can't assume an undef src element gives an undef dst - the
52803 // other src might be zero.
52804 DemandedBits.setAllBits();
52805 DemandedElts.setBit(I);
52806 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52807 (!Invert && !EltBits[I].isZero())) {
52808 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52809 DemandedElts.setBit(I);
52810 }
52811 }
52812 }
52813 return std::make_pair(DemandedBits, DemandedElts);
52814 };
52815 APInt Bits0, Elts0;
52816 APInt Bits1, Elts1;
52817 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52818 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52819
52820 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52821 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52822 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52823 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52824 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52825 if (N->getOpcode() != ISD::DELETED_NODE)
52826 DCI.AddToWorklist(N);
52827 return SDValue(N, 0);
52828 }
52829 }
52830
52831 return SDValue();
52832}
52833
52834static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52835 TargetLowering::DAGCombinerInfo &DCI) {
52836 SDValue N1 = N->getOperand(1);
52837
52838 // BT ignores high bits in the bit index operand.
52839 unsigned BitWidth = N1.getValueSizeInBits();
52840 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)));
52841 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52842 if (N->getOpcode() != ISD::DELETED_NODE)
52843 DCI.AddToWorklist(N);
52844 return SDValue(N, 0);
52845 }
52846
52847 return SDValue();
52848}
52849
52850static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52851 TargetLowering::DAGCombinerInfo &DCI) {
52852 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52853 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52854
52855 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52856 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52857 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52858 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52859 if (N->getOpcode() != ISD::DELETED_NODE)
52860 DCI.AddToWorklist(N);
52861 return SDValue(N, 0);
52862 }
52863
52864 // Convert a full vector load into vzload when not all bits are needed.
52865 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52866 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52867 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52868 SDLoc dl(N);
52869 if (IsStrict) {
52870 SDValue Convert = DAG.getNode(
52871 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52872 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52873 DCI.CombineTo(N, Convert, Convert.getValue(1));
52874 } else {
52875 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52876 DAG.getBitcast(MVT::v8i16, VZLoad));
52877 DCI.CombineTo(N, Convert);
52878 }
52879
52880 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52881 DCI.recursivelyDeleteUnusedNodes(LN);
52882 return SDValue(N, 0);
52883 }
52884 }
52885 }
52886
52887 return SDValue();
52888}
52889
52890// Try to combine sext_in_reg of a cmov of constants by extending the constants.
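// e.g. (sext_in_reg (cmov 200, 73, cc, flags), i8) -> (cmov -56, 73, cc, flags)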
52891static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52892 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52893
52894 EVT DstVT = N->getValueType(0);
52895
52896 SDValue N0 = N->getOperand(0);
52897 SDValue N1 = N->getOperand(1);
52898 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52899
52900 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52901 return SDValue();
52902
52903 // Look through single use any_extends / truncs.
52904 SDValue IntermediateBitwidthOp;
52905 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52906 N0.hasOneUse()) {
52907 IntermediateBitwidthOp = N0;
52908 N0 = N0.getOperand(0);
52909 }
52910
52911 // See if we have a single use cmov.
52912 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52913 return SDValue();
52914
52915 SDValue CMovOp0 = N0.getOperand(0);
52916 SDValue CMovOp1 = N0.getOperand(1);
52917
52918 // Make sure both operands are constants.
52919 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52920 !isa<ConstantSDNode>(CMovOp1.getNode()))
52921 return SDValue();
52922
52923 SDLoc DL(N);
52924
52925 // If we looked through an any_extend/trunc above, add one to the constants.
52926 if (IntermediateBitwidthOp) {
52927 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52928 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52929 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52930 }
52931
52932 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52933 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52934
52935 EVT CMovVT = DstVT;
52936 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52937 if (DstVT == MVT::i16) {
52938 CMovVT = MVT::i32;
52939 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52940 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52941 }
52942
52943 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52944 N0.getOperand(2), N0.getOperand(3));
52945
52946 if (CMovVT != DstVT)
52947 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52948
52949 return CMov;
52950}
52951
52952static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52953 const X86Subtarget &Subtarget) {
52954 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52955
52956 if (SDValue V = combineSextInRegCmov(N, DAG))
52957 return V;
52958
52959 EVT VT = N->getValueType(0);
52960 SDValue N0 = N->getOperand(0);
52961 SDValue N1 = N->getOperand(1);
52962 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52963 SDLoc dl(N);
52964
52965 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
52966 // both SSE and AVX2 since there is no sign-extended shift right
52967 // operation on a vector with 64-bit elements.
52968 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
52969 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
52970 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52971 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52972 SDValue N00 = N0.getOperand(0);
52973
52974 // EXTLOAD has a better solution on AVX2,
52975 // it may be replaced with X86ISD::VSEXT node.
52976 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52977 if (!ISD::isNormalLoad(N00.getNode()))
52978 return SDValue();
52979
52980 // Attempt to promote any comparison mask ops before moving the
52981 // SIGN_EXTEND_INREG in the way.
52982 if (SDValue Promote = PromoteMaskArithmetic(N0, dl, DAG, Subtarget))
52983 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52984
52985 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52986 SDValue Tmp =
52987 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52988 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52989 }
52990 }
52991 return SDValue();
52992}
52993
52994/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52995/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52996/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52997/// opportunities to combine math ops, use an LEA, or use a complex addressing
52998/// mode. This can eliminate extend, add, and shift instructions.
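/// For example, (i64 (sext (add nsw (i32 X), 5))) becomes
/// (i64 (add nsw (sext X), 5)), which can then fold into an LEA together with
/// a consuming add or shl.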
52999static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53000 const X86Subtarget &Subtarget) {
53001 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53002 Ext->getOpcode() != ISD::ZERO_EXTEND)
53003 return SDValue();
53004
53005 // TODO: This should be valid for other integer types.
53006 EVT VT = Ext->getValueType(0);
53007 if (VT != MVT::i64)
53008 return SDValue();
53009
53010 SDValue Add = Ext->getOperand(0);
53011 if (Add.getOpcode() != ISD::ADD)
53012 return SDValue();
53013
53014 SDValue AddOp0 = Add.getOperand(0);
53015 SDValue AddOp1 = Add.getOperand(1);
53016 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
53017 bool NSW = Add->getFlags().hasNoSignedWrap();
53018 bool NUW = Add->getFlags().hasNoUnsignedWrap();
53019 NSW = NSW || (Sext && DAG.willNotOverflowAdd(true, AddOp0, AddOp1));
53020 NUW = NUW || (!Sext && DAG.willNotOverflowAdd(false, AddOp0, AddOp1));
53021
53022 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
53023 // into the 'zext'
53024 if ((Sext && !NSW) || (!Sext && !NUW))
53025 return SDValue();
53026
53027 // Having a constant operand to the 'add' ensures that we are not increasing
53028 // the instruction count because the constant is extended for free below.
53029 // A constant operand can also become the displacement field of an LEA.
53030 auto *AddOp1C = dyn_cast<ConstantSDNode>(AddOp1);
53031 if (!AddOp1C)
53032 return SDValue();
53033
53034 // Don't make the 'add' bigger if there's no hope of combining it with some
53035 // other 'add' or 'shl' instruction.
53036 // TODO: It may be profitable to generate simpler LEA instructions in place
53037 // of single 'add' instructions, but the cost model for selecting an LEA
53038 // currently has a high threshold.
53039 bool HasLEAPotential = false;
53040 for (auto *User : Ext->uses()) {
53041 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
53042 HasLEAPotential = true;
53043 break;
53044 }
53045 }
53046 if (!HasLEAPotential)
53047 return SDValue();
53048
53049 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
53050 int64_t AddC = Sext ? AddOp1C->getSExtValue() : AddOp1C->getZExtValue();
53051 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
53052 SDValue NewConstant = DAG.getConstant(AddC, SDLoc(Add), VT);
53053
53054 // The wider add is guaranteed to not wrap because both operands are
53055 // sign-extended.
53056 SDNodeFlags Flags;
53057 Flags.setNoSignedWrap(NSW);
53058 Flags.setNoUnsignedWrap(NUW);
53059 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
53060}
53061
53062// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
53063// operands and the result of CMOV is not used anywhere else - promote CMOV
53064// itself instead of promoting its result. This could be beneficial, because:
53065// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
53066// (or more) pseudo-CMOVs only when they go one-after-another and
53067// getting rid of result extension code after CMOV will help that.
53068// 2) Promotion of constant CMOV arguments is free, hence the
53069// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
53070// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
53071// promotion is also good in terms of code-size.
53072// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
53073// promotion).
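// e.g. (i32 (zext (i16 (cmov 7, 12, cc, flags)))) -> (i32 (cmov 7, 12, cc, flags))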
53074static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
53075 SDValue CMovN = Extend->getOperand(0);
53076 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
53077 return SDValue();
53078
53079 EVT TargetVT = Extend->getValueType(0);
53080 unsigned ExtendOpcode = Extend->getOpcode();
53081 SDLoc DL(Extend);
53082
53083 EVT VT = CMovN.getValueType();
53084 SDValue CMovOp0 = CMovN.getOperand(0);
53085 SDValue CMovOp1 = CMovN.getOperand(1);
53086
53087 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53088 !isa<ConstantSDNode>(CMovOp1.getNode()))
53089 return SDValue();
53090
53091 // Only extend to i32 or i64.
53092 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
53093 return SDValue();
53094
53095 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
53096 // are free.
53097 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
53098 return SDValue();
53099
53100 // If this is a zero extend to i64, we should only extend to i32 and use a free
53101 // zero extend to finish.
53102 EVT ExtendVT = TargetVT;
53103 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
53104 ExtendVT = MVT::i32;
53105
53106 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
53107 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
53108
53109 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
53110 CMovN.getOperand(2), CMovN.getOperand(3));
53111
53112 // Finish extending if needed.
53113 if (ExtendVT != TargetVT)
53114 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
53115
53116 return Res;
53117}
53118
53119// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
53120// result type.
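// e.g. with AVX512: (v8i32 (sext (v8i1 (setcc (v8i32 A), (v8i32 B), eq)))) is
// turned into (v8i32 (setcc A, B, eq)), avoiding a separate mask extension.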
53121static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
53122 const X86Subtarget &Subtarget) {
53123 SDValue N0 = N->getOperand(0);
53124 EVT VT = N->getValueType(0);
53125 SDLoc dl(N);
53126
53127 // Only do this combine with AVX512 for vector extends.
53128 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
53129 return SDValue();
53130
53131 // Only combine legal element types.
53132 EVT SVT = VT.getVectorElementType();
53133 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
53134 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
53135 return SDValue();
53136
53137 // We don't have CMPP Instruction for vxf16
53138 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
53139 return SDValue();
53140 // We can only do this if the vector size is 256 bits or less.
53141 unsigned Size = VT.getSizeInBits();
53142 if (Size > 256 && Subtarget.useAVX512Regs())
53143 return SDValue();
53144
53145 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
53146 // those are the only integer compares we have.
53147 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
53148 if (ISD::isUnsignedIntSetCC(CC))
53149 return SDValue();
53150
53151 // Only do this combine if the extension will be fully consumed by the setcc.
53152 EVT N00VT = N0.getOperand(0).getValueType();
53153 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
53154 if (Size != MatchingVecType.getSizeInBits())
53155 return SDValue();
53156
53157 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
53158
53159 if (N->getOpcode() == ISD::ZERO_EXTEND)
53160 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
53161
53162 return Res;
53163}
53164
53165static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
53166 TargetLowering::DAGCombinerInfo &DCI,
53167 const X86Subtarget &Subtarget) {
53168 SDValue N0 = N->getOperand(0);
53169 EVT VT = N->getValueType(0);
53170 SDLoc DL(N);
53171
53172 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53173 if (!DCI.isBeforeLegalizeOps() &&
53174 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53175 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
53176 N0->getOperand(1));
53177 bool ReplaceOtherUses = !N0.hasOneUse();
53178 DCI.CombineTo(N, Setcc);
53179 // Replace other uses with a truncate of the widened setcc_carry.
53180 if (ReplaceOtherUses) {
53181 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53182 N0.getValueType(), Setcc);
53183 DCI.CombineTo(N0.getNode(), Trunc);
53184 }
53185
53186 return SDValue(N, 0);
53187 }
53188
53189 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53190 return NewCMov;
53191
53192 if (!DCI.isBeforeLegalizeOps())
53193 return SDValue();
53194
53195 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53196 return V;
53197
53198 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
53199 DAG, DCI, Subtarget))
53200 return V;
53201
53202 if (VT.isVector()) {
53203 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
53204 return R;
53205
53206 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
53207 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
53208 }
53209
53210 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53211 return NewAdd;
53212
53213 return SDValue();
53214}
53215
53216// Inverting a constant vector is profitable if it can be eliminated and the
53217// inverted vector is already present in DAG. Otherwise, it will be loaded
53218// anyway.
53219//
53220// We determine which of the values can be completely eliminated and invert it.
53221// If both are eliminable, select a vector with the first negative element.
53222static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) {
53223 assert(ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()) &&
53224 "ConstantFP build vector expected");
53225 // Check if we can eliminate V. We assume if a value is only used in FMAs, we
53226 // can eliminate it, since this function is invoked for each FMA with this
53227 // vector.
53228 auto IsNotFMA = [](SDNode *Use) {
53229 return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA;
53230 };
53231 if (llvm::any_of(V->uses(), IsNotFMA))
53232 return SDValue();
53233
53234 SmallVector<SDValue, 8> Ops;
53235 EVT VT = V.getValueType();
53236 EVT EltVT = VT.getVectorElementType();
53237 for (const SDValue &Op : V->op_values()) {
53238 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53239 Ops.push_back(DAG.getConstantFP(-Cst->getValueAPF(), SDLoc(Op), EltVT));
53240 } else {
53241 assert(Op.isUndef());
53242 Ops.push_back(DAG.getUNDEF(EltVT));
53243 }
53244 }
53245
53246 SDNode *NV = DAG.getNodeIfExists(ISD::BUILD_VECTOR, DAG.getVTList(VT), Ops);
53247 if (!NV)
53248 return SDValue();
53249
53250 // If an inverted version cannot be eliminated, choose it instead of the
53251 // original version.
53252 if (llvm::any_of(NV->uses(), IsNotFMA))
53253 return SDValue(NV, 0);
53254
53255 // If the inverted version also can be eliminated, we have to consistently
53256 // prefer one of the values. We prefer a constant with a negative value on
53257 // the first place.
53258 // N.B. We need to skip undefs that may precede a value.
53259 for (const SDValue &Op : V->op_values()) {
53260 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
53261 if (Cst->isNegative())
53262 return SDValue();
53263 break;
53264 }
53265 }
53266 return SDValue(NV, 0);
53267}
53268
53269static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
53270 TargetLowering::DAGCombinerInfo &DCI,
53271 const X86Subtarget &Subtarget) {
53272 SDLoc dl(N);
53273 EVT VT = N->getValueType(0);
53274 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
53275
53276 // Let legalize expand this if it isn't a legal type yet.
53277 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53278 if (!TLI.isTypeLegal(VT))
53279 return SDValue();
53280
53281 SDValue A = N->getOperand(IsStrict ? 1 : 0);
53282 SDValue B = N->getOperand(IsStrict ? 2 : 1);
53283 SDValue C = N->getOperand(IsStrict ? 3 : 2);
53284
53285 // If the operation allows fast-math and the target does not support FMA,
53286 // split this into mul+add to avoid libcall(s).
53287 SDNodeFlags Flags = N->getFlags();
53288 if (!IsStrict && Flags.hasAllowReassociation() &&
53289 TLI.isOperationExpand(ISD::FMA, VT)) {
53290 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53291 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53292 }
53293
53294 EVT ScalarVT = VT.getScalarType();
53295 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53296 !Subtarget.hasAnyFMA()) &&
53297 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53298 return SDValue();
53299
53300 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53301 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53302 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53303 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53304 CodeSize)) {
53305 V = NegV;
53306 return true;
53307 }
53308 // Look through extract_vector_elts. If it comes from an FNEG, create a
53309 // new extract from the FNEG input.
53310 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53311 isNullConstant(V.getOperand(1))) {
53312 SDValue Vec = V.getOperand(0);
53313 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53314 Vec, DAG, LegalOperations, CodeSize)) {
53315 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53316 NegV, V.getOperand(1));
53317 return true;
53318 }
53319 }
53320 // Lookup if there is an inverted version of constant vector V in DAG.
53321 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode())) {
53322 if (SDValue NegV = getInvertedVectorForFMA(V, DAG)) {
53323 V = NegV;
53324 return true;
53325 }
53326 }
53327 return false;
53328 };
53329
53330 // Do not convert the passthru input of scalar intrinsics.
53331 // FIXME: We could allow negations of the lower element only.
53332 bool NegA = invertIfNegative(A);
53333 bool NegB = invertIfNegative(B);
53334 bool NegC = invertIfNegative(C);
53335
53336 if (!NegA && !NegB && !NegC)
53337 return SDValue();
53338
53339 unsigned NewOpcode =
53340 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53341
53342 // Propagate fast-math-flags to new FMA node.
53343 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53344 if (IsStrict) {
53345 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53346 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53347 {N->getOperand(0), A, B, C});
53348 } else {
53349 if (N->getNumOperands() == 4)
53350 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53351 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53352 }
53353}
53354
53355// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53356// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53357static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53358 TargetLowering::DAGCombinerInfo &DCI) {
53359 SDLoc dl(N);
53360 EVT VT = N->getValueType(0);
53361 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53362 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53363 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53364
53365 SDValue N2 = N->getOperand(2);
53366
53367 SDValue NegN2 =
53368 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53369 if (!NegN2)
53370 return SDValue();
53371 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53372
53373 if (N->getNumOperands() == 4)
53374 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53375 NegN2, N->getOperand(3));
53376 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53377 NegN2);
53378}
53379
53380static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53381 TargetLowering::DAGCombinerInfo &DCI,
53382 const X86Subtarget &Subtarget) {
53383 SDLoc dl(N);
53384 SDValue N0 = N->getOperand(0);
53385 EVT VT = N->getValueType(0);
53386
53387 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53388 // FIXME: Is this needed? We don't seem to have any tests for it.
53389 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53390 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53391 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53392 N0->getOperand(1));
53393 bool ReplaceOtherUses = !N0.hasOneUse();
53394 DCI.CombineTo(N, Setcc);
53395 // Replace other uses with a truncate of the widened setcc_carry.
53396 if (ReplaceOtherUses) {
53397 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53398 N0.getValueType(), Setcc);
53399 DCI.CombineTo(N0.getNode(), Trunc);
53400 }
53401
53402 return SDValue(N, 0);
53403 }
53404
53405 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53406 return NewCMov;
53407
53408 if (DCI.isBeforeLegalizeOps())
53409 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53410 return V;
53411
53412 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53413 DAG, DCI, Subtarget))
53414 return V;
53415
53416 if (VT.isVector())
53417 if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), dl, DAG, Subtarget))
53418 return R;
53419
53420 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53421 return NewAdd;
53422
53423 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53424 return R;
53425
53426 // TODO: Combine with any target/faux shuffle.
53427 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53428 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
53429 SDValue N00 = N0.getOperand(0);
53430 SDValue N01 = N0.getOperand(1);
53431 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53432 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53433 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53434 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53435 return concatSubVectors(N00, N01, DAG, dl);
53436 }
53437 }
53438
53439 return SDValue();
53440}
53441
53442/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53443/// pre-promote its result type since vXi1 vectors don't get promoted
53444/// during type legalization.
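/// e.g. on AVX512F without BWI: (v16i1 (setcc (v16i8 A), (v16i8 B), eq)) is
/// emitted as (trunc (v16i8 (setcc A, B, eq))).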
53445static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
53446 SDValue RHS, ISD::CondCode CC,
53447 const SDLoc &DL, SelectionDAG &DAG,
53448 const X86Subtarget &Subtarget) {
53449 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53450 VT.getVectorElementType() == MVT::i1 &&
53451 (OpVT.getVectorElementType() == MVT::i8 ||
53452 OpVT.getVectorElementType() == MVT::i16)) {
53453 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53454 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53455 }
53456 return SDValue();
53457}
53458
53459static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53460 TargetLowering::DAGCombinerInfo &DCI,
53461 const X86Subtarget &Subtarget) {
53462 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53463 const SDValue LHS = N->getOperand(0);
53464 const SDValue RHS = N->getOperand(1);
53465 EVT VT = N->getValueType(0);
53466 EVT OpVT = LHS.getValueType();
53467 SDLoc DL(N);
53468
53469 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53470 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
53471 Subtarget))
53472 return V;
53473
53474 if (VT == MVT::i1) {
53475 X86::CondCode X86CC;
53476 if (SDValue V =
53477 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
53478 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
53479 }
53480
53481 if (OpVT.isScalarInteger()) {
53482 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53483 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
53484 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53485 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53486 if (N0.getOperand(0) == N1)
53487 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53488 N0.getOperand(1));
53489 if (N0.getOperand(1) == N1)
53490 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53491 N0.getOperand(0));
53492 }
53493 return SDValue();
53494 };
53495 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53496 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53497 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53498 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53499
53500 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53501 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53502 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53503 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53504 if (N0.getOperand(0) == N1)
53505 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53506 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53507 if (N0.getOperand(1) == N1)
53508 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53509 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53510 }
53511 return SDValue();
53512 };
53513 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53514 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53515 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53516 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53517
53518 // cmpeq(trunc(x),C) --> cmpeq(x,C)
53519 // cmpne(trunc(x),C) --> cmpne(x,C)
53520 // iff x upper bits are zero.
53521 if (LHS.getOpcode() == ISD::TRUNCATE &&
53522 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53523 isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) {
53524 EVT SrcVT = LHS.getOperand(0).getValueType();
53525 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53526 OpVT.getScalarSizeInBits());
53527 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53528 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53529 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53530 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53531 DAG.getZExtOrTrunc(RHS, DL, SrcVT), CC);
53532 }
53533
53534 // With C as a power of 2 and C != 0 and C != INT_MIN:
53535 // icmp eq Abs(X) C ->
53536 // (icmp eq A, C) | (icmp eq A, -C)
53537 // icmp ne Abs(X) C ->
53538 // (icmp ne A, C) & (icmp ne A, -C)
53539 // Both of these patterns can be better optimized in
53540 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
53541 // integers which is checked above.
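// e.g. (seteq (abs X), 8) -> (or (seteq X, 8), (seteq X, -8))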
53542 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
53543 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
53544 const APInt &CInt = C->getAPIntValue();
53545 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
53546 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
53547 SDValue BaseOp = LHS.getOperand(0);
53548 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
53549 SDValue SETCC1 = DAG.getSetCC(
53550 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
53551 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
53552 SETCC0, SETCC1);
53553 }
53554 }
53555 }
53556 }
53557 }
53558
53559 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53560 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53561 // Using temporaries to avoid messing up operand ordering for later
53562 // transformations if this doesn't work.
53563 SDValue Op0 = LHS;
53564 SDValue Op1 = RHS;
53565 ISD::CondCode TmpCC = CC;
53566 // Put build_vector on the right.
53567 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53568 std::swap(Op0, Op1);
53569 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53570 }
53571
53572 bool IsSEXT0 =
53573 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53574 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53575 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53576
53577 if (IsSEXT0 && IsVZero1) {
53578 assert(VT == Op0.getOperand(0).getValueType() &&
53579 "Unexpected operand type");
53580 if (TmpCC == ISD::SETGT)
53581 return DAG.getConstant(0, DL, VT);
53582 if (TmpCC == ISD::SETLE)
53583 return DAG.getConstant(1, DL, VT);
53584 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53585 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53586
53587 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53588 "Unexpected condition code!");
53589 return Op0.getOperand(0);
53590 }
53591 }
53592
53593 // Try to make an unsigned vector comparison signed. On pre-AVX512 targets
53594 // there are only signed comparisons (`PCMPGT`), and on AVX512 it's often
53595 // better to use `PCMPGT` if the result is meant to stay in a vector (and if
53596 // it is going to a mask, there are signed AVX512 comparisons).
53597 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
53598 bool CanMakeSigned = false;
53599 if (ISD::isUnsignedIntSetCC(CC)) {
53600 KnownBits CmpKnown =
53601 DAG.computeKnownBits(LHS).intersectWith(DAG.computeKnownBits(RHS));
53602 // If we know LHS/RHS share the same sign bit at each element we can
53603 // make this signed.
53604 // NOTE: `computeKnownBits` on a vector type aggregates common bits
53605 // across all lanes. So a pattern where the sign varies from lane to
53606 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
53607 // missed. We could get around this by demanding each lane
53608 // independently, but this isn't the most important optimization and
53609 // that may eat into compile time.
53610 CanMakeSigned =
53611 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
53612 }
53613 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
53614 SDValue LHSOut = LHS;
53615 SDValue RHSOut = RHS;
53616 ISD::CondCode NewCC = CC;
53617 switch (CC) {
53618 case ISD::SETGE:
53619 case ISD::SETUGE:
53620 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
53621 /*NSW*/ true))
53622 LHSOut = NewLHS;
53623 else if (SDValue NewRHS = incDecVectorConstant(
53624 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
53625 RHSOut = NewRHS;
53626 else
53627 break;
53628
53629 [[fallthrough]];
53630 case ISD::SETUGT:
53631 NewCC = ISD::SETGT;
53632 break;
53633
53634 case ISD::SETLE:
53635 case ISD::SETULE:
53636 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
53637 /*NSW*/ true))
53638 LHSOut = NewLHS;
53639 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
53640 /*NSW*/ true))
53641 RHSOut = NewRHS;
53642 else
53643 break;
53644
53645 [[fallthrough]];
53646 case ISD::SETULT:
53647 // Will be swapped to SETGT in LowerVSETCC*.
53648 NewCC = ISD::SETLT;
53649 break;
53650 default:
53651 break;
53652 }
53653 if (NewCC != CC) {
53654 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
53655 NewCC, DL, DAG, Subtarget))
53656 return R;
53657 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
53658 }
53659 }
53660 }
53661
53662 if (SDValue R =
53663 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
53664 return R;
53665
53666 // In the middle end transforms:
53667 // `(or (icmp eq X, C), (icmp eq X, C+1))`
53668 // -> `(icmp ult (add x, -C), 2)`
53669 // Likewise inverted cases with `ugt`.
53670 //
53671 // Since x86, pre avx512, doesn't have unsigned vector compares, this results
53672 // in worse codegen. So, undo the middle-end transform and go back to `(or
53673 // (icmp eq), (icmp eq))` form.
53674 // Also skip AVX1 with ymm vectors, as the umin approach combines better than
53675 // the xmm approach.
53676 //
53677 // NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
53678 // ne))` as it doesn't end up instruction positive.
53679 // TODO: We might want to do this for avx512 as well if we `sext` the result.
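// e.g. (setult (add X, -5), 2) is rebuilt here as
// (or (seteq X, 5), (seteq X, 6)).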
53680 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
53681 ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
53682 !Subtarget.hasAVX512() &&
53683 (OpVT.getSizeInBits() <= 128 || !Subtarget.hasAVX() ||
53684 Subtarget.hasAVX2()) &&
53685 LHS.hasOneUse()) {
53686
53687 APInt CmpC;
53688 SDValue AddC = LHS.getOperand(1);
53689 if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
53691 // See which form we have depending on the constant/condition.
53692 SDValue C0 = SDValue();
53693 SDValue C1 = SDValue();
53694
53695 // If we had `(add x, -1)` and can lower with `umin`, don't transform as
53696 // we will end up generating an additional constant. Keeping in the
53697 // current form has a slight latency cost, but it probably worth saving a
53698 // constant.
53701 // Pass
53702 }
53703 // Normal Cases
53704 else if ((CC == ISD::SETULT && CmpC == 2) ||
53705 (CC == ISD::SETULE && CmpC == 1)) {
53706 // These will constant fold.
53707 C0 = DAG.getNegative(AddC, DL, OpVT);
53708 C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
53709 DAG.getAllOnesConstant(DL, OpVT));
53710 }
53711 // Inverted Cases
53712 else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
53713 (CC == ISD::SETUGE && (-CmpC) == 2)) {
53714 // These will constant fold.
53715 C0 = DAG.getNOT(DL, AddC, OpVT);
53716 C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
53717 DAG.getAllOnesConstant(DL, OpVT));
53718 }
53719 if (C0 && C1) {
53720 SDValue NewLHS =
53721 DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
53722 SDValue NewRHS =
53723 DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
53724 return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
53725 }
53726 }
53727 }
53728
53729 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53730 // to avoid scalarization via legalization because v4i32 is not a legal type.
53731 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53732 LHS.getValueType() == MVT::v4f32)
53733 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53734
53735 // X pred 0.0 --> X pred -X
53736 // If the negation of X already exists, use it in the comparison. This removes
53737 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53738 // instructions in patterns with a 'select' node.
53740 SDVTList FNegVT = DAG.getVTList(OpVT);
53741 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53742 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53743 }
53744
53745 return SDValue();
53746}
53747
53748static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
53749 TargetLowering::DAGCombinerInfo &DCI,
53750 const X86Subtarget &Subtarget) {
53751 SDValue Src = N->getOperand(0);
53752 MVT SrcVT = Src.getSimpleValueType();
53753 MVT VT = N->getSimpleValueType(0);
53754 unsigned NumBits = VT.getScalarSizeInBits();
53755 unsigned NumElts = SrcVT.getVectorNumElements();
53756 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53757 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53758
53759 // Perform constant folding.
53760 APInt UndefElts;
53761 SmallVector<APInt, 32> EltBits;
53762 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits,
53763 /*AllowWholeUndefs*/ true,
53764 /*AllowPartialUndefs*/ true)) {
53765 APInt Imm(32, 0);
53766 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53767 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53768 Imm.setBit(Idx);
53769
53770 return DAG.getConstant(Imm, SDLoc(N), VT);
53771 }
53772
53773 // Look through int->fp bitcasts that don't change the element width.
53774 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53775 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53776 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53777 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53778
53779 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53780 // with scalar comparisons.
53781 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53782 SDLoc DL(N);
53783 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53784 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53785 return DAG.getNode(ISD::XOR, DL, VT,
53786 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53787 DAG.getConstant(NotMask, DL, VT));
53788 }
53789
53790 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53791 // results with scalar comparisons.
53792 if (Src.getOpcode() == X86ISD::PCMPGT &&
53793 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53794 SDLoc DL(N);
53795 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53796 return DAG.getNode(ISD::XOR, DL, VT,
53797 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53798 DAG.getConstant(NotMask, DL, VT));
53799 }
53800
53801 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
53802 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53803 // iff pow2splat(c1).
53804 // Use KnownBits to determine if only a single bit is non-zero
53805 // in each element (pow2 or zero), and shift that bit to the msb.
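// e.g. for v4i32 with c1 = splat(4) the shift amount is 29, which moves the
// tested bit into each lane's sign bit before the MOVMSK.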
53806 if (Src.getOpcode() == X86ISD::PCMPEQ) {
53807 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
53808 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
53809 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
53810 if (KnownLHS.countMaxPopulation() == 1 &&
53811 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
53812 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
53813 SDLoc DL(N);
53814 MVT ShiftVT = SrcVT;
53815 SDValue ShiftLHS = Src.getOperand(0);
53816 SDValue ShiftRHS = Src.getOperand(1);
53817 if (ShiftVT.getScalarType() == MVT::i8) {
53818 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53819 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53820 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
53821 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
53822 }
53823 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53824 ShiftLHS, ShiftAmt, DAG);
53825 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
53826 ShiftRHS, ShiftAmt, DAG);
53827 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
53828 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
53829 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
53830 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
53831 }
53832 }
53833
53834 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
53835 if (N->isOnlyUserOf(Src.getNode())) {
53836 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
53837 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
53838 APInt UndefElts;
53839 SmallVector<APInt, 32> EltBits;
53840 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
53841 UndefElts, EltBits)) {
53842 APInt Mask = APInt::getZero(NumBits);
53843 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
53844 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53845 Mask.setBit(Idx);
53846 }
53847 SDLoc DL(N);
53848 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
53849 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
53850 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
53851 DAG.getConstant(Mask, DL, VT));
53852 }
53853 }
53854 }
53855
53856 // Simplify the inputs.
53857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53858 APInt DemandedMask(APInt::getAllOnes(NumBits));
53859 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53860 return SDValue(N, 0);
53861
53862 return SDValue();
53863}
53864
53865static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
53866 TargetLowering::DAGCombinerInfo &DCI,
53867 const X86Subtarget &Subtarget) {
53868 MVT VT = N->getSimpleValueType(0);
53869 unsigned NumBits = VT.getScalarSizeInBits();
53870
53871 // Simplify the inputs.
53872 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53873 APInt DemandedMask(APInt::getAllOnes(NumBits));
53874 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53875 return SDValue(N, 0);
53876
53877 return SDValue();
53878}
53879
53880static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53881 TargetLowering::DAGCombinerInfo &DCI) {
53882 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53883 SDValue Mask = MemOp->getMask();
53884
53885 // With vector masks we only demand the upper bit of the mask.
53886 if (Mask.getScalarValueSizeInBits() != 1) {
53887 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53888 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53889 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53890 if (N->getOpcode() != ISD::DELETED_NODE)
53891 DCI.AddToWorklist(N);
53892 return SDValue(N, 0);
53893 }
53894 }
53895
53896 return SDValue();
53897}
53898
53899static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53900 SDValue Index, SDValue Base, SDValue Scale,
53901 SelectionDAG &DAG) {
53902 SDLoc DL(GorS);
53903
53904 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53905 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53906 Gather->getMask(), Base, Index, Scale } ;
53907 return DAG.getMaskedGather(Gather->getVTList(),
53908 Gather->getMemoryVT(), DL, Ops,
53909 Gather->getMemOperand(),
53910 Gather->getIndexType(),
53911 Gather->getExtensionType());
53912 }
53913 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53914 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53915 Scatter->getMask(), Base, Index, Scale };
53916 return DAG.getMaskedScatter(Scatter->getVTList(),
53917 Scatter->getMemoryVT(), DL,
53918 Ops, Scatter->getMemOperand(),
53919 Scatter->getIndexType(),
53920 Scatter->isTruncatingStore());
53921}
53922
53923static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53924 TargetLowering::DAGCombinerInfo &DCI) {
53925 SDLoc DL(N);
53926 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53927 SDValue Index = GorS->getIndex();
53928 SDValue Base = GorS->getBasePtr();
53929 SDValue Scale = GorS->getScale();
53930 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53931
53932 if (DCI.isBeforeLegalize()) {
53933 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53934
53935 // Shrink constant indices if they are larger than 32-bits.
53936 // Only do this before legalize types since v2i64 could become v2i32.
53937 // FIXME: We could check that the type is legal if we're after legalize
53938 // types, but then we would need to construct test cases where that happens.
53939 // FIXME: We could support more than just constant vectors, but we need to be
53940 // careful with costing. A truncate that can be optimized out would be fine.
53941 // Otherwise we might only want to create a truncate if it avoids a split.
53942 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53943 if (BV->isConstant() && IndexWidth > 32 &&
53944 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53945 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53946 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53947 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53948 }
53949 }
53950
53951 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
53952 // there are sufficient sign bits. Only do this before legalize types to
53953 // avoid creating illegal types in truncate.
53954 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53955 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53956 IndexWidth > 32 &&
53957 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53958 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53959 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53960 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53961 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53962 }
53963 }
53964
53965 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53966 // Try to move splat constant adders from the index operand to the base
53967 // pointer operand. Taking care to multiply by the scale. We can only do
53968 // this when index element type is the same as the pointer type.
53969 // Otherwise we need to be sure the math doesn't wrap before the scale.
53970 if (Index.getOpcode() == ISD::ADD &&
53971 Index.getValueType().getVectorElementType() == PtrVT &&
53972 isa<ConstantSDNode>(Scale)) {
53973 uint64_t ScaleAmt = Scale->getAsZExtVal();
53974 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53975 BitVector UndefElts;
53976 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53977 // FIXME: Allow non-constant?
53978 if (UndefElts.none()) {
53979 // Apply the scale.
53980 APInt Adder = C->getAPIntValue() * ScaleAmt;
53981 // Add it to the existing base.
53982 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53983 DAG.getConstant(Adder, DL, PtrVT));
53984 Index = Index.getOperand(0);
53985 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53986 }
53987 }
53988
53989 // It's also possible base is just a constant. In that case, just
53990 // replace it with 0 and move the displacement into the index.
53991 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53992 isOneConstant(Scale)) {
53993 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53994 // Combine the constant build_vector and the constant base.
53995 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53996 Index.getOperand(1), Splat);
53997 // Add to the LHS of the original Index add.
53998 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53999 Index.getOperand(0), Splat);
54000 Base = DAG.getConstant(0, DL, Base.getValueType());
54001 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54002 }
54003 }
54004 }
54005
54006 if (DCI.isBeforeLegalizeOps()) {
54007 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54008
54009 // Make sure the index is either i32 or i64
54010 if (IndexWidth != 32 && IndexWidth != 64) {
54011 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54012 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54013 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54014 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54015 }
54016 }
54017
54018 // With vector masks we only demand the upper bit of the mask.
54019 SDValue Mask = GorS->getMask();
54020 if (Mask.getScalarValueSizeInBits() != 1) {
54021 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54022 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54023 if (N->getOpcode() != ISD::DELETED_NODE)
54024 DCI.AddToWorklist(N);
54025 return SDValue(N, 0);
54026 }
54027 }
54028
54029 return SDValue();
54030}
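
// Illustrative sketch (editorial addition, not from the upstream file): the
// splat-adder fold above depends on the unsigned address identity
//   Base + (Index + C) * Scale == (Base + C * Scale) + Index * Scale,
// which is why the constant has to be multiplied by the scale before it is
// moved into the base pointer. Helper names below are hypothetical.
namespace {
constexpr unsigned long long gatherAddrWithSplatInIndex(
    unsigned long long Base, unsigned long long Index, unsigned long long C,
    unsigned long long Scale) {
  return Base + (Index + C) * Scale; // splat adder still inside the index
}
constexpr unsigned long long gatherAddrWithSplatInBase(
    unsigned long long Base, unsigned long long Index, unsigned long long C,
    unsigned long long Scale) {
  return (Base + C * Scale) + Index * Scale; // adder folded into the base
}
static_assert(gatherAddrWithSplatInIndex(0x1000, 7, 3, 8) ==
                  gatherAddrWithSplatInBase(0x1000, 7, 3, 8),
              "moving a splat adder from index to base must apply the scale");
} // namespace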
54031
54032// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54033static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54034 const X86Subtarget &Subtarget) {
54035 SDLoc DL(N);
54036 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54037 SDValue EFLAGS = N->getOperand(1);
54038
54039 // Try to simplify the EFLAGS and condition code operands.
54040 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54041 return getSETCC(CC, Flags, DL, DAG);
54042
54043 return SDValue();
54044}
54045
54046/// Optimize branch condition evaluation.
54047static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54048 const X86Subtarget &Subtarget) {
54049 SDLoc DL(N);
54050 SDValue EFLAGS = N->getOperand(3);
54051 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54052
54053 // Try to simplify the EFLAGS and condition code operands.
54054 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54055 // RAUW them under us.
54056 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54057 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54058 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54059 N->getOperand(1), Cond, Flags);
54060 }
54061
54062 return SDValue();
54063}
54064
54065// TODO: Could we move this to DAGCombine?
54066static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54067 SelectionDAG &DAG) {
54068 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54069 // to optimize away the operation when it is effectively applied to a constant.
54070 //
54071 // The general transformation is:
54072 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54073 // AND(VECTOR_CMP(x,y), constant2)
54074 // constant2 = UNARYOP(constant)
54075
54076 // Early exit if this isn't a vector operation, the operand of the
54077 // unary operation isn't a bitwise AND, or if the sizes of the operations
54078 // aren't the same.
54079 EVT VT = N->getValueType(0);
54080 bool IsStrict = N->isStrictFPOpcode();
54081 unsigned NumEltBits = VT.getScalarSizeInBits();
54082 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54083 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54084 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54085 VT.getSizeInBits() != Op0.getValueSizeInBits())
54086 return SDValue();
54087
54088 // Now check that the other operand of the AND is a constant. We could
54089 // make the transformation for non-constant splats as well, but it's unclear
54090 // that would be a benefit as it would not eliminate any operations, just
54091 // perform one more step in scalar code before moving to the vector unit.
54092 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54093 // Bail out if the vector isn't a constant.
54094 if (!BV->isConstant())
54095 return SDValue();
54096
54097 // Everything checks out. Build up the new and improved node.
54098 SDLoc DL(N);
54099 EVT IntVT = BV->getValueType(0);
54100 // Create a new constant of the appropriate type for the transformed
54101 // DAG.
54102 SDValue SourceConst;
54103 if (IsStrict)
54104 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54105 {N->getOperand(0), SDValue(BV, 0)});
54106 else
54107 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
54108 // The AND node needs bitcasts to/from an integer vector type around it.
54109 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
54110 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
54111 MaskConst);
54112 SDValue Res = DAG.getBitcast(VT, NewAnd);
54113 if (IsStrict)
54114 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
54115 return Res;
54116 }
54117
54118 return SDValue();
54119}
54120
54121/// If we are converting a value to floating-point, try to replace scalar
54122/// truncate of an extracted vector element with a bitcast. This tries to keep
54123/// the sequence on XMM registers rather than moving between vector and GPRs.
54124static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
54125 // TODO: This is currently only used by combineSIntToFP, but it is generalized
54126 // to allow being called by any similar cast opcode.
54127 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
54128 SDValue Trunc = N->getOperand(0);
54129 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
54130 return SDValue();
54131
54132 SDValue ExtElt = Trunc.getOperand(0);
54133 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54134 !isNullConstant(ExtElt.getOperand(1)))
54135 return SDValue();
54136
54137 EVT TruncVT = Trunc.getValueType();
54138 EVT SrcVT = ExtElt.getValueType();
54139 unsigned DestWidth = TruncVT.getSizeInBits();
54140 unsigned SrcWidth = SrcVT.getSizeInBits();
54141 if (SrcWidth % DestWidth != 0)
54142 return SDValue();
54143
54144 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
54145 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
54146 unsigned VecWidth = SrcVecVT.getSizeInBits();
54147 unsigned NumElts = VecWidth / DestWidth;
54148 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
54149 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
54150 SDLoc DL(N);
54151 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
54152 BitcastVec, ExtElt.getOperand(1));
54153 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
54154}
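
// Illustrative sketch (editorial addition, not from the upstream file): on a
// little-endian target such as x86, truncating element 0 of a wide vector to a
// narrower scalar reads the same bits as element 0 of the bitcast to the
// narrower element type, which is what the fold above relies on. The helper
// below is hypothetical and only demonstrates the layout assumption.
#include <cstdint>
#include <cstring>
namespace {
[[maybe_unused]] bool truncExtEltMatchesBitcastDemo() {
  uint64_t Vec[2] = {0x1122334455667788ULL, 0};   // a v2i64 value
  uint32_t Lane0 = 0;
  std::memcpy(&Lane0, &Vec[0], sizeof(Lane0));    // extelt (bitcast to v4i32), 0
  return Lane0 == static_cast<uint32_t>(Vec[0]);  // trunc (extelt Vec, 0)
}
} // namespace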
54155
54156static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
54157 const X86Subtarget &Subtarget) {
54158 bool IsStrict = N->isStrictFPOpcode();
54159 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54160 EVT VT = N->getValueType(0);
54161 EVT InVT = Op0.getValueType();
54162
54163 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54164 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54165 // if hasFP16 support:
54166 // UINT_TO_FP(vXi1~15) -> SINT_TO_FP(ZEXT(vXi1~15 to vXi16))
54167 // UINT_TO_FP(vXi17~31) -> SINT_TO_FP(ZEXT(vXi17~31 to vXi32))
54168 // else
54169 // UINT_TO_FP(vXi1~31) -> SINT_TO_FP(ZEXT(vXi1~31 to vXi32))
54170 // UINT_TO_FP(vXi33~63) -> SINT_TO_FP(ZEXT(vXi33~63 to vXi64))
54171 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54172 unsigned ScalarSize = InVT.getScalarSizeInBits();
54173 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54174 ScalarSize >= 64)
54175 return SDValue();
54176 SDLoc dl(N);
54177 EVT DstVT =
54178 EVT::getVectorVT(*DAG.getContext(),
54179 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54180 : ScalarSize < 32 ? MVT::i32
54181 : MVT::i64,
54182 InVT.getVectorNumElements());
54183 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54184 if (IsStrict)
54185 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54186 {N->getOperand(0), P});
54187 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54188 }
54189
54190 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
54191 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
54192 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
54193 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54194 VT.getScalarType() != MVT::f16) {
54195 SDLoc dl(N);
54196 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54197 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
54198
54199 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
54200 if (IsStrict)
54201 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54202 {N->getOperand(0), P});
54203 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54204 }
54205
54206 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
54207 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
54208 // the optimization here.
54209 SDNodeFlags Flags = N->getFlags();
54210 if (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0)) {
54211 if (IsStrict)
54212 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
54213 {N->getOperand(0), Op0});
54214 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
54215 }
54216
54217 return SDValue();
54218}
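
// Illustrative sketch (editorial addition, not from the upstream file): after a
// zero extension the value is known non-negative, so a signed conversion of the
// widened value matches the original unsigned conversion, which is what lets
// UINT_TO_FP be rewritten as SINT_TO_FP above.
namespace {
constexpr bool uintToFPViaSIntHolds(unsigned short X) {
  return static_cast<double>(X) ==                  // UINT_TO_FP(X)
         static_cast<double>(static_cast<int>(X));  // SINT_TO_FP(ZEXT(X to i32))
}
static_assert(uintToFPViaSIntHolds(0) && uintToFPViaSIntHolds(65535),
              "UINT_TO_FP equals SINT_TO_FP once the input is zero-extended");
} // namespace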
54219
54220static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
54221 TargetLowering::DAGCombinerInfo &DCI,
54222 const X86Subtarget &Subtarget) {
54223 // First try to optimize away the conversion entirely when it's
54224 // conditionally from a constant. Vectors only.
54225 bool IsStrict = N->isStrictFPOpcode();
54226 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
54227 return Res;
54228
54229 // Now move on to more general possibilities.
54230 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54231 EVT VT = N->getValueType(0);
54232 EVT InVT = Op0.getValueType();
54233
54234 // Using i16 as an intermediate type is a bad idea, unless we have HW support
54235 // for it. Therefore for type sizes equal or smaller than 32 just go with i32.
54236 // if hasFP16 support:
54237 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
54238 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
54239 // else
54240 // SINT_TO_FP(vXi1~31) -> SINT_TO_FP(SEXT(vXi1~31 to vXi32))
54241 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
54242 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
54243 unsigned ScalarSize = InVT.getScalarSizeInBits();
54244 if ((ScalarSize == 16 && Subtarget.hasFP16()) || ScalarSize == 32 ||
54245 ScalarSize >= 64)
54246 return SDValue();
54247 SDLoc dl(N);
54248 EVT DstVT =
54249 EVT::getVectorVT(*DAG.getContext(),
54250 (Subtarget.hasFP16() && ScalarSize < 16) ? MVT::i16
54251 : ScalarSize < 32 ? MVT::i32
54252 : MVT::i64,
54253 InVT.getVectorNumElements());
54254 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54255 if (IsStrict)
54256 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54257 {N->getOperand(0), P});
54258 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54259 }
54260
54261 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
54262 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
54263 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
54264 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
54265 VT.getScalarType() != MVT::f16) {
54266 SDLoc dl(N);
54267 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
54268 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
54269 if (IsStrict)
54270 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54271 {N->getOperand(0), P});
54272 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
54273 }
54274
54275 // Without AVX512DQ we only support i64 to float scalar conversion. For both
54276 // vectors and scalars, see if we know that the upper bits are all the sign
54277 // bit, in which case we can truncate the input to i32 and convert from that.
54278 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
54279 unsigned BitWidth = InVT.getScalarSizeInBits();
54280 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
54281 if (NumSignBits >= (BitWidth - 31)) {
54282 EVT TruncVT = MVT::i32;
54283 if (InVT.isVector())
54284 TruncVT = InVT.changeVectorElementType(TruncVT);
54285 SDLoc dl(N);
54286 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
54287 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
54288 if (IsStrict)
54289 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
54290 {N->getOperand(0), Trunc});
54291 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
54292 }
54293 // If we're after legalize and the type is v2i32 we need to shuffle and
54294 // use CVTSI2P.
54295 assert(InVT == MVT::v2i64 && "Unexpected VT!");
54296 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
54297 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54298 { 0, 2, -1, -1 });
54299 if (IsStrict)
54300 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54301 {N->getOperand(0), Shuf});
54302 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54303 }
54304 }
54305
54306 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54307 // a 32-bit target where SSE doesn't support i64->FP operations.
54308 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54309 Op0.getOpcode() == ISD::LOAD) {
54310 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54311
54312 // This transformation is not supported if the result type is f16 or f128.
54313 if (VT == MVT::f16 || VT == MVT::f128)
54314 return SDValue();
54315
54316 // If we have AVX512DQ we can use packed conversion instructions unless
54317 // the VT is f80.
54318 if (Subtarget.hasDQI() && VT != MVT::f80)
54319 return SDValue();
54320
54321 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54322 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54323 std::pair<SDValue, SDValue> Tmp =
54324 Subtarget.getTargetLowering()->BuildFILD(
54325 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54326 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54327 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54328 return Tmp.first;
54329 }
54330 }
54331
54332 if (IsStrict)
54333 return SDValue();
54334
54335 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54336 return V;
54337
54338 return SDValue();
54339}
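
// Illustrative sketch (editorial addition, not from the upstream file): when the
// upper bits of the i64 input are all copies of the sign bit, the value is
// i32-representable, so converting the truncated value is lossless, matching
// the ComputeNumSignBits-guarded fold above.
namespace {
constexpr bool sintToFPAfterTruncHolds(long long X) {
  return static_cast<double>(X) ==
         static_cast<double>(static_cast<int>(X)); // truncate to i32 first
}
static_assert(sintToFPAfterTruncHolds(-5) && sintToFPAfterTruncHolds(0x7fffffffLL),
              "i64 -> FP equals i32 -> FP when the value fits in i32");
} // namespace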
54340
54341static bool needCarryOrOverflowFlag(SDValue Flags) {
54342 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54343
54344 for (const SDNode *User : Flags->uses()) {
54345 X86::CondCode CC;
54346 switch (User->getOpcode()) {
54347 default:
54348 // Be conservative.
54349 return true;
54350 case X86ISD::SETCC:
54351 case X86ISD::SETCC_CARRY:
54352 CC = (X86::CondCode)User->getConstantOperandVal(0);
54353 break;
54354 case X86ISD::BRCOND:
54355 case X86ISD::CMOV:
54356 CC = (X86::CondCode)User->getConstantOperandVal(2);
54357 break;
54358 }
54359
54360 switch (CC) {
54361 // clang-format off
54362 default: break;
54363 case X86::COND_A: case X86::COND_AE:
54364 case X86::COND_B: case X86::COND_BE:
54365 case X86::COND_O: case X86::COND_NO:
54366 case X86::COND_G: case X86::COND_GE:
54367 case X86::COND_L: case X86::COND_LE:
54368 return true;
54369 // clang-format on
54370 }
54371 }
54372
54373 return false;
54374}
54375
54376static bool onlyZeroFlagUsed(SDValue Flags) {
54377 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54378
54379 for (const SDNode *User : Flags->uses()) {
54380 unsigned CCOpNo;
54381 switch (User->getOpcode()) {
54382 default:
54383 // Be conservative.
54384 return false;
54385 case X86ISD::SETCC:
54386 case X86ISD::SETCC_CARRY:
54387 CCOpNo = 0;
54388 break;
54389 case X86ISD::BRCOND:
54390 case X86ISD::CMOV:
54391 CCOpNo = 2;
54392 break;
54393 }
54394
54395 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54396 if (CC != X86::COND_E && CC != X86::COND_NE)
54397 return false;
54398 }
54399
54400 return true;
54401}
54402
54403static SDValue combineCMP(SDNode *N, SelectionDAG &DAG,
54404 const X86Subtarget &Subtarget) {
54405 // Only handle test patterns.
54406 if (!isNullConstant(N->getOperand(1)))
54407 return SDValue();
54408
54409 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54410 // and use its flags directly.
54411 // TODO: Maybe we should try promoting compares that only use the zero flag
54412 // first if we can prove the upper bits with computeKnownBits?
54413 SDLoc dl(N);
54414 SDValue Op = N->getOperand(0);
54415 EVT VT = Op.getValueType();
54416 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54417
54418 // If we have a constant logical shift that's only used in a comparison
54419 // against zero turn it into an equivalent AND. This allows turning it into
54420 // a TEST instruction later.
54421 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54422 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54423 onlyZeroFlagUsed(SDValue(N, 0))) {
54424 unsigned BitWidth = VT.getSizeInBits();
54425 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54426 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54427 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54428 APInt Mask = Op.getOpcode() == ISD::SRL
54429 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54430 : APInt::getLowBitsSet(BitWidth, MaskBits);
54431 if (Mask.isSignedIntN(32)) {
54432 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54433 DAG.getConstant(Mask, dl, VT));
54434 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54435 DAG.getConstant(0, dl, VT));
54436 }
54437 }
54438 }
54439
54440 // If we're extracting from an AVX512 bool vector and comparing against zero,
54441 // then try to just bitcast the vector to an integer to use TEST/BT directly.
54442 // (and (extract_elt (kshiftr vXi1, C), 0), 1) -> (and (bc vXi1), 1<<C)
54443 if (Op.getOpcode() == ISD::AND && isOneConstant(Op.getOperand(1)) &&
54444 Op.hasOneUse() && onlyZeroFlagUsed(SDValue(N, 0))) {
54445 SDValue Src = Op.getOperand(0);
54446 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54447 isNullConstant(Src.getOperand(1)) &&
54448 Src.getOperand(0).getValueType().getScalarType() == MVT::i1) {
54449 SDValue BoolVec = Src.getOperand(0);
54450 unsigned ShAmt = 0;
54451 if (BoolVec.getOpcode() == X86ISD::KSHIFTR) {
54452 ShAmt = BoolVec.getConstantOperandVal(1);
54453 BoolVec = BoolVec.getOperand(0);
54454 }
54455 BoolVec = widenMaskVector(BoolVec, false, Subtarget, DAG, dl);
54456 EVT VecVT = BoolVec.getValueType();
54457 unsigned BitWidth = VecVT.getVectorNumElements();
54458 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), BitWidth);
54459 if (TLI.isTypeLegal(VecVT) && TLI.isTypeLegal(BCVT)) {
54460 APInt Mask = APInt::getOneBitSet(BitWidth, ShAmt);
54461 Op = DAG.getBitcast(BCVT, BoolVec);
54462 Op = DAG.getNode(ISD::AND, dl, BCVT, Op,
54463 DAG.getConstant(Mask, dl, BCVT));
54464 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54465 DAG.getConstant(0, dl, BCVT));
54466 }
54467 }
54468 }
54469
54470 // Peek through any zero-extend if we're only testing for a zero result.
54471 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54472 SDValue Src = Op.getOperand(0);
54473 EVT SrcVT = Src.getValueType();
54474 if (SrcVT.getScalarSizeInBits() >= 8 && TLI.isTypeLegal(SrcVT))
54475 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54476 DAG.getConstant(0, dl, SrcVT));
54477 }
54478
54479 // Look for a truncate.
54480 if (Op.getOpcode() != ISD::TRUNCATE)
54481 return SDValue();
54482
54483 SDValue Trunc = Op;
54484 Op = Op.getOperand(0);
54485
54486 // See if we can compare with zero against the truncation source,
54487 // which should help using the Z flag from many ops. Only do this for
54488 // i32 truncated op to prevent partial-reg compares of promoted ops.
54489 EVT OpVT = Op.getValueType();
54490 APInt UpperBits =
54491 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54492 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54493 onlyZeroFlagUsed(SDValue(N, 0))) {
54494 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54495 DAG.getConstant(0, dl, OpVT));
54496 }
54497
54498 // After this the truncate and arithmetic op must have a single use.
54499 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54500 return SDValue();
54501
54502 unsigned NewOpc;
54503 switch (Op.getOpcode()) {
54504 default: return SDValue();
54505 case ISD::AND:
54506 // Skip AND with a constant. We have special handling for AND with an
54507 // immediate during isel to generate test instructions.
54508 if (isa<ConstantSDNode>(Op.getOperand(1)))
54509 return SDValue();
54510 NewOpc = X86ISD::AND;
54511 break;
54512 case ISD::OR: NewOpc = X86ISD::OR; break;
54513 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54514 case ISD::ADD:
54515 // If the carry or overflow flag is used, we can't truncate.
54516 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54517 return SDValue();
54518 NewOpc = X86ISD::ADD;
54519 break;
54520 case ISD::SUB:
54521 // If the carry or overflow flag is used, we can't truncate.
54522 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54523 return SDValue();
54524 NewOpc = X86ISD::SUB;
54525 break;
54526 }
54527
54528 // We found an op we can narrow. Truncate its inputs.
54529 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54530 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54531
54532 // Use a X86 specific opcode to avoid DAG combine messing with it.
54533 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54534 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54535
54536 // For AND, keep a CMP so that we can match the test pattern.
54537 if (NewOpc == X86ISD::AND)
54538 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54539 DAG.getConstant(0, dl, VT));
54540
54541 // Return the flags.
54542 return Op.getValue(1);
54543}
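
// Illustrative sketch (editorial addition, not from the upstream file): a
// constant logical shift compared against zero only depends on the bits that
// survive the shift, so the compare can be rewritten as an AND against a
// high-bits or low-bits mask (and later selected as TEST), as done above.
namespace {
constexpr bool shiftCmpVsMaskCmpAgree(unsigned X, unsigned ShAmt) {
  unsigned HighMask = ~0u << ShAmt; // bits that survive X >> ShAmt
  unsigned LowMask = ~0u >> ShAmt;  // bits that survive X << ShAmt
  return ((X >> ShAmt) == 0) == ((X & HighMask) == 0) &&
         ((X << ShAmt) == 0) == ((X & LowMask) == 0);
}
static_assert(shiftCmpVsMaskCmpAgree(0x80000001u, 4) &&
                  shiftCmpVsMaskCmpAgree(0x10u, 5) &&
                  shiftCmpVsMaskCmpAgree(0u, 31),
              "cmp(shift X, C), 0 is equivalent to cmp(and X, Mask), 0");
} // namespace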
54544
54545static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54546 TargetLowering::DAGCombinerInfo &DCI) {
54547 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54548 "Expected X86ISD::ADD or X86ISD::SUB");
54549
54550 SDLoc DL(N);
54551 SDValue LHS = N->getOperand(0);
54552 SDValue RHS = N->getOperand(1);
54553 MVT VT = LHS.getSimpleValueType();
54554 bool IsSub = X86ISD::SUB == N->getOpcode();
54555 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54556
54557 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54558 if (!N->hasAnyUseOfValue(1)) {
54559 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54560 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54561 }
54562
54563 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54564 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54565 SDValue Ops[] = {N0, N1};
54566 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54567 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54568 SDValue Op(N, 0);
54569 if (Negate)
54570 Op = DAG.getNegative(Op, DL, VT);
54571 DCI.CombineTo(GenericAddSub, Op);
54572 }
54573 };
54574 MatchGeneric(LHS, RHS, false);
54575 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54576
54577 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54578 // EFLAGS result doesn't change.
54579 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54580 /*ZeroSecondOpOnly*/ true);
54581}
54582
54583static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54584 SDValue LHS = N->getOperand(0);
54585 SDValue RHS = N->getOperand(1);
54586 SDValue BorrowIn = N->getOperand(2);
54587
54588 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54589 MVT VT = N->getSimpleValueType(0);
54590 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54591 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54592 }
54593
54594 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54595 // iff the flag result is dead.
54596 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54597 !N->hasAnyUseOfValue(1))
54598 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54599 LHS.getOperand(1), BorrowIn);
54600
54601 return SDValue();
54602}
54603
54604// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54605static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54606 TargetLowering::DAGCombinerInfo &DCI) {
54607 SDValue LHS = N->getOperand(0);
54608 SDValue RHS = N->getOperand(1);
54609 SDValue CarryIn = N->getOperand(2);
54610 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54611 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54612
54613 // Canonicalize constant to RHS.
54614 if (LHSC && !RHSC)
54615 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54616 CarryIn);
54617
54618 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54619 // the result is either zero or one (depending on the input carry bit).
54620 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54621 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54622 // We don't have a good way to replace an EFLAGS use, so only do this when
54623 // dead right now.
54624 SDValue(N, 1).use_empty()) {
54625 SDLoc DL(N);
54626 EVT VT = N->getValueType(0);
54627 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54628 SDValue Res1 = DAG.getNode(
54629 ISD::AND, DL, VT,
54630 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54631 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54632 DAG.getConstant(1, DL, VT));
54633 return DCI.CombineTo(N, Res1, CarryOut);
54634 }
54635
54636 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54637 // iff the flag result is dead.
54638 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54639 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54640 SDLoc DL(N);
54641 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54642 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54643 DAG.getConstant(0, DL, LHS.getValueType()),
54644 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54645 }
54646
54647 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54648 MVT VT = N->getSimpleValueType(0);
54649 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54650 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54651 }
54652
54653 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54654 // iff the flag result is dead.
54655 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54656 !N->hasAnyUseOfValue(1))
54657 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54658 LHS.getOperand(1), CarryIn);
54659
54660 return SDValue();
54661}
54662
54663static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
54664 const SDLoc &DL, EVT VT,
54665 const X86Subtarget &Subtarget) {
54666 // Example of pattern we try to detect:
54667 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54668 //(add (build_vector (extract_elt t, 0),
54669 // (extract_elt t, 2),
54670 // (extract_elt t, 4),
54671 // (extract_elt t, 6)),
54672 // (build_vector (extract_elt t, 1),
54673 // (extract_elt t, 3),
54674 // (extract_elt t, 5),
54675 // (extract_elt t, 7)))
54676
54677 if (!Subtarget.hasSSE2())
54678 return SDValue();
54679
54680 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54681 Op1.getOpcode() != ISD::BUILD_VECTOR)
54682 return SDValue();
54683
54684 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54685 VT.getVectorNumElements() < 4 ||
54686 !isPowerOf2_32(VT.getVectorNumElements()))
54687 return SDValue();
54688
54689 // Check if one of Op0,Op1 is of the form:
54690 // (build_vector (extract_elt Mul, 0),
54691 // (extract_elt Mul, 2),
54692 // (extract_elt Mul, 4),
54693 // ...
54694 // the other is of the form:
54695 // (build_vector (extract_elt Mul, 1),
54696 // (extract_elt Mul, 3),
54697 // (extract_elt Mul, 5),
54698 // ...
54699 // and identify Mul.
54700 SDValue Mul;
54701 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54702 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54703 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54704 // TODO: Be more tolerant to undefs.
54705 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54706 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54707 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54708 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54709 return SDValue();
54710 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54711 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54712 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54713 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54714 if (!Const0L || !Const1L || !Const0H || !Const1H)
54715 return SDValue();
54716 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54717 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54718 // Commutativity of mul allows factors of a product to reorder.
54719 if (Idx0L > Idx1L)
54720 std::swap(Idx0L, Idx1L);
54721 if (Idx0H > Idx1H)
54722 std::swap(Idx0H, Idx1H);
54723 // Commutativity of add allows pairs of factors to reorder.
54724 if (Idx0L > Idx0H) {
54725 std::swap(Idx0L, Idx0H);
54726 std::swap(Idx1L, Idx1H);
54727 }
54728 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54729 Idx1H != 2 * i + 3)
54730 return SDValue();
54731 if (!Mul) {
54732 // First time an extract_elt's source vector is visited. Must be a MUL
54733 // with twice the number of vector elements of the BUILD_VECTOR.
54734 // Both extracts must be from same MUL.
54735 Mul = Op0L->getOperand(0);
54736 if (Mul->getOpcode() != ISD::MUL ||
54737 Mul.getValueType().getVectorNumElements() != 2 * e)
54738 return SDValue();
54739 }
54740 // Check that the extract is from the same MUL previously seen.
54741 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54742 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54743 return SDValue();
54744 }
54745
54746 // Check if the Mul source can be safely shrunk.
54747 ShrinkMode Mode;
54748 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54749 Mode == ShrinkMode::MULU16)
54750 return SDValue();
54751
54752 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54753 VT.getVectorNumElements() * 2);
54754 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54755 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54756
54757 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54758 ArrayRef<SDValue> Ops) {
54759 EVT InVT = Ops[0].getValueType();
54760 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54761 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54762 InVT.getVectorNumElements() / 2);
54763 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54764 };
54765 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54766}
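
// Illustrative sketch (editorial addition, not from the upstream file): scalar
// reference for one 32-bit lane of PMADDWD, the operation the
// build_vector-of-extracts pattern above is matched into. Each lane
// sign-extends two i16 pairs, multiplies them, and adds the two products.
namespace {
constexpr int pmaddwdLaneRef(short A0, short A1, short B0, short B1) {
  return static_cast<int>(A0) * static_cast<int>(B0) +
         static_cast<int>(A1) * static_cast<int>(B1);
}
static_assert(pmaddwdLaneRef(3, -4, 5, 6) == 3 * 5 + (-4) * 6,
              "PMADDWD lane = a[2i]*b[2i] + a[2i+1]*b[2i+1] with i16 -> i32");
} // namespace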
54767
54768// Attempt to turn this pattern into PMADDWD.
54769// (add (mul (sext (build_vector)), (sext (build_vector))),
54770// (mul (sext (build_vector)), (sext (build_vector)))
54771static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
54772 const SDLoc &DL, EVT VT,
54773 const X86Subtarget &Subtarget) {
54774 if (!Subtarget.hasSSE2())
54775 return SDValue();
54776
54777 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54778 return SDValue();
54779
54780 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54781 VT.getVectorNumElements() < 4 ||
54782 !isPowerOf2_32(VT.getVectorNumElements()))
54783 return SDValue();
54784
54785 SDValue N00 = N0.getOperand(0);
54786 SDValue N01 = N0.getOperand(1);
54787 SDValue N10 = N1.getOperand(0);
54788 SDValue N11 = N1.getOperand(1);
54789
54790 // All inputs need to be sign extends.
54791 // TODO: Support ZERO_EXTEND from known positive?
54792 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54793 N01.getOpcode() != ISD::SIGN_EXTEND ||
54794 N10.getOpcode() != ISD::SIGN_EXTEND ||
54795 N11.getOpcode() != ISD::SIGN_EXTEND)
54796 return SDValue();
54797
54798 // Peek through the extends.
54799 N00 = N00.getOperand(0);
54800 N01 = N01.getOperand(0);
54801 N10 = N10.getOperand(0);
54802 N11 = N11.getOperand(0);
54803
54804 // Must be extending from vXi16.
54805 EVT InVT = N00.getValueType();
54806 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54807 N10.getValueType() != InVT || N11.getValueType() != InVT)
54808 return SDValue();
54809
54810 // All inputs should be build_vectors.
54811 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54812 N01.getOpcode() != ISD::BUILD_VECTOR ||
54813 N10.getOpcode() != ISD::BUILD_VECTOR ||
54814 N11.getOpcode() != ISD::BUILD_VECTOR)
54815 return SDValue();
54816
54817 // For each element, we need to ensure we have an odd element from one vector
54818 // multiplied by the odd element of another vector and the even element from
54819 // one of the same vectors being multiplied by the even element from the
54820 // other vector. So we need to make sure for each element i, this operator
54821 // is being performed:
54822 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54823 SDValue In0, In1;
54824 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54825 SDValue N00Elt = N00.getOperand(i);
54826 SDValue N01Elt = N01.getOperand(i);
54827 SDValue N10Elt = N10.getOperand(i);
54828 SDValue N11Elt = N11.getOperand(i);
54829 // TODO: Be more tolerant to undefs.
54830 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54831 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54832 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54833 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54834 return SDValue();
54835 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54836 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54837 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54838 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54839 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54840 return SDValue();
54841 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54842 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54843 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54844 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54845 // Add is commutative so indices can be reordered.
54846 if (IdxN00 > IdxN10) {
54847 std::swap(IdxN00, IdxN10);
54848 std::swap(IdxN01, IdxN11);
54849 }
54850 // N0 indices must be the even elements. N1 indices must be the next odd elements.
54851 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54852 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54853 return SDValue();
54854 SDValue N00In = N00Elt.getOperand(0);
54855 SDValue N01In = N01Elt.getOperand(0);
54856 SDValue N10In = N10Elt.getOperand(0);
54857 SDValue N11In = N11Elt.getOperand(0);
54858
54859 // First time we find an input capture it.
54860 if (!In0) {
54861 In0 = N00In;
54862 In1 = N01In;
54863
54864 // The input vectors must be at least as wide as the output.
54865 // If they are larger than the output, we extract subvector below.
54866 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
54867 In1.getValueSizeInBits() < VT.getSizeInBits())
54868 return SDValue();
54869 }
54870 // Mul is commutative so the input vectors can be in any order.
54871 // Canonicalize to make the compares easier.
54872 if (In0 != N00In)
54873 std::swap(N00In, N01In);
54874 if (In0 != N10In)
54875 std::swap(N10In, N11In);
54876 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54877 return SDValue();
54878 }
54879
54880 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54881 ArrayRef<SDValue> Ops) {
54882 EVT OpVT = Ops[0].getValueType();
54883 assert(OpVT.getScalarType() == MVT::i16 &&
54884 "Unexpected scalar element type");
54885 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54886 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54887 OpVT.getVectorNumElements() / 2);
54888 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54889 };
54890
54891 // If the output is narrower than an input, extract the low part of the input
54892 // vector.
54893 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54894 VT.getVectorNumElements() * 2);
54895 if (OutVT16.bitsLT(In0.getValueType())) {
54896 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54897 DAG.getIntPtrConstant(0, DL));
54898 }
54899 if (OutVT16.bitsLT(In1.getValueType())) {
54900 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54901 DAG.getIntPtrConstant(0, DL));
54902 }
54903 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54904 PMADDBuilder);
54905}
54906
54907// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54908// If upper element in each pair of both VPMADDWD are zero then we can merge
54909// the operand elements and use the implicit add of VPMADDWD.
54910// TODO: Add support for VPMADDUBSW (which isn't commutable).
54911static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54912 const SDLoc &DL, EVT VT) {
54913 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54914 return SDValue();
54915
54916 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54917 if (VT.getSizeInBits() > 128)
54918 return SDValue();
54919
54920 unsigned NumElts = VT.getVectorNumElements();
54921 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54922 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54923 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54924
54925 bool Op0HiZero =
54926 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54927 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54928 bool Op1HiZero =
54929 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54930 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54931
54932 // TODO: Check for zero lower elements once we have actual codegen that
54933 // creates them.
54934 if (!Op0HiZero || !Op1HiZero)
54935 return SDValue();
54936
54937 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54938 SmallVector<int> Mask;
54939 for (int i = 0; i != (int)NumElts; ++i) {
54940 Mask.push_back(2 * i);
54941 Mask.push_back(2 * (i + NumElts));
54942 }
54943
54944 SDValue LHS =
54945 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54946 SDValue RHS =
54947 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54948 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54949}
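
// Illustrative sketch (editorial addition, not from the upstream file): when the
// odd (upper) element of every 16-bit pair is known zero, each VPMADDWD lane
// degenerates to a single product, so the sum of two such lanes equals one
// VPMADDWD lane whose pairs interleave the surviving elements, which is the
// merge performed above.
namespace {
constexpr int pmaddwdPairSum(short A0, short A1, short B0, short B1) {
  return static_cast<int>(A0) * static_cast<int>(B0) +
         static_cast<int>(A1) * static_cast<int>(B1);
}
static_assert(pmaddwdPairSum(7, 0, -3, 0) + pmaddwdPairSum(-2, 0, 9, 0) ==
                  pmaddwdPairSum(7, -2, -3, 9),
              "two zero-hi VPMADDWD lanes merge into one interleaved lane");
} // namespace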
54950
54951/// CMOV of constants requires materializing constant operands in registers.
54952/// Try to fold those constants into an 'add' instruction to reduce instruction
54953/// count. We do this with CMOV rather than the generic 'select' because there are
54954/// earlier folds that may be used to turn select-of-constants into logic hacks.
54955static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54956 const X86Subtarget &Subtarget) {
54957 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54958 // better because we eliminate 1-2 instructions. This transform is still
54959 // an improvement without zero operands because we trade 2 move constants and
54960 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54961 // immediate asm operands (fit in 32-bits).
54962 auto isSuitableCmov = [](SDValue V) {
54963 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54964 return false;
54965 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54966 !isa<ConstantSDNode>(V.getOperand(1)))
54967 return false;
54968 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54969 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54970 V.getConstantOperandAPInt(1).isSignedIntN(32));
54971 };
54972
54973 // Match an appropriate CMOV as the first operand of the add.
54974 SDValue Cmov = N->getOperand(0);
54975 SDValue OtherOp = N->getOperand(1);
54976 if (!isSuitableCmov(Cmov))
54977 std::swap(Cmov, OtherOp);
54978 if (!isSuitableCmov(Cmov))
54979 return SDValue();
54980
54981 // Don't remove a load folding opportunity for the add. That would neutralize
54982 // any improvements from removing constant materializations.
54983 if (X86::mayFoldLoad(OtherOp, Subtarget))
54984 return SDValue();
54985
54986 EVT VT = N->getValueType(0);
54987 SDLoc DL(N);
54988 SDValue FalseOp = Cmov.getOperand(0);
54989 SDValue TrueOp = Cmov.getOperand(1);
54990
54991 // We will push the add through the select, but we can potentially do better
54992 // if we know there is another add in the sequence and this is pointer math.
54993 // In that case, we can absorb an add into the trailing memory op and avoid
54994 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54995 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54996 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54997 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54998 all_of(N->uses(), [&](SDNode *Use) {
54999 auto *MemNode = dyn_cast<MemSDNode>(Use);
55000 return MemNode && MemNode->getBasePtr().getNode() == N;
55001 })) {
55002 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55003 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55004 // it is possible that choosing op1 might be better.
55005 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55006 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55007 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55008 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55009 Cmov.getOperand(2), Cmov.getOperand(3));
55010 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55011 }
55012
55013 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55014 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55015 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55016 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55017 Cmov.getOperand(3));
55018}
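
// Illustrative sketch (editorial addition, not from the upstream file): the add
// distributes over the CMOV, so the constants can be folded into the adds (or
// LEAs) instead of being materialized separately, which is the point of the
// transform above.
namespace {
constexpr int addOfCmovConsts(bool Cond, int C1, int C2, int Other) {
  return (Cond ? C1 : C2) + Other;           // add (cmov C1, C2), Other
}
constexpr int cmovOfAdds(bool Cond, int C1, int C2, int Other) {
  return Cond ? (Other + C1) : (Other + C2); // cmov (add Other, C1), (add Other, C2)
}
static_assert(addOfCmovConsts(true, 10, -7, 5) == cmovOfAdds(true, 10, -7, 5) &&
                  addOfCmovConsts(false, 10, -7, 5) == cmovOfAdds(false, 10, -7, 5),
              "an add of a CMOV of constants equals a CMOV of adds");
} // namespace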
55019
55020static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55021 TargetLowering::DAGCombinerInfo &DCI,
55022 const X86Subtarget &Subtarget) {
55023 EVT VT = N->getValueType(0);
55024 SDValue Op0 = N->getOperand(0);
55025 SDValue Op1 = N->getOperand(1);
55026 SDLoc DL(N);
55027
55028 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55029 return Select;
55030
55031 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55032 return MAdd;
55033 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55034 return MAdd;
55035 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55036 return MAdd;
55037
55038 // Try to synthesize horizontal adds from adds of shuffles.
55039 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55040 return V;
55041
55042 // add(psadbw(X,0),psadbw(Y,0)) -> psadbw(add(X,Y),0)
55043 // iff X and Y won't overflow.
55044 if (Op0.getOpcode() == X86ISD::PSADBW && Op1.getOpcode() == X86ISD::PSADBW &&
55045 ISD::isBuildVectorAllZeros(Op0.getOperand(1).getNode()) &&
55046 ISD::isBuildVectorAllZeros(Op1.getOperand(1).getNode())) {
55047 if (DAG.willNotOverflowAdd(false, Op0.getOperand(0), Op1.getOperand(0))) {
55048 MVT OpVT = Op0.getOperand(1).getSimpleValueType();
55049 SDValue Sum =
55050 DAG.getNode(ISD::ADD, DL, OpVT, Op0.getOperand(0), Op1.getOperand(0));
55051 return DAG.getNode(X86ISD::PSADBW, DL, VT, Sum,
55052 getZeroVector(OpVT, Subtarget, DAG, DL));
55053 }
55054 }
55055
55056 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55057 // (sub Y, (sext (vXi1 X))).
55058 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55059 // generic DAG combine without a legal type check, but adding this there
55060 // caused regressions.
55061 if (VT.isVector()) {
55062 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55063 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55064 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55065 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55066 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55067 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55068 }
55069
55070 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55071 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55072 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55073 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55074 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55075 }
55076 }
55077
55078 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55079 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55080 X86::isZeroNode(Op0.getOperand(1))) {
55081 assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55082 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55083 Op0.getOperand(0), Op0.getOperand(2));
55084 }
55085
55086 return combineAddOrSubToADCOrSBB(N, DAG);
55087}
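
// Illustrative sketch (editorial addition, not from the upstream file): for a
// boolean lane b, zext(b) is {0,1} and sext(b) is {0,-1} == -zext(b), so
// (add Y, zext(b)) can be rewritten as (sub Y, sext(b)), as done above when
// vXi1 is a legal type.
namespace {
constexpr bool addZextEqualsSubSext(int Y, bool B) {
  int ZExt = B ? 1 : 0;
  int SExt = B ? -1 : 0;
  return Y + ZExt == Y - SExt;
}
static_assert(addZextEqualsSubSext(42, true) && addZextEqualsSubSext(42, false),
              "add(Y, zext(vXi1)) == sub(Y, sext(vXi1)) lane-wise");
} // namespace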
55088
55089// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55090// condition comes from the subtract node that produced -X. This matches the
55091// cmov expansion for absolute value. By swapping the operands we convert abs
55092// to nabs.
55093static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55094 SDValue N0 = N->getOperand(0);
55095 SDValue N1 = N->getOperand(1);
55096
55097 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55098 return SDValue();
55099
55100 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55101 if (CC != X86::COND_S && CC != X86::COND_NS)
55102 return SDValue();
55103
55104 // Condition should come from a negate operation.
55105 SDValue Cond = N1.getOperand(3);
55106 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55107 return SDValue();
55108 assert(Cond.getResNo() == 1 && "Unexpected result number");
55109
55110 // Get the X and -X from the negate.
55111 SDValue NegX = Cond.getValue(0);
55112 SDValue X = Cond.getOperand(1);
55113
55114 SDValue FalseOp = N1.getOperand(0);
55115 SDValue TrueOp = N1.getOperand(1);
55116
55117 // Cmov operands should be X and NegX. Order doesn't matter.
55118 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55119 return SDValue();
55120
55121 // Build a new CMOV with the operands swapped.
55122 SDLoc DL(N);
55123 MVT VT = N->getSimpleValueType(0);
55124 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55125 N1.getOperand(2), Cond);
55126 // Convert sub to add.
55127 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55128}
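
// Illustrative sketch (editorial addition, not from the upstream file): the cmov
// above computes |X|; swapping its operands yields -|X|, so the subtract can be
// turned into an add without changing the result.
namespace {
constexpr bool subAbsEqualsAddNabs(int Y, int X) {
  int Abs = X >= 0 ? X : -X;  // cmovns X, -X (abs)
  int Nabs = X >= 0 ? -X : X; // operands swapped (nabs)
  return Y - Abs == Y + Nabs;
}
static_assert(subAbsEqualsAddNabs(10, -3) && subAbsEqualsAddNabs(10, 3),
              "sub(Y, abs(X)) == add(Y, nabs(X))");
} // namespace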
55129
55130static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55131 SDValue Op0 = N->getOperand(0);
55132 SDValue Op1 = N->getOperand(1);
55133
55134 // (sub C (zero_extend (setcc)))
55135 // =>
55136 // (add (zero_extend (setcc inverted) C-1)) if C is a nonzero immediate
55137 // Don't disturb (sub 0 setcc), which is easily done with neg.
55138 EVT VT = N->getValueType(0);
55139 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55140 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55141 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55142 Op1.getOperand(0).hasOneUse()) {
55143 SDValue SetCC = Op1.getOperand(0);
55144 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55145 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55146 APInt NewImm = Op0C->getAPIntValue() - 1;
55147 SDLoc DL(Op1);
55148 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55149 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55150 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55151 DAG.getConstant(NewImm, DL, VT));
55152 }
55153
55154 return SDValue();
55155}
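
// Illustrative sketch (editorial addition, not from the upstream file): for a
// setcc result b in {0,1}, C - b == (1 - b) + (C - 1), and 1 - b is the
// inverted setcc, which is exactly the rewrite performed above.
namespace {
constexpr bool subSetccRewriteHolds(int C, bool B) {
  return C - (B ? 1 : 0) == (B ? 0 : 1) + (C - 1);
}
static_assert(subSetccRewriteHolds(100, true) && subSetccRewriteHolds(100, false),
              "sub(C, zext(setcc)) == add(zext(inverted setcc), C - 1)");
} // namespace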
55156
55157static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55158 TargetLowering::DAGCombinerInfo &DCI,
55159 const X86Subtarget &Subtarget) {
55160 SDValue Op0 = N->getOperand(0);
55161 SDValue Op1 = N->getOperand(1);
55162
55163 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55164 auto IsNonOpaqueConstant = [&](SDValue Op) {
55165 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55166 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55167 return !Cst->isOpaque();
55168 return true;
55169 }
55170 return false;
55171 };
55172
55173 // X86 can't encode an immediate LHS of a sub. See if we can push the
55174 // negation into a preceding instruction. If the RHS of the sub is a XOR with
55175 // one use and a constant, invert the immediate, saving one register.
55176 // However, ignore cases where C1 is 0, as those will become a NEG.
55177 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
55178 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55179 !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
55180 Op1->hasOneUse()) {
55181 SDLoc DL(N);
55182 EVT VT = Op0.getValueType();
55183 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55184 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55185 SDValue NewAdd =
55186 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55187 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55188 }
55189
55190 if (SDValue V = combineSubABS(N, DAG))
55191 return V;
55192
55193 // Try to synthesize horizontal subs from subs of shuffles.
55194 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55195 return V;
55196
55197 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55198 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55199 X86::isZeroNode(Op1.getOperand(1))) {
55200 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55201 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55202 Op1.getOperand(0), Op1.getOperand(2));
55203 }
55204
55205 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55206 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
55207 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55208 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55209 assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55210 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55211 Op1.getOperand(1), Op1.getOperand(2));
55212 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55213 Op1.getOperand(0));
55214 }
55215
55216 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
55217 return V;
55218
55219 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
55220 return V;
55221
55222 return combineSubSetcc(N, DAG);
55223}
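
// Illustrative sketch (editorial addition, not from the upstream file): in
// two's complement -(X ^ C2) == (X ^ ~C2) + 1, so sub(C1, xor(X, C2)) equals
// add(xor(X, ~C2), C1 + 1); unsigned arithmetic is used here so the wraparound
// is well defined.
namespace {
constexpr bool subOfXorRewriteHolds(unsigned C1, unsigned X, unsigned C2) {
  return C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1);
}
static_assert(subOfXorRewriteHolds(0x100u, 0xDEADBEEFu, 0xFFu) &&
                  subOfXorRewriteHolds(1u, 0u, ~0u),
              "sub(C1, xor(X, C2)) == add(xor(X, ~C2), C1 + 1) modulo 2^n");
} // namespace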
55224
55225static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55226 const X86Subtarget &Subtarget) {
55227 MVT VT = N->getSimpleValueType(0);
55228 SDLoc DL(N);
55229
55230 if (N->getOperand(0) == N->getOperand(1)) {
55231 if (N->getOpcode() == X86ISD::PCMPEQ)
55232 return DAG.getConstant(-1, DL, VT);
55233 if (N->getOpcode() == X86ISD::PCMPGT)
55234 return DAG.getConstant(0, DL, VT);
55235 }
55236
55237 return SDValue();
55238}
55239
55240/// Helper that combines an array of subvector ops as if they were the operands
55241/// of an ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55242/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55243static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55244 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55245 TargetLowering::DAGCombinerInfo &DCI,
55246 const X86Subtarget &Subtarget) {
55247 assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55248 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55249
55250 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55251 return DAG.getUNDEF(VT);
55252
55253 if (llvm::all_of(Ops, [](SDValue Op) {
55254 return ISD::isBuildVectorAllZeros(Op.getNode());
55255 }))
55256 return getZeroVector(VT, Subtarget, DAG, DL);
55257
55258 SDValue Op0 = Ops[0];
55259 bool IsSplat = llvm::all_equal(Ops);
55260 unsigned NumOps = Ops.size();
55261 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55262 LLVMContext &Ctx = *DAG.getContext();
55263
55264 // Repeated subvectors.
55265 if (IsSplat &&
55266 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55267 // If this broadcast is inserted into both halves, use a larger broadcast.
55268 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55269 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55270
55271 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55272 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55273 (Subtarget.hasAVX2() ||
55274 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55275 VT.getScalarType(), Subtarget)))
55276 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55277 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55278 Op0.getOperand(0),
55279 DAG.getIntPtrConstant(0, DL)));
55280
55281 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55282 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55283 (Subtarget.hasAVX2() ||
55284 (EltSizeInBits >= 32 &&
55285 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55286 Op0.getOperand(0).getValueType() == VT.getScalarType())
55287 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55288
55289 // concat_vectors(extract_subvector(broadcast(x)),
55290 // extract_subvector(broadcast(x))) -> broadcast(x)
55291 // concat_vectors(extract_subvector(subv_broadcast(x)),
55292 // extract_subvector(subv_broadcast(x))) -> subv_broadcast(x)
55293 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55294 Op0.getOperand(0).getValueType() == VT) {
55295 SDValue SrcVec = Op0.getOperand(0);
55296 if (SrcVec.getOpcode() == X86ISD::VBROADCAST ||
55297 SrcVec.getOpcode() == X86ISD::VBROADCAST_LOAD)
55298 return Op0.getOperand(0);
55299 if (SrcVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55300 Op0.getValueType() == cast<MemSDNode>(SrcVec)->getMemoryVT())
55301 return Op0.getOperand(0);
55302 }
55303
55304 // concat_vectors(permq(x),permq(x)) -> permq(concat_vectors(x,x))
55305 if (Op0.getOpcode() == X86ISD::VPERMI && Subtarget.useAVX512Regs() &&
55306 !X86::mayFoldLoad(Op0.getOperand(0), Subtarget))
55307 return DAG.getNode(Op0.getOpcode(), DL, VT,
55308 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
55309 Op0.getOperand(0), Op0.getOperand(0)),
55310 Op0.getOperand(1));
55311 }
55312
55313 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55314 // Only handle concats of subvector high halves, which vperm2x128 is best at.
55315 // TODO: This should go in combineX86ShufflesRecursively eventually.
55316 if (VT.is256BitVector() && NumOps == 2) {
55317 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55318 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55319 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55320 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55321 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55322 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55323 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55324 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55325 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55326 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55327 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55328 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55329 DAG.getBitcast(VT, Src0.getOperand(0)),
55330 DAG.getBitcast(VT, Src1.getOperand(0)),
55331 DAG.getTargetConstant(0x31, DL, MVT::i8));
55332 }
55333 }
55334 }
55335
55336 // Repeated opcode.
55337 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55338 // but it currently struggles with different vector widths.
55339 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55340 return Op.getOpcode() == Op0.getOpcode() && Op.hasOneUse();
55341 })) {
55342 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55343 SmallVector<SDValue> Subs;
55344 for (SDValue SubOp : SubOps)
55345 Subs.push_back(SubOp.getOperand(I));
55346 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55347 };
55348 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55349 bool AllConstants = true;
55350 bool AllSubVectors = true;
55351 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55352 SDValue Sub = SubOps[I].getOperand(Op);
55353 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55354 SDValue BC = peekThroughBitcasts(Sub);
55355 AllConstants &= ISD::isBuildVectorOfConstantSDNodes(BC.getNode()) ||
55356 ISD::isBuildVectorOfConstantFPSDNodes(BC.getNode());
55357 AllSubVectors &= Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55358 Sub.getOperand(0).getValueType() == VT &&
55359 Sub.getConstantOperandAPInt(1) == (I * NumSubElts);
55360 }
55361 return AllConstants || AllSubVectors;
55362 };
55363
55364 switch (Op0.getOpcode()) {
55365 case X86ISD::VBROADCAST: {
55366 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55367 return Op.getOperand(0).getValueType().is128BitVector();
55368 })) {
55369 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55370 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55371 ConcatSubOperand(VT, Ops, 0),
55372 ConcatSubOperand(VT, Ops, 0));
55373 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55374 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55375 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55376 : X86ISD::PSHUFD,
55377 DL, VT, ConcatSubOperand(VT, Ops, 0),
55378 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55379 }
55380 break;
55381 }
55382 case X86ISD::MOVDDUP:
55383 case X86ISD::MOVSHDUP:
55384 case X86ISD::MOVSLDUP: {
55385 if (!IsSplat)
55386 return DAG.getNode(Op0.getOpcode(), DL, VT,
55387 ConcatSubOperand(VT, Ops, 0));
55388 break;
55389 }
55390 case X86ISD::SHUFP: {
55391 // Add SHUFPD support if/when necessary.
55392 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55393 llvm::all_of(Ops, [Op0](SDValue Op) {
55394 return Op.getOperand(2) == Op0.getOperand(2);
55395 })) {
55396 return DAG.getNode(Op0.getOpcode(), DL, VT,
55397 ConcatSubOperand(VT, Ops, 0),
55398 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55399 }
55400 break;
55401 }
55402 case X86ISD::UNPCKH:
55403 case X86ISD::UNPCKL: {
55404 // Don't concatenate build_vector patterns.
55405 if (!IsSplat && EltSizeInBits >= 32 &&
55406 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55407 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55408 none_of(Ops, [](SDValue Op) {
55409 return peekThroughBitcasts(Op.getOperand(0)).getOpcode() ==
55410 ISD::SCALAR_TO_VECTOR ||
55411 peekThroughBitcasts(Op.getOperand(1)).getOpcode() ==
55412 ISD::SCALAR_TO_VECTOR;
55413 })) {
55414 return DAG.getNode(Op0.getOpcode(), DL, VT,
55415 ConcatSubOperand(VT, Ops, 0),
55416 ConcatSubOperand(VT, Ops, 1));
55417 }
55418 break;
55419 }
55420 case X86ISD::PSHUFHW:
55421 case X86ISD::PSHUFLW:
55422 case X86ISD::PSHUFD:
55423 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55424 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55425 return DAG.getNode(Op0.getOpcode(), DL, VT,
55426 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55427 }
55428 [[fallthrough]];
55429 case X86ISD::VPERMILPI:
55430 if (!IsSplat && EltSizeInBits == 32 &&
55431 (VT.is256BitVector() ||
55432 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55433 all_of(Ops, [&Op0](SDValue Op) {
55434 return Op0.getOperand(1) == Op.getOperand(1);
55435 })) {
55436 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
55437 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
55438 Res =
55439 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
55440 return DAG.getBitcast(VT, Res);
55441 }
55442 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55443 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55444 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55445 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55446 return DAG.getNode(Op0.getOpcode(), DL, VT,
55447 ConcatSubOperand(VT, Ops, 0),
55448 DAG.getTargetConstant(Idx, DL, MVT::i8));
55449 }
55450 break;
55451 case X86ISD::PSHUFB:
55452 case X86ISD::PSADBW:
55453 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55454 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55455 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55456 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55457 NumOps * SrcVT.getVectorNumElements());
55458 return DAG.getNode(Op0.getOpcode(), DL, VT,
55459 ConcatSubOperand(SrcVT, Ops, 0),
55460 ConcatSubOperand(SrcVT, Ops, 1));
55461 }
55462 break;
55463 case X86ISD::VPERMV:
55464 if (!IsSplat && NumOps == 2 &&
55465 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
55466 MVT OpVT = Op0.getSimpleValueType();
55467 int NumSrcElts = OpVT.getVectorNumElements();
55468 SmallVector<int, 64> ConcatMask;
55469 for (unsigned i = 0; i != NumOps; ++i) {
55470 SmallVector<int, 64> SubMask;
55471 SmallVector<SDValue, 2> SubOps;
55472 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55473 break;
55474 for (int M : SubMask) {
55475 if (0 <= M)
55476 M += i * NumSrcElts;
55477 ConcatMask.push_back(M);
55478 }
55479 }
55480 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55481 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
55482 Ops[1].getOperand(1), DAG, DL);
55483 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55484 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55485 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55486 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
55487 }
55488 }
55489 break;
55490 case X86ISD::VPERMV3:
55491 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55492 MVT OpVT = Op0.getSimpleValueType();
55493 int NumSrcElts = OpVT.getVectorNumElements();
55494 SmallVector<int, 64> ConcatMask;
55495 for (unsigned i = 0; i != NumOps; ++i) {
55496 SmallVector<int, 64> SubMask;
55497 SmallVector<SDValue, 2> SubOps;
55498 if (!getTargetShuffleMask(Ops[i], false, SubOps, SubMask))
55499 break;
55500 for (int M : SubMask) {
55501 if (0 <= M) {
55502 M += M < NumSrcElts ? 0 : NumSrcElts;
55503 M += i * NumSrcElts;
55504 }
55505 ConcatMask.push_back(M);
55506 }
55507 }
55508 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55509 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55510 Ops[1].getOperand(0), DAG, DL);
55511 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55512 Ops[1].getOperand(2), DAG, DL);
55513 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
55514 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55515 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55516 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55517 }
55518 }
55519 break;
55520 case X86ISD::VPERM2X128: {
55521 if (!IsSplat && VT.is512BitVector() && Subtarget.useAVX512Regs()) {
55522 assert(NumOps == 2 && "Bad concat_vectors operands");
55523 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55524 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55525 // TODO: Handle zero'd subvectors.
55526 if ((Imm0 & 0x88) == 0 && (Imm1 & 0x88) == 0) {
55527 int Mask[4] = {(int)(Imm0 & 0x03), (int)((Imm0 >> 4) & 0x3),
55528 (int)(Imm1 & 0x03), (int)((Imm1 >> 4) & 0x3)};
55529 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55530 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55531 Ops[0].getOperand(1), DAG, DL);
55532 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55533 Ops[1].getOperand(1), DAG, DL);
55534 SDValue Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
55535 DAG.getBitcast(ShuffleVT, LHS),
55536 DAG.getBitcast(ShuffleVT, RHS),
55537 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
55538 return DAG.getBitcast(VT, Res);
55539 }
55540 }
55541 break;
55542 }
55543 case X86ISD::SHUF128: {
55544 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55545 unsigned Imm0 = Ops[0].getConstantOperandVal(2);
55546 unsigned Imm1 = Ops[1].getConstantOperandVal(2);
55547 unsigned Imm = ((Imm0 & 1) << 0) | ((Imm0 & 2) << 1) | 0x08 |
55548 ((Imm1 & 1) << 4) | ((Imm1 & 2) << 5) | 0x80;
55549 SDValue LHS = concatSubVectors(Ops[0].getOperand(0),
55550 Ops[0].getOperand(1), DAG, DL);
55551 SDValue RHS = concatSubVectors(Ops[1].getOperand(0),
55552 Ops[1].getOperand(1), DAG, DL);
55553 return DAG.getNode(X86ISD::SHUF128, DL, VT, LHS, RHS,
55554 DAG.getTargetConstant(Imm, DL, MVT::i8));
55555 }
55556 break;
55557 }
55558 case ISD::TRUNCATE:
55559 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
55560 EVT SrcVT = Ops[0].getOperand(0).getValueType();
55561 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
55562 SrcVT == Ops[1].getOperand(0).getValueType() &&
55563 Subtarget.useAVX512Regs() &&
55564 Subtarget.getPreferVectorWidth() >= 512 &&
55565 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
55566 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(Ctx);
55567 return DAG.getNode(ISD::TRUNCATE, DL, VT,
55568 ConcatSubOperand(NewSrcVT, Ops, 0));
55569 }
55570 }
55571 break;
55572 case X86ISD::VSHLI:
55573 case X86ISD::VSRLI:
55574 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55575 // TODO: Move this to LowerShiftByScalarImmediate?
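// e.g. a v4i64 shift by 32 either moves each low 32-bit half up into the high
// half (SHL) or the high half down into the low half (SRL), zeroing the other
// half; the v8i32 shuffle masks below express exactly that.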
55576 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55577 llvm::all_of(Ops, [](SDValue Op) {
55578 return Op.getConstantOperandAPInt(1) == 32;
55579 })) {
55580 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55581 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55582 if (Op0.getOpcode() == X86ISD::VSHLI) {
55583 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55584 {8, 0, 8, 2, 8, 4, 8, 6});
55585 } else {
55586 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55587 {1, 8, 3, 8, 5, 8, 7, 8});
55588 }
55589 return DAG.getBitcast(VT, Res);
55590 }
55591 [[fallthrough]];
55592 case X86ISD::VSRAI:
55593 case X86ISD::VSHL:
55594 case X86ISD::VSRL:
55595 case X86ISD::VSRA:
55596 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55597 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55598 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55599 llvm::all_of(Ops, [Op0](SDValue Op) {
55600 return Op0.getOperand(1) == Op.getOperand(1);
55601 })) {
55602 return DAG.getNode(Op0.getOpcode(), DL, VT,
55603 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55604 }
55605 break;
55606 case X86ISD::VPERMI:
55607 case X86ISD::VROTLI:
55608 case X86ISD::VROTRI:
55609 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55610 llvm::all_of(Ops, [Op0](SDValue Op) {
55611 return Op0.getOperand(1) == Op.getOperand(1);
55612 })) {
55613 return DAG.getNode(Op0.getOpcode(), DL, VT,
55614 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55615 }
55616 break;
55617 case ISD::AND:
55618 case ISD::OR:
55619 case ISD::XOR:
55620 case X86ISD::ANDNP:
55621 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55622 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55623 return DAG.getNode(Op0.getOpcode(), DL, VT,
55624 ConcatSubOperand(VT, Ops, 0),
55625 ConcatSubOperand(VT, Ops, 1));
55626 }
55627 break;
55628 case X86ISD::PCMPEQ:
55629 case X86ISD::PCMPGT:
55630 if (!IsSplat && VT.is256BitVector() && Subtarget.hasInt256() &&
55631 (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
55632 return DAG.getNode(Op0.getOpcode(), DL, VT,
55633 ConcatSubOperand(VT, Ops, 0),
55634 ConcatSubOperand(VT, Ops, 1));
55635 }
55636 break;
55637 case ISD::CTPOP:
55638 case ISD::CTTZ:
55639 case ISD::CTLZ:
55640 case ISD::CTTZ_ZERO_UNDEF:
55641 case ISD::CTLZ_ZERO_UNDEF:
55642 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55643 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55644 return DAG.getNode(Op0.getOpcode(), DL, VT,
55645 ConcatSubOperand(VT, Ops, 0));
55646 }
55647 break;
55648 case X86ISD::GF2P8AFFINEQB:
55649 if (!IsSplat &&
55650 (VT.is256BitVector() ||
55651 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55652 llvm::all_of(Ops, [Op0](SDValue Op) {
55653 return Op0.getOperand(2) == Op.getOperand(2);
55654 })) {
55655 return DAG.getNode(Op0.getOpcode(), DL, VT,
55656 ConcatSubOperand(VT, Ops, 0),
55657 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55658 }
55659 break;
55660 case ISD::ADD:
55661 case ISD::SUB:
55662 case ISD::MUL:
55663 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55664 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55665 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
55666 return DAG.getNode(Op0.getOpcode(), DL, VT,
55667 ConcatSubOperand(VT, Ops, 0),
55668 ConcatSubOperand(VT, Ops, 1));
55669 }
55670 break;
55671 // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and
55672 // their latencies are short, only replace them when doing so does not
55673 // introduce extra VINSERTs.
55674 case ISD::FADD:
55675 case ISD::FSUB:
55676 case ISD::FMUL:
55677 if (!IsSplat && (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1)) &&
55678 (VT.is256BitVector() ||
55679 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55680 return DAG.getNode(Op0.getOpcode(), DL, VT,
55681 ConcatSubOperand(VT, Ops, 0),
55682 ConcatSubOperand(VT, Ops, 1));
55683 }
55684 break;
55685 case ISD::FDIV:
55686 if (!IsSplat && (VT.is256BitVector() ||
55687 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
55688 return DAG.getNode(Op0.getOpcode(), DL, VT,
55689 ConcatSubOperand(VT, Ops, 0),
55690 ConcatSubOperand(VT, Ops, 1));
55691 }
55692 break;
55693 case X86ISD::HADD:
55694 case X86ISD::HSUB:
55695 case X86ISD::FHADD:
55696 case X86ISD::FHSUB:
55697 if (!IsSplat && VT.is256BitVector() &&
55698 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55699 return DAG.getNode(Op0.getOpcode(), DL, VT,
55700 ConcatSubOperand(VT, Ops, 0),
55701 ConcatSubOperand(VT, Ops, 1));
55702 }
55703 break;
55704 case X86ISD::PACKSS:
55705 case X86ISD::PACKUS:
55706 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55707 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55708 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55709 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55710 NumOps * SrcVT.getVectorNumElements());
55711 return DAG.getNode(Op0.getOpcode(), DL, VT,
55712 ConcatSubOperand(SrcVT, Ops, 0),
55713 ConcatSubOperand(SrcVT, Ops, 1));
55714 }
55715 break;
55716 case X86ISD::PALIGNR:
55717 if (!IsSplat &&
55718 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55719 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55720 llvm::all_of(Ops, [Op0](SDValue Op) {
55721 return Op0.getOperand(2) == Op.getOperand(2);
55722 })) {
55723 return DAG.getNode(Op0.getOpcode(), DL, VT,
55724 ConcatSubOperand(VT, Ops, 0),
55725 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55726 }
55727 break;
55728 case X86ISD::BLENDI:
55729 if (NumOps == 2 && VT.is512BitVector() && Subtarget.useBWIRegs()) {
55730 uint64_t Mask0 = Ops[0].getConstantOperandVal(2);
55731 uint64_t Mask1 = Ops[1].getConstantOperandVal(2);
55732 // MVT::v16i16 has repeated blend mask.
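// The BLENDI immediate is only 8 bits wide, so for v16i16 the same byte
// controls both 128-bit lanes; widen it here so the i1 select mask built
// below covers all of the elements.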
55733 if (Op0.getSimpleValueType() == MVT::v16i16) {
55734 Mask0 = (Mask0 << 8) | Mask0;
55735 Mask1 = (Mask1 << 8) | Mask1;
55736 }
55737 uint64_t Mask = (Mask1 << (VT.getVectorNumElements() / 2)) | Mask0;
55738 MVT MaskSVT = MVT::getIntegerVT(VT.getVectorNumElements());
55739 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
55740 SDValue Sel =
55741 DAG.getBitcast(MaskVT, DAG.getConstant(Mask, DL, MaskSVT));
55742 return DAG.getSelect(DL, VT, Sel, ConcatSubOperand(VT, Ops, 1),
55743 ConcatSubOperand(VT, Ops, 0));
55744 }
55745 break;
55746 case ISD::VSELECT:
55747 if (!IsSplat && Subtarget.hasAVX512() &&
55748 (VT.is256BitVector() ||
55749 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55750 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
55751 EVT SelVT = Ops[0].getOperand(0).getValueType();
55752 if (SelVT.getVectorElementType() == MVT::i1) {
55753 SelVT = EVT::getVectorVT(Ctx, MVT::i1,
55754 NumOps * SelVT.getVectorNumElements());
55755 if (TLI.isTypeLegal(SelVT))
55756 return DAG.getNode(Op0.getOpcode(), DL, VT,
55757 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55758 ConcatSubOperand(VT, Ops, 1),
55759 ConcatSubOperand(VT, Ops, 2));
55760 }
55761 }
55762 [[fallthrough]];
55763 case X86ISD::BLENDV:
55764 if (!IsSplat && VT.is256BitVector() && NumOps == 2 &&
55765 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
55766 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55767 EVT SelVT = Ops[0].getOperand(0).getValueType();
55768 SelVT = SelVT.getDoubleNumVectorElementsVT(Ctx);
55769 if (TLI.isTypeLegal(SelVT))
55770 return DAG.getNode(Op0.getOpcode(), DL, VT,
55771 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55772 ConcatSubOperand(VT, Ops, 1),
55773 ConcatSubOperand(VT, Ops, 2));
55774 }
55775 break;
55776 }
55777 }
55778
55779 // Fold subvector loads into one.
55780 // If needed, look through bitcasts to get to the load.
55781 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55782 unsigned Fast;
55783 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55784 if (TLI->allowsMemoryAccess(Ctx, DAG.getDataLayout(), VT,
55785 *FirstLd->getMemOperand(), &Fast) &&
55786 Fast) {
55787 if (SDValue Ld =
55788 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55789 return Ld;
55790 }
55791 }
55792
55793 // Attempt to fold target constant loads.
55794 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55795 SmallVector<APInt> EltBits;
55796 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
55797 for (unsigned I = 0; I != NumOps; ++I) {
55798 APInt OpUndefElts;
55799 SmallVector<APInt> OpEltBits;
55800 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55801 OpEltBits, /*AllowWholeUndefs*/ true,
55802 /*AllowPartialUndefs*/ false))
55803 break;
55804 EltBits.append(OpEltBits);
55805 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55806 }
55807 if (EltBits.size() == VT.getVectorNumElements()) {
55808 Constant *C = getConstantVector(VT, EltBits, UndefElts, Ctx);
55809 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
55810 SDValue CV = DAG.getConstantPool(C, PVT);
55811 MachinePointerInfo MPI =
55812 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
55813 SDValue Ld = DAG.getLoad(VT, DL, DAG.getEntryNode(), CV, MPI);
55814 SDValue Sub = extractSubVector(Ld, 0, DAG, DL, Op0.getValueSizeInBits());
55815 DAG.ReplaceAllUsesOfValueWith(Op0, Sub);
55816 return Ld;
55817 }
55818 }
55819
55820 // If this simple subvector or scalar/subvector broadcast_load is inserted
55821 // into both halves, use a larger broadcast_load. Update other uses to use
55822 // an extracted subvector.
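// e.g. splatting a 128-bit load into both halves of a 256-bit concat becomes
// a single vbroadcastf128/vbroadcasti128 style subvector broadcast load.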
55823 if (IsSplat &&
55824 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55825 if (ISD::isNormalLoad(Op0.getNode()) ||
55826 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55827 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55828 auto *Mem = cast<MemSDNode>(Op0);
55829 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55830 ? X86ISD::VBROADCAST_LOAD
55831 : X86ISD::SUBV_BROADCAST_LOAD;
55832 if (SDValue BcastLd =
55833 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55834 SDValue BcastSrc =
55835 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55836 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55837 return BcastLd;
55838 }
55839 }
55840 }
55841
55842 // If we're splatting a 128-bit subvector to 512-bits, use SHUF128 directly.
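// Widening Op0 places it in 128-bit lane 0; a SHUF128 with an all-zero
// immediate then replicates that lane into all four lanes of the result.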
55843 if (IsSplat && NumOps == 4 && VT.is512BitVector() &&
55844 Subtarget.useAVX512Regs()) {
55845 MVT ShuffleVT = VT.isFloatingPoint() ? MVT::v8f64 : MVT::v8i64;
55846 SDValue Res = widenSubVector(Op0, false, Subtarget, DAG, DL, 512);
55847 Res = DAG.getBitcast(ShuffleVT, Res);
55848 Res = DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, Res, Res,
55849 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55850 return DAG.getBitcast(VT, Res);
55851 }
55852
55853 return SDValue();
55854}
55855
55856 static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
55857 TargetLowering::DAGCombinerInfo &DCI,
55858 const X86Subtarget &Subtarget) {
55859 EVT VT = N->getValueType(0);
55860 EVT SrcVT = N->getOperand(0).getValueType();
55861 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55862 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
55863
55864 if (VT.getVectorElementType() == MVT::i1) {
55865 // Attempt to constant fold.
55866 unsigned SubSizeInBits = SrcVT.getSizeInBits();
55867 APInt Constant = APInt::getZero(VT.getSizeInBits());
55868 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55869 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
55870 if (!C) break;
55871 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
55872 if (I == (E - 1)) {
55873 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
55874 if (TLI.isTypeLegal(IntVT))
55875 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
55876 }
55877 }
55878
55879 // Don't do anything else for i1 vectors.
55880 return SDValue();
55881 }
55882
55883 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
55884 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
55885 DCI, Subtarget))
55886 return R;
55887 }
55888
55889 return SDValue();
55890}
55891
55892 static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55893 TargetLowering::DAGCombinerInfo &DCI,
55894 const X86Subtarget &Subtarget) {
55895 if (DCI.isBeforeLegalizeOps())
55896 return SDValue();
55897
55898 MVT OpVT = N->getSimpleValueType(0);
55899
55900 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
55901
55902 SDLoc dl(N);
55903 SDValue Vec = N->getOperand(0);
55904 SDValue SubVec = N->getOperand(1);
55905
55906 uint64_t IdxVal = N->getConstantOperandVal(2);
55907 MVT SubVecVT = SubVec.getSimpleValueType();
55908
55909 if (Vec.isUndef() && SubVec.isUndef())
55910 return DAG.getUNDEF(OpVT);
55911
55912 // Inserting undefs/zeros into zeros/undefs is a zero vector.
55913 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
55914 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
55915 return getZeroVector(OpVT, Subtarget, DAG, dl);
55916
55917 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
55918 // If we're inserting into a zero vector and then into a larger zero vector,
55919 // just insert into the larger zero vector directly.
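// e.g. insert_subvector(zero, insert_subvector(zero, X, C2), C1)
// --> insert_subvector(zero, X, C1 + C2)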
55920 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55921 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
55922 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
55923 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55924 getZeroVector(OpVT, Subtarget, DAG, dl),
55925 SubVec.getOperand(1),
55926 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
55927 }
55928
55929 // If we're inserting into a zero vector, and our input was extracted from an
55930 // insert into a zero vector of the same type, and the extraction was at least
55931 // as large as the original insertion, just insert the original subvector into
55932 // a zero vector.
55933 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
55934 isNullConstant(SubVec.getOperand(1)) &&
55935 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55936 SDValue Ins = SubVec.getOperand(0);
55937 if (isNullConstant(Ins.getOperand(2)) &&
55938 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55939 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55940 SubVecVT.getFixedSizeInBits())
55941 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55942 getZeroVector(OpVT, Subtarget, DAG, dl),
55943 Ins.getOperand(1), N->getOperand(2));
55944 }
55945 }
55946
55947 // Stop here if this is an i1 vector.
55948 if (IsI1Vector)
55949 return SDValue();
55950
55951 // Eliminate an intermediate vector widening:
55952 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
55953 // insert_subvector X, Y, Idx
55954 // TODO: This is a more general version of a DAGCombiner fold, can we move it
55955 // there?
55956 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55957 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
55958 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
55959 SubVec.getOperand(1), N->getOperand(2));
55960
55961 // If this is an insert of an extract, combine to a shuffle. Don't do this
55962 // if the insert or extract can be represented with a subregister operation.
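// The shuffle mask built below starts as the identity over Vec and then
// overlays the extracted elements (taken from offset ExtIdxVal in the source)
// at position IdxVal.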
55963 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55964 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55965 (IdxVal != 0 ||
55966 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55967 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55968 if (ExtIdxVal != 0) {
55969 int VecNumElts = OpVT.getVectorNumElements();
55970 int SubVecNumElts = SubVecVT.getVectorNumElements();
55971 SmallVector<int, 64> Mask(VecNumElts);
55972 // First create an identity shuffle mask.
55973 for (int i = 0; i != VecNumElts; ++i)
55974 Mask[i] = i;
55975 // Now insert the extracted portion.
55976 for (int i = 0; i != SubVecNumElts; ++i)
55977 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55978
55979 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55980 }
55981 }
55982
55983 // Match concat_vector style patterns.
55984 SmallVector<SDValue, 2> SubVectorOps;
55985 if (collectConcatOps(N, SubVectorOps, DAG)) {
55986 if (SDValue Fold =
55987 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55988 return Fold;
55989
55990 // If we're inserting all zeros into the upper half, change this to
55991 // a concat with zero. We will match this to a move
55992 // with implicit upper bit zeroing during isel.
55993 // We do this here because we don't want combineConcatVectorOps to
55994 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55995 if (SubVectorOps.size() == 2 &&
55996 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55997 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55998 getZeroVector(OpVT, Subtarget, DAG, dl),
55999 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56000
56001 // Attempt to recursively combine to a shuffle.
56002 if (all_of(SubVectorOps, [](SDValue SubOp) {
56003 return isTargetShuffle(SubOp.getOpcode());
56004 })) {
56005 SDValue Op(N, 0);
56006 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56007 return Res;
56008 }
56009 }
56010
56011 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56012 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56013 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56014
56015 // If this is a broadcast load inserted into an upper undef, use a larger
56016 // broadcast load.
56017 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56018 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56019 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56020 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56021 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56022 SDValue BcastLd =
56024 MemIntr->getMemoryVT(),
56025 MemIntr->getMemOperand());
56026 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56027 return BcastLd;
56028 }
56029
56030 // If we're splatting the lower half subvector of a full vector load into the
56031 // upper half, attempt to create a subvector broadcast.
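// e.g. if Vec is a 256-bit load and SubVec reloads the low 128 bits from the
// same address, the whole insert is equivalent to a 128-bit subvector
// broadcast load.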
56032 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56033 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56034 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56035 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56036 if (VecLd && SubLd &&
56037 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56038 SubVec.getValueSizeInBits() / 8, 0))
56039 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56040 SubLd, 0, DAG);
56041 }
56042
56043 return SDValue();
56044}
56045
56046/// If we are extracting a subvector of a vector select and the select condition
56047/// is composed of concatenated vectors, try to narrow the select width. This
56048/// is a common pattern for AVX1 integer code because 256-bit selects may be
56049/// legal, but there is almost no integer math/logic available for 256-bit.
56050/// This function should only be called with legal types (otherwise, the calls
56051/// to get simple value types will assert).
56052 static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL,
56053 SelectionDAG &DAG) {
56054 SDValue Sel = Ext->getOperand(0);
56055 if (Sel.getOpcode() != ISD::VSELECT ||
56056 !isFreeToSplitVector(Sel.getOperand(0).getNode(), DAG))
56057 return SDValue();
56058
56059 // Note: We assume simple value types because this should only be called with
56060 // legal operations/types.
56061 // TODO: This can be extended to handle extraction to 256-bits.
56062 MVT VT = Ext->getSimpleValueType(0);
56063 if (!VT.is128BitVector())
56064 return SDValue();
56065
56066 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56067 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56068 return SDValue();
56069
56070 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56071 MVT SelVT = Sel.getSimpleValueType();
56072 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56073 "Unexpected vector type with legal operations");
56074
56075 unsigned SelElts = SelVT.getVectorNumElements();
56076 unsigned CastedElts = WideVT.getVectorNumElements();
56077 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56078 if (SelElts % CastedElts == 0) {
56079 // The select has the same or more (narrower) elements than the extract
56080 // operand. The extraction index gets scaled by that factor.
56081 ExtIdx *= (SelElts / CastedElts);
56082 } else if (CastedElts % SelElts == 0) {
56083 // The select has fewer (wider) elements than the extract operand. Make sure
56084 // that the extraction index can be divided evenly.
56085 unsigned IndexDivisor = CastedElts / SelElts;
56086 if (ExtIdx % IndexDivisor != 0)
56087 return SDValue();
56088 ExtIdx /= IndexDivisor;
56089 } else {
56090 llvm_unreachable("Element count of simple vector types are not divisible?");
56091 }
56092
56093 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56094 unsigned NarrowElts = SelElts / NarrowingFactor;
56095 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56096 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56097 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56098 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56099 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56100 return DAG.getBitcast(VT, NarrowSel);
56101}
56102
56103 static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56104 TargetLowering::DAGCombinerInfo &DCI,
56105 const X86Subtarget &Subtarget) {
56106 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56107 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56108 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56109 // We let generic combining take over from there to simplify the
56110 // insert/extract and 'not'.
56111 // This pattern emerges during AVX1 legalization. We handle it before lowering
56112 // to avoid complications like splitting constant vector loads.
56113
56114 // Capture the original wide type in the likely case that we need to bitcast
56115 // back to this type.
56116 if (!N->getValueType(0).isSimple())
56117 return SDValue();
56118
56119 MVT VT = N->getSimpleValueType(0);
56120 SDValue InVec = N->getOperand(0);
56121 unsigned IdxVal = N->getConstantOperandVal(1);
56122 SDValue InVecBC = peekThroughBitcasts(InVec);
56123 EVT InVecVT = InVec.getValueType();
56124 unsigned SizeInBits = VT.getSizeInBits();
56125 unsigned InSizeInBits = InVecVT.getSizeInBits();
56126 unsigned NumSubElts = VT.getVectorNumElements();
56127 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56128 SDLoc DL(N);
56129
56130 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56131 TLI.isTypeLegal(InVecVT) &&
56132 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56133 auto isConcatenatedNot = [](SDValue V) {
56134 V = peekThroughBitcasts(V);
56135 if (!isBitwiseNot(V))
56136 return false;
56137 SDValue NotOp = V->getOperand(0);
56138 return NotOp.getOpcode() == ISD::CONCAT_VECTORS;
56139 };
56140 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56141 isConcatenatedNot(InVecBC.getOperand(1))) {
56142 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56143 SDValue Concat = splitVectorIntBinary(InVecBC, DAG, SDLoc(InVecBC));
56144 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56145 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56146 }
56147 }
56148
56149 if (DCI.isBeforeLegalizeOps())
56150 return SDValue();
56151
56152 if (SDValue V = narrowExtractedVectorSelect(N, DL, DAG))
56153 return V;
56154
56155 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56156 return getZeroVector(VT, Subtarget, DAG, DL);
56157
56158 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56159 if (VT.getScalarType() == MVT::i1)
56160 return DAG.getConstant(1, DL, VT);
56161 return getOnesVector(VT, DAG, DL);
56162 }
56163
56164 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56165 return DAG.getBuildVector(VT, DL, InVec->ops().slice(IdxVal, NumSubElts));
56166
56167 // If we are extracting from an insert into a larger vector, replace with a
56168 // smaller insert if we don't access less than the original subvector. Don't
56169 // do this for i1 vectors.
56170 // TODO: Relax the matching indices requirement?
56171 if (VT.getVectorElementType() != MVT::i1 &&
56172 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56173 IdxVal == InVec.getConstantOperandVal(2) &&
56174 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56175 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56176 InVec.getOperand(0), N->getOperand(1));
56177 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56178 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56179 InVec.getOperand(1),
56180 DAG.getVectorIdxConstant(NewIdxVal, DL));
56181 }
56182
56183 // If we're extracting an upper subvector from a broadcast we should just
56184 // extract the lowest subvector instead which should allow
56185 // SimplifyDemandedVectorElts to do more simplifications.
56186 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56187 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56188 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56189 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56190
56191 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56192 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56193 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56194 return extractSubVector(InVec, 0, DAG, DL, SizeInBits);
56195
56196 // Attempt to extract from the source of a shuffle vector.
56197 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56198 SmallVector<int, 32> ShuffleMask;
56199 SmallVector<int, 32> ScaledMask;
56200 SmallVector<SDValue, 2> ShuffleInputs;
56201 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56202 // Decode the shuffle mask and scale it so its shuffling subvectors.
56203 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56204 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56205 unsigned SubVecIdx = IdxVal / NumSubElts;
56206 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56207 return DAG.getUNDEF(VT);
56208 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56209 return getZeroVector(VT, Subtarget, DAG, DL);
56210 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56211 if (Src.getValueSizeInBits() == InSizeInBits) {
56212 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56213 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56214 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56215 DL, SizeInBits);
56216 }
56217 }
56218 }
56219
56220 auto IsExtractFree = [](SDValue V) {
56221 V = peekThroughBitcasts(V);
56222 if (ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
56223 return true;
56224 if (ISD::isBuildVectorOfConstantFPSDNodes(V.getNode()))
56225 return true;
56226 return V.isUndef();
56227 };
56228
56229 // If we're extracting the lowest subvector and we're the only user,
56230 // we may be able to perform this with a smaller vector width.
56231 unsigned InOpcode = InVec.getOpcode();
56232 if (InVec.hasOneUse()) {
56233 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56234 // v2f64 CVTDQ2PD(v4i32).
56235 if (InOpcode == ISD::SINT_TO_FP &&
56236 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56237 return DAG.getNode(X86ISD::CVTSI2P, DL, VT, InVec.getOperand(0));
56238 }
56239 // v2f64 CVTUDQ2PD(v4i32).
56240 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56241 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56242 return DAG.getNode(X86ISD::CVTUI2P, DL, VT, InVec.getOperand(0));
56243 }
56244 // v2f64 CVTPS2PD(v4f32).
56245 if (InOpcode == ISD::FP_EXTEND &&
56246 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56247 return DAG.getNode(X86ISD::VFPEXT, DL, VT, InVec.getOperand(0));
56248 }
56249 }
56250 // v4i32 CVTPS2DQ(v4f32).
56251 if (InOpcode == ISD::FP_TO_SINT && VT == MVT::v4i32) {
56252 SDValue Src = InVec.getOperand(0);
56253 if (Src.getValueType().getScalarType() == MVT::f32)
56254 return DAG.getNode(InOpcode, DL, VT,
56255 extractSubVector(Src, IdxVal, DAG, DL, SizeInBits));
56256 }
56257 if (IdxVal == 0 &&
56258 (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) &&
56259 (SizeInBits == 128 || SizeInBits == 256) &&
56260 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56261 SDValue Ext = InVec.getOperand(0);
56262 if (Ext.getValueSizeInBits() > SizeInBits)
56263 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56264 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56265 return DAG.getNode(ExtOp, DL, VT, Ext);
56266 }
56267 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56268 InVec.getOperand(0).getValueType().is256BitVector() &&
56269 InVec.getOperand(1).getValueType().is256BitVector() &&
56270 InVec.getOperand(2).getValueType().is256BitVector()) {
56271 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56272 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56273 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56274 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56275 }
56276 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56277 (SizeInBits == 128 || SizeInBits == 256)) {
56278 SDValue InVecSrc = InVec.getOperand(0);
56279 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56280 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56281 return DAG.getNode(InOpcode, DL, VT, Ext);
56282 }
56283 if ((InOpcode == X86ISD::CMPP || InOpcode == X86ISD::PCMPEQ ||
56284 InOpcode == X86ISD::PCMPGT) &&
56285 (IsExtractFree(InVec.getOperand(0)) ||
56286 IsExtractFree(InVec.getOperand(1))) &&
56287 SizeInBits == 128) {
56288 SDValue Ext0 =
56289 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56290 SDValue Ext1 =
56291 extractSubVector(InVec.getOperand(1), IdxVal, DAG, DL, SizeInBits);
56292 if (InOpcode == X86ISD::CMPP)
56293 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, InVec.getOperand(2));
56294 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1);
56295 }
56296 if (InOpcode == X86ISD::MOVDDUP &&
56297 (SizeInBits == 128 || SizeInBits == 256)) {
56298 SDValue Ext0 =
56299 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56300 return DAG.getNode(InOpcode, DL, VT, Ext0);
56301 }
56302 }
56303
56304 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
56305 // as this is very likely to fold into a shuffle/truncation.
56306 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56307 InVecVT.getScalarSizeInBits() == 64 &&
56308 InVec.getConstantOperandAPInt(1) == 32) {
56309 SDValue Ext =
56310 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56311 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56312 }
56313
56314 return SDValue();
56315}
56316
56318 EVT VT = N->getValueType(0);
56319 SDValue Src = N->getOperand(0);
56320 SDLoc DL(N);
56321
56322 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56323 // This occurs frequently in our masked scalar intrinsic code and our
56324 // floating point select lowering with AVX512.
56325 // TODO: SimplifyDemandedBits instead?
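// Only the low bit of the scalar survives in a v1i1 element, so masking with
// 1 first adds nothing and the unmasked value can be used directly.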
56326 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56327 isOneConstant(Src.getOperand(1)))
56328 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56329
56330 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56331 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56332 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56333 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
56334 isNullConstant(Src.getOperand(1)))
56335 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56336 Src.getOperand(1));
56337
56338 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56339 // TODO: Move to DAGCombine/SimplifyDemandedBits?
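// IsExt64 matches an i64 whose upper 32 bits are zero (zero-extend case) or
// simply unused (any-extend case), so the build can go through a v4i32
// scalar_to_vector, adding VZEXT_MOVL when the zero upper bits must be kept.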
56340 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
56341 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56342 if (Op.getValueType() != MVT::i64)
56343 return SDValue();
56344 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
56345 if (Op.getOpcode() == Opc &&
56346 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
56347 return Op.getOperand(0);
56348 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
56349 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
56350 if (Ld->getExtensionType() == Ext &&
56351 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
56352 return Op;
56353 if (IsZeroExt) {
56354 KnownBits Known = DAG.computeKnownBits(Op);
56355 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
56356 return Op;
56357 }
56358 return SDValue();
56359 };
56360
56361 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
56362 return DAG.getBitcast(
56363 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56364 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
56365
56366 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
56367 return DAG.getBitcast(
56368 VT,
56369 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
56370 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
56371 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
56372 }
56373
56374 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
56375 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
56376 Src.getOperand(0).getValueType() == MVT::x86mmx)
56377 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
56378
56379 // See if we're broadcasting the scalar value, in which case just reuse that.
56380 // Ensure the same SDValue from the SDNode use is being used.
56381 if (VT.getScalarType() == Src.getValueType())
56382 for (SDNode *User : Src->uses())
56383 if (User->getOpcode() == X86ISD::VBROADCAST &&
56384 Src == User->getOperand(0)) {
56385 unsigned SizeInBits = VT.getFixedSizeInBits();
56386 unsigned BroadcastSizeInBits =
56387 User->getValueSizeInBits(0).getFixedValue();
56388 if (BroadcastSizeInBits == SizeInBits)
56389 return SDValue(User, 0);
56390 if (BroadcastSizeInBits > SizeInBits)
56391 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
56392 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
56393 // coverage.
56394 }
56395
56396 return SDValue();
56397}
56398
56399// Simplify PMULDQ and PMULUDQ operations.
56400 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
56401 TargetLowering::DAGCombinerInfo &DCI,
56402 const X86Subtarget &Subtarget) {
56403 SDValue LHS = N->getOperand(0);
56404 SDValue RHS = N->getOperand(1);
56405
56406 // Canonicalize constant to RHS.
56407 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
56408 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
56409 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
56410
56411 // Multiply by zero.
56412 // Don't return RHS as it may contain UNDEFs.
56413 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
56414 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
56415
56416 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
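// Demanding all 64 result bits still lets SimplifyDemandedBits discover that
// only the low 32 bits of each input element matter, which can strip redundant
// sign/zero extensions feeding the multiply.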
56417 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56418 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
56419 return SDValue(N, 0);
56420
56421 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
56422 // convert it to any_extend_invec, due to the LegalOperations check, do the
56423 // conversion directly to a vector shuffle manually. This exposes combine
56424 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
56425 // combineX86ShufflesRecursively on SSE4.1 targets.
56426 // FIXME: This is basically a hack around several other issues related to
56427 // ANY_EXTEND_VECTOR_INREG.
56428 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
56429 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56430 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56431 LHS.getOperand(0).getValueType() == MVT::v4i32) {
56432 SDLoc dl(N);
56433 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
56434 LHS.getOperand(0), { 0, -1, 1, -1 });
56435 LHS = DAG.getBitcast(MVT::v2i64, LHS);
56436 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56437 }
56438 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
56439 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
56440 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56441 RHS.getOperand(0).getValueType() == MVT::v4i32) {
56442 SDLoc dl(N);
56443 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
56444 RHS.getOperand(0), { 0, -1, 1, -1 });
56445 RHS = DAG.getBitcast(MVT::v2i64, RHS);
56446 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
56447 }
56448
56449 return SDValue();
56450}
56451
56452// Simplify VPMADDUBSW/VPMADDWD operations.
56453 static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
56454 TargetLowering::DAGCombinerInfo &DCI) {
56455 EVT VT = N->getValueType(0);
56456 SDValue LHS = N->getOperand(0);
56457 SDValue RHS = N->getOperand(1);
56458
56459 // Multiply by zero.
56460 // Don't return LHS/RHS as it may contain UNDEFs.
56461 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56462 ISD::isBuildVectorAllZeros(RHS.getNode()))
56463 return DAG.getConstant(0, SDLoc(N), VT);
56464
56465 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56466 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56467 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56468 return SDValue(N, 0);
56469
56470 return SDValue();
56471}
56472
56473 static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56474 TargetLowering::DAGCombinerInfo &DCI,
56475 const X86Subtarget &Subtarget) {
56476 EVT VT = N->getValueType(0);
56477 SDValue In = N->getOperand(0);
56478 unsigned Opcode = N->getOpcode();
56479 unsigned InOpcode = In.getOpcode();
56480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56481 SDLoc DL(N);
56482
56483 // Try to merge vector loads and extend_inreg to an extload.
56484 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56485 In.hasOneUse()) {
56486 auto *Ld = cast<LoadSDNode>(In);
56487 if (Ld->isSimple()) {
56488 MVT SVT = In.getSimpleValueType().getVectorElementType();
56489 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56490 ? ISD::SEXTLOAD
56491 : ISD::ZEXTLOAD;
56492 EVT MemVT = VT.changeVectorElementType(SVT);
56493 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56494 SDValue Load = DAG.getExtLoad(
56495 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56496 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56497 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56498 return Load;
56499 }
56500 }
56501 }
56502
56503 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56504 if (Opcode == InOpcode)
56505 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56506
56507 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56508 // -> EXTEND_VECTOR_INREG(X).
56509 // TODO: Handle non-zero subvector indices.
56510 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56511 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56512 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56513 In.getValueSizeInBits())
56514 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56515
56516 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56517 // TODO: Move to DAGCombine?
56518 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56519 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56520 In.getValueSizeInBits() == VT.getSizeInBits()) {
56521 unsigned NumElts = VT.getVectorNumElements();
56522 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56523 EVT EltVT = In.getOperand(0).getValueType();
56524 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56525 for (unsigned I = 0; I != NumElts; ++I)
56526 Elts[I * Scale] = In.getOperand(I);
56527 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56528 }
56529
56530 // Attempt to combine as a shuffle on SSE41+ targets.
56531 if (Subtarget.hasSSE41()) {
56532 SDValue Op(N, 0);
56533 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56534 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56535 return Res;
56536 }
56537
56538 return SDValue();
56539}
56540
56541 static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56542 TargetLowering::DAGCombinerInfo &DCI) {
56543 EVT VT = N->getValueType(0);
56544
56545 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56546 return DAG.getConstant(0, SDLoc(N), VT);
56547
56548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56549 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56550 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56551 return SDValue(N, 0);
56552
56553 return SDValue();
56554}
56555
56556// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56557// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
56558// extra instructions between the conversions due to going to scalar and back.
56559 static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56560 const X86Subtarget &Subtarget) {
56561 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56562 return SDValue();
56563
56564 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56565 return SDValue();
56566
56567 if (N->getValueType(0) != MVT::f32 ||
56568 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56569 return SDValue();
56570
56571 SDLoc dl(N);
56572 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56573 N->getOperand(0).getOperand(0));
56574 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56575 DAG.getTargetConstant(4, dl, MVT::i32));
56576 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56577 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56578 DAG.getIntPtrConstant(0, dl));
56579}
56580
56581 static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56582 const X86Subtarget &Subtarget) {
56583 EVT VT = N->getValueType(0);
56584 bool IsStrict = N->isStrictFPOpcode();
56585 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56586 EVT SrcVT = Src.getValueType();
56587
56588 SDLoc dl(N);
56589 if (SrcVT.getScalarType() == MVT::bf16) {
56590 if (!IsStrict && Src.getOpcode() == ISD::FP_ROUND &&
56591 Src.getOperand(0).getValueType() == VT)
56592 return Src.getOperand(0);
56593
56594 if (!SrcVT.isVector())
56595 return SDValue();
56596
56597 assert(!IsStrict && "Strict FP doesn't support BF16");
56598 if (VT.getVectorElementType() == MVT::f64) {
56599 MVT TmpVT = VT.getSimpleVT().changeVectorElementType(MVT::f32);
56600 return DAG.getNode(ISD::FP_EXTEND, dl, VT,
56601 DAG.getNode(ISD::FP_EXTEND, dl, TmpVT, Src));
56602 }
56603 assert(VT.getVectorElementType() == MVT::f32 && "Unexpected fpext");
56604 MVT NVT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i32);
56605 Src = DAG.getBitcast(SrcVT.changeTypeToInteger(), Src);
56606 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Src);
56607 Src = DAG.getNode(ISD::SHL, dl, NVT, Src, DAG.getConstant(16, dl, NVT));
56608 return DAG.getBitcast(VT, Src);
56609 }
56610
56611 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56612 return SDValue();
56613
56614 if (Subtarget.hasFP16())
56615 return SDValue();
56616
56617 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56618 return SDValue();
56619
56620 if (VT.getVectorElementType() != MVT::f32 &&
56621 VT.getVectorElementType() != MVT::f64)
56622 return SDValue();
56623
56624 unsigned NumElts = VT.getVectorNumElements();
56625 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56626 return SDValue();
56627
56628 // Convert the input to vXi16.
56629 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56630 Src = DAG.getBitcast(IntVT, Src);
56631
56632 // Widen to at least 8 input elements.
56633 if (NumElts < 8) {
56634 unsigned NumConcats = 8 / NumElts;
56635 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56636 : DAG.getConstant(0, dl, IntVT);
56637 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56638 Ops[0] = Src;
56639 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56640 }
56641
56642 // Destination is vXf32 with at least 4 elements.
56643 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56644 std::max(4U, NumElts));
56645 SDValue Cvt, Chain;
56646 if (IsStrict) {
56647 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56648 {N->getOperand(0), Src});
56649 Chain = Cvt.getValue(1);
56650 } else {
56651 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56652 }
56653
56654 if (NumElts < 4) {
56655 assert(NumElts == 2 && "Unexpected size");
56656 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56657 DAG.getIntPtrConstant(0, dl));
56658 }
56659
56660 if (IsStrict) {
56661 // Extend to the original VT if necessary.
56662 if (Cvt.getValueType() != VT) {
56663 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56664 {Chain, Cvt});
56665 Chain = Cvt.getValue(1);
56666 }
56667 return DAG.getMergeValues({Cvt, Chain}, dl);
56668 }
56669
56670 // Extend to the original VT if necessary.
56671 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56672}
56673
56674// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56675// from. Limit this to cases where the loads have the same input chain and the
56676// output chains are unused. This avoids any memory ordering issues.
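// e.g. a 128-bit broadcast load can reuse the value of a wider broadcast load
// of the same pointer by extracting the low 128 bits of that wider result.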
56677 static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56678 TargetLowering::DAGCombinerInfo &DCI) {
56679 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56680 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56681 "Unknown broadcast load type");
56682
56683 // Only do this if the chain result is unused.
56684 if (N->hasAnyUseOfValue(1))
56685 return SDValue();
56686
56687 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56688
56689 SDValue Ptr = MemIntrin->getBasePtr();
56690 SDValue Chain = MemIntrin->getChain();
56691 EVT VT = N->getSimpleValueType(0);
56692 EVT MemVT = MemIntrin->getMemoryVT();
56693
56694 // Look at other users of our base pointer and try to find a wider broadcast.
56695 // The input chain and the size of the memory VT must match.
56696 for (SDNode *User : Ptr->uses())
56697 if (User != N && User->getOpcode() == N->getOpcode() &&
56698 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56699 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56700 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56701 MemVT.getSizeInBits() &&
56702 !User->hasAnyUseOfValue(1) &&
56703 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56704 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56705 VT.getSizeInBits());
56706 Extract = DAG.getBitcast(VT, Extract);
56707 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56708 }
56709
56710 return SDValue();
56711}
56712
56713 static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56714 const X86Subtarget &Subtarget) {
56715 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56716 return SDValue();
56717
56718 bool IsStrict = N->isStrictFPOpcode();
56719 EVT VT = N->getValueType(0);
56720 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56721 EVT SrcVT = Src.getValueType();
56722
56723 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56724 SrcVT.getVectorElementType() != MVT::f32)
56725 return SDValue();
56726
56727 SDLoc dl(N);
56728
56729 SDValue Cvt, Chain;
56730 unsigned NumElts = VT.getVectorNumElements();
56731 if (Subtarget.hasFP16()) {
56732 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
56733 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
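// Each CVTXI2P of a v4i64 source only defines the low four f16 lanes, so the
// {0,1,2,3,8,9,10,11} shuffle below packs the two low halves back together.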
56734 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
56735 SDValue Cvt0, Cvt1;
56736 SDValue Op0 = Src.getOperand(0);
56737 SDValue Op1 = Src.getOperand(1);
56738 bool IsOp0Strict = Op0->isStrictFPOpcode();
56739 if (Op0.getOpcode() != Op1.getOpcode() ||
56740 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
56741 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
56742 return SDValue();
56743 }
56744 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
56745 if (IsStrict) {
56746 assert(IsOp0Strict && "Op0 must be strict node");
56747 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
56748 ? X86ISD::STRICT_CVTSI2P
56749 : X86ISD::STRICT_CVTUI2P;
56750 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56751 {Op0.getOperand(0), Op0.getOperand(1)});
56752 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
56753 {Op1.getOperand(0), Op1.getOperand(1)});
56754 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56755 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
56756 }
56757 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
56758 : X86ISD::CVTUI2P;
56759 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
56760 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
56761 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
56762 }
56763 return SDValue();
56764 }
56765
56766 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56767 return SDValue();
56768
56769 // Widen to at least 4 input elements.
56770 if (NumElts < 4)
56771 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56772 DAG.getConstantFP(0.0, dl, SrcVT));
56773
56774 // Destination is v8i16 with at least 8 elements.
56775 EVT CvtVT =
56776 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
56777 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56778 if (IsStrict) {
56779 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56780 {N->getOperand(0), Src, Rnd});
56781 Chain = Cvt.getValue(1);
56782 } else {
56783 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56784 }
56785
56786 // Extract down to real number of elements.
56787 if (NumElts < 8) {
56788 EVT IntVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
56789 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56790 DAG.getIntPtrConstant(0, dl));
56791 }
56792
56793 Cvt = DAG.getBitcast(VT, Cvt);
56794
56795 if (IsStrict)
56796 return DAG.getMergeValues({Cvt, Chain}, dl);
56797
56798 return Cvt;
56799}
56800
56801 static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56802 SDValue Src = N->getOperand(0);
56803
56804 // Turn MOVDQ2Q+simple_load into an mmx load.
56805 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56806 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56807
56808 if (LN->isSimple()) {
56809 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56810 LN->getBasePtr(),
56811 LN->getPointerInfo(),
56812 LN->getOriginalAlign(),
56813 LN->getMemOperand()->getFlags());
56814 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56815 return NewLd;
56816 }
56817 }
56818
56819 return SDValue();
56820}
56821
56822 static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56823 TargetLowering::DAGCombinerInfo &DCI) {
56824 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56825 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56826 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56827 return SDValue(N, 0);
56828
56829 return SDValue();
56830}
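// Illustrative note (annotation, not part of the upstream source): asking
// SimplifyDemandedBits for every result bit of the PDEP lets the target hook
// exploit the fact that only bits under the mask operand can ever be
// deposited, so computations feeding the source operand whose bits can never
// reach the result may be pruned.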
56831
56832 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56833 DAGCombinerInfo &DCI) const {
56834 SelectionDAG &DAG = DCI.DAG;
56835 switch (N->getOpcode()) {
56836 // clang-format off
56837 default: break;
56838 case ISD::SCALAR_TO_VECTOR:
56839 return combineScalarToVector(N, DAG);
56840 case ISD::EXTRACT_VECTOR_ELT:
56841 case X86ISD::PEXTRW:
56842 case X86ISD::PEXTRB:
56843 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56844 case ISD::CONCAT_VECTORS:
56845 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56846 case ISD::INSERT_SUBVECTOR:
56847 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56848 case ISD::EXTRACT_SUBVECTOR:
56849 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56850 case ISD::VSELECT:
56851 case ISD::SELECT:
56852 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56853 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56854 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56855 case X86ISD::CMP: return combineCMP(N, DAG, Subtarget);
56856 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56857 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56858 case X86ISD::ADD:
56859 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56860 case X86ISD::SBB: return combineSBB(N, DAG);
56861 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56862 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56863 case ISD::SHL: return combineShiftLeft(N, DAG);
56864 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56865 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56866 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56867 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56868 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56869 case ISD::BITREVERSE: return combineBITREVERSE(N, DAG, DCI, Subtarget);
56870 case X86ISD::BEXTR:
56871 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56872 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56873 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56874 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56875 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56876 case X86ISD::VEXTRACT_STORE:
56877 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56878 case ISD::SINT_TO_FP:
56879 case ISD::STRICT_SINT_TO_FP:
56880 return combineSIntToFP(N, DAG, DCI, Subtarget);
56881 case ISD::UINT_TO_FP:
56882 case ISD::STRICT_UINT_TO_FP:
56883 return combineUIntToFP(N, DAG, Subtarget);
56884 case ISD::LRINT:
56885 case ISD::LLRINT: return combineLRINT_LLRINT(N, DAG, Subtarget);
56886 case ISD::FADD:
56887 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56888 case X86ISD::VFCMULC:
56889 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56890 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56891 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56892 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56893 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56894 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56895 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56896 case X86ISD::FXOR:
56897 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56898 case X86ISD::FMIN:
56899 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56900 case ISD::FMINNUM:
56901 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
56902 case X86ISD::CVTSI2P:
56903 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
56904 case X86ISD::CVTP2SI:
56905 case X86ISD::CVTP2UI:
56906 case X86ISD::STRICT_CVTTP2SI:
56907 case X86ISD::CVTTP2SI:
56908 case X86ISD::STRICT_CVTTP2UI:
56909 case X86ISD::CVTTP2UI:
56910 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
56911 case X86ISD::STRICT_CVTPH2PS:
56912 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
56913 case X86ISD::BT: return combineBT(N, DAG, DCI);
56914 case ISD::ANY_EXTEND:
56915 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
56916 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
56917 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
56918 case ISD::ANY_EXTEND_VECTOR_INREG:
56919 case ISD::SIGN_EXTEND_VECTOR_INREG:
56920 case ISD::ZERO_EXTEND_VECTOR_INREG:
56921 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
56922 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
56923 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
56924 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
56925 case X86ISD::PACKSS:
56926 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
56927 case X86ISD::HADD:
56928 case X86ISD::HSUB:
56929 case X86ISD::FHADD:
56930 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
56931 case X86ISD::VSHL:
56932 case X86ISD::VSRA:
56933 case X86ISD::VSRL:
56934 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
56935 case X86ISD::VSHLI:
56936 case X86ISD::VSRAI:
56937 case X86ISD::VSRLI:
56938 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
56939 case ISD::INSERT_VECTOR_ELT:
56940 case X86ISD::PINSRB:
56941 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
56942 case X86ISD::SHUFP: // Handle all target specific shuffles
56943 case X86ISD::INSERTPS:
56944 case X86ISD::EXTRQI:
56945 case X86ISD::INSERTQI:
56946 case X86ISD::VALIGN:
56947 case X86ISD::PALIGNR:
56948 case X86ISD::VSHLDQ:
56949 case X86ISD::VSRLDQ:
56950 case X86ISD::BLENDI:
56951 case X86ISD::UNPCKH:
56952 case X86ISD::UNPCKL:
56953 case X86ISD::MOVHLPS:
56954 case X86ISD::MOVLHPS:
56955 case X86ISD::PSHUFB:
56956 case X86ISD::PSHUFD:
56957 case X86ISD::PSHUFHW:
56958 case X86ISD::PSHUFLW:
56959 case X86ISD::MOVSHDUP:
56960 case X86ISD::MOVSLDUP:
56961 case X86ISD::MOVDDUP:
56962 case X86ISD::MOVSS:
56963 case X86ISD::MOVSD:
56964 case X86ISD::MOVSH:
56965 case X86ISD::VBROADCAST:
56966 case X86ISD::VPPERM:
56967 case X86ISD::VPERMI:
56968 case X86ISD::VPERMV:
56969 case X86ISD::VPERMV3:
56970 case X86ISD::VPERMIL2:
56971 case X86ISD::VPERMILPI:
56972 case X86ISD::VPERMILPV:
56973 case X86ISD::VPERM2X128:
56974 case X86ISD::SHUF128:
56975 case X86ISD::VZEXT_MOVL:
56976 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
56977 case X86ISD::FMADD_RND:
56978 case X86ISD::FMSUB:
56979 case X86ISD::STRICT_FMSUB:
56980 case X86ISD::FMSUB_RND:
56981 case X86ISD::FNMADD:
56982 case X86ISD::STRICT_FNMADD:
56983 case X86ISD::FNMADD_RND:
56984 case X86ISD::FNMSUB:
56985 case X86ISD::STRICT_FNMSUB:
56986 case X86ISD::FNMSUB_RND:
56987 case ISD::FMA:
56988 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56989 case X86ISD::FMADDSUB_RND:
56990 case X86ISD::FMSUBADD_RND:
56991 case X86ISD::FMADDSUB:
56992 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56993 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56994 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
56995 case X86ISD::MGATHER:
56996 case X86ISD::MSCATTER: return combineX86GatherScatter(N, DAG, DCI);
56997 case ISD::MGATHER:
56998 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56999 case X86ISD::PCMPEQ:
57000 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57001 case X86ISD::PMULDQ:
57002 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57003 case X86ISD::VPMADDUBSW:
57004 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57005 case X86ISD::KSHIFTL:
57006 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57007 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57008 case ISD::STRICT_FP_EXTEND:
57009 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57010 case ISD::STRICT_FP_ROUND:
57011 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57012 case X86ISD::VBROADCAST_LOAD:
57013 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57014 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57015 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57016 // clang-format on
57017 }
57018
57019 return SDValue();
57020}
57021
57022 bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57023 return false;
57024}
57025
57026// Prefer (non-AVX512) vector TRUNCATE(SIGN_EXTEND_INREG(X)) to use of PACKSS.
57027 bool X86TargetLowering::preferSextInRegOfTruncate(SDValue Op, EVT VT,
57028 EVT ExtVT) const {
57029 return Subtarget.hasAVX512() || !VT.isVector();
57030}
57031
57032bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57033 if (!isTypeLegal(VT))
57034 return false;
57035
57036 // There are no vXi8 shifts.
57037 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57038 return false;
57039
57040 // TODO: Almost no 8-bit ops are desirable because they have no actual
57041 // size/speed advantages vs. 32-bit ops, but they do have a major
57042 // potential disadvantage by causing partial register stalls.
57043 //
57044 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57045 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57046 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57047 // check for a constant operand to the multiply.
57048 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57049 return false;
57050
57051 // i16 instruction encodings are longer and some i16 instructions are slow,
57052 // so those are not desirable.
57053 if (VT == MVT::i16) {
57054 switch (Opc) {
57055 default:
57056 break;
57057 case ISD::LOAD:
57058 case ISD::SIGN_EXTEND:
57059 case ISD::ZERO_EXTEND:
57060 case ISD::ANY_EXTEND:
57061 case ISD::SHL:
57062 case ISD::SRA:
57063 case ISD::SRL:
57064 case ISD::SUB:
57065 case ISD::ADD:
57066 case ISD::MUL:
57067 case ISD::AND:
57068 case ISD::OR:
57069 case ISD::XOR:
57070 return false;
57071 }
57072 }
57073
57074 // Any legal type not explicitly accounted for above here is desirable.
57075 return true;
57076}
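// Illustrative note (annotation, not part of the upstream source): rejecting
// i16 here means a source-level 'short' addition is normally promoted and
// selected as a 32-bit ADD (e.g. "addl %esi, %edi") rather than an
// operand-size-prefixed 16-bit add, avoiding the longer encoding and
// partial-register update penalties the comment above describes.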
57077
57078 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc &dl,
57079 SDValue Value, SDValue Addr,
57080 int JTI,
57081 SelectionDAG &DAG) const {
57082 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57083 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57084 if (IsCFProtectionSupported) {
57085 // In case control-flow branch protection is enabled, we need to add
57086 // notrack prefix to the indirect branch.
57087 // In order to do that we create NT_BRIND SDNode.
57088 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
57089 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Value, dl);
57090 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, JTInfo, Addr);
57091 }
57092
57093 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, JTI, DAG);
57094}
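// Illustrative note (annotation, not part of the upstream source): when the
// module flag "cf-protection-branch" is set (e.g. code built with
// -fcf-protection=branch), the jump-table dispatch above becomes an NT_BRIND
// node that prints as a "notrack jmp *%reg" style branch, exempting this
// compiler-generated indirect jump from the CET endbr tracking check.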
57095
57096 TargetLowering::AndOrSETCCFoldKind
57097 X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57098 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57099 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57100 EVT VT = LogicOp->getValueType(0);
57101 EVT OpVT = SETCC0->getOperand(0).getValueType();
57102 if (!VT.isInteger())
57103 return AndOrSETCCFoldKind::None;
57104
57105 if (VT.isVector())
57106 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57107 (isOperationLegal(ISD::ABS, OpVT)
57108 ? AndOrSETCCFoldKind::ABS
57109 : AndOrSETCCFoldKind::None));
57110
57111 // Don't use `NotAnd` as even though `not` is generally shorter code size than
57112 // `add`, `add` can lower to LEA which can save moves / spills. Any case where
57113 // `NotAnd` applies, `AddAnd` does as well.
57114 // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`,
57115 // if we change that to `andn Y, X` it may be worth prefering `NotAnd` here.
57116 return AndOrSETCCFoldKind::AddAnd;
57117}
57118
57120 EVT VT = Op.getValueType();
57121 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57122 isa<ConstantSDNode>(Op.getOperand(1));
57123
57124 // i16 is legal, but undesirable since i16 instruction encodings are longer
57125 // and some i16 instructions are slow.
57126 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57127 // using LEA and/or other ALU ops.
57128 if (VT != MVT::i16 && !Is8BitMulByConstant)
57129 return false;
57130
57131 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57132 if (!Op.hasOneUse())
57133 return false;
57134 SDNode *User = *Op->use_begin();
57135 if (User->getOpcode() != ISD::STORE)
57136 return false;
57137 auto *Ld = cast<LoadSDNode>(Load);
57138 auto *St = cast<StoreSDNode>(User);
57139 return Ld->getBasePtr() == St->getBasePtr();
57140 };
57141
57142 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57143 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57144 return false;
57145 if (!Op.hasOneUse())
57146 return false;
57147 SDNode *User = *Op->use_begin();
57148 if (User->getOpcode() != ISD::ATOMIC_STORE)
57149 return false;
57150 auto *Ld = cast<AtomicSDNode>(Load);
57151 auto *St = cast<AtomicSDNode>(User);
57152 return Ld->getBasePtr() == St->getBasePtr();
57153 };
57154
57155 bool Commute = false;
57156 switch (Op.getOpcode()) {
57157 default: return false;
57158 case ISD::SIGN_EXTEND:
57159 case ISD::ZERO_EXTEND:
57160 case ISD::ANY_EXTEND:
57161 break;
57162 case ISD::SHL:
57163 case ISD::SRA:
57164 case ISD::SRL: {
57165 SDValue N0 = Op.getOperand(0);
57166 // Look out for (store (shl (load), x)).
57167 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57168 return false;
57169 break;
57170 }
57171 case ISD::ADD:
57172 case ISD::MUL:
57173 case ISD::AND:
57174 case ISD::OR:
57175 case ISD::XOR:
57176 Commute = true;
57177 [[fallthrough]];
57178 case ISD::SUB: {
57179 SDValue N0 = Op.getOperand(0);
57180 SDValue N1 = Op.getOperand(1);
57181 // Avoid disabling potential load folding opportunities.
57182 if (X86::mayFoldLoad(N1, Subtarget) &&
57183 (!Commute || !isa<ConstantSDNode>(N0) ||
57184 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57185 return false;
57186 if (X86::mayFoldLoad(N0, Subtarget) &&
57187 ((Commute && !isa<ConstantSDNode>(N1)) ||
57188 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57189 return false;
57190 if (IsFoldableAtomicRMW(N0, Op) ||
57191 (Commute && IsFoldableAtomicRMW(N1, Op)))
57192 return false;
57193 }
57194 }
57195
57196 PVT = MVT::i32;
57197 return true;
57198}
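// Illustrative note (annotation, not part of the upstream source): the
// IsFoldableRMW check above keeps sequences like
//   movw (%rdi), %ax ; orw %si, %ax ; movw %ax, (%rdi)
// eligible for a single memory-destination "orw %si, (%rdi)"; promoting the
// i16 OR to i32 first would block that read-modify-write folding, so
// promotion is skipped in that case.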
57199
57200//===----------------------------------------------------------------------===//
57201// X86 Inline Assembly Support
57202//===----------------------------------------------------------------------===//
57203
57204// Helper to match a string separated by whitespace.
57205 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57206 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57207
57208 for (StringRef Piece : Pieces) {
57209 if (!S.starts_with(Piece)) // Check if the piece matches.
57210 return false;
57211
57212 S = S.substr(Piece.size());
57213 StringRef::size_type Pos = S.find_first_not_of(" \t");
57214 if (Pos == 0) // We matched a prefix.
57215 return false;
57216
57217 S = S.substr(Pos);
57218 }
57219
57220 return S.empty();
57221}
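// Illustrative usage (annotation, not part of the upstream source):
//   matchAsm("  bswap $0", {"bswap", "$0"})  -> true
//   matchAsm("bswapl$0",  {"bswapl", "$0"})  -> false, because at least one
//                                               whitespace character is
//                                               required between pieces.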
57222
57223 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57224
57225 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57226 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57227 llvm::is_contained(AsmPieces, "~{flags}") &&
57228 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57229
57230 if (AsmPieces.size() == 3)
57231 return true;
57232 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57233 return true;
57234 }
57235 }
57236 return false;
57237}
57238
57239 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57240 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57241
57242 const std::string &AsmStr = IA->getAsmString();
57243
57244 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57245 if (!Ty || Ty->getBitWidth() % 16 != 0)
57246 return false;
57247
57248 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57249 SmallVector<StringRef, 4> AsmPieces;
57250 SplitString(AsmStr, AsmPieces, ";\n");
57251
57252 switch (AsmPieces.size()) {
57253 default: return false;
57254 case 1:
57255 // FIXME: this should verify that we are targeting a 486 or better. If not,
57256 // we will turn this bswap into something that will be lowered to logical
57257 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57258 // lower so don't worry about this.
57259 // bswap $0
57260 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57261 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57262 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57263 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57264 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57265 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57266 // No need to check constraints, nothing other than the equivalent of
57267 // "=r,0" would be valid here.
57268 return IntrinsicLowering::LowerToByteSwap(CI);
57269 }
57270
57271 // rorw $$8, ${0:w} --> llvm.bswap.i16
57272 if (CI->getType()->isIntegerTy(16) &&
57273 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57274 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57275 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57276 AsmPieces.clear();
57277 StringRef ConstraintsStr = IA->getConstraintString();
57278 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57279 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57280 if (clobbersFlagRegisters(AsmPieces))
57281 return IntrinsicLowering::LowerToByteSwap(CI);
57282 }
57283 break;
57284 case 3:
57285 if (CI->getType()->isIntegerTy(32) &&
57286 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57287 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57288 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57289 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57290 AsmPieces.clear();
57291 StringRef ConstraintsStr = IA->getConstraintString();
57292 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57293 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57294 if (clobbersFlagRegisters(AsmPieces))
57295 return IntrinsicLowering::LowerToByteSwap(CI);
57296 }
57297
57298 if (CI->getType()->isIntegerTy(64)) {
57299 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57300 if (Constraints.size() >= 2 &&
57301 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57302 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57303 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57304 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57305 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57306 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57307 return IntrinsicLowering::LowerToByteSwap(CI);
57308 }
57309 }
57310 break;
57311 }
57312 return false;
57313}
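// Illustrative note (annotation, not part of the upstream source): this lets
// legacy inline assembly reaching the IR as, e.g.,
//   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
// be replaced by a call to @llvm.bswap.i32, which the backend can then lower
// and optimize like any other intrinsic instead of treating it as opaque asm.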
57314
57315 static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57316 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57317 .Case("{@cca}", X86::COND_A)
57318 .Case("{@ccae}", X86::COND_AE)
57319 .Case("{@ccb}", X86::COND_B)
57320 .Case("{@ccbe}", X86::COND_BE)
57321 .Case("{@ccc}", X86::COND_B)
57322 .Case("{@cce}", X86::COND_E)
57323 .Case("{@ccz}", X86::COND_E)
57324 .Case("{@ccg}", X86::COND_G)
57325 .Case("{@ccge}", X86::COND_GE)
57326 .Case("{@ccl}", X86::COND_L)
57327 .Case("{@ccle}", X86::COND_LE)
57328 .Case("{@ccna}", X86::COND_BE)
57329 .Case("{@ccnae}", X86::COND_B)
57330 .Case("{@ccnb}", X86::COND_AE)
57331 .Case("{@ccnbe}", X86::COND_A)
57332 .Case("{@ccnc}", X86::COND_AE)
57333 .Case("{@ccne}", X86::COND_NE)
57334 .Case("{@ccnz}", X86::COND_NE)
57335 .Case("{@ccng}", X86::COND_LE)
57336 .Case("{@ccnge}", X86::COND_L)
57337 .Case("{@ccnl}", X86::COND_GE)
57338 .Case("{@ccnle}", X86::COND_G)
57339 .Case("{@ccno}", X86::COND_NO)
57340 .Case("{@ccnp}", X86::COND_NP)
57341 .Case("{@ccns}", X86::COND_NS)
57342 .Case("{@cco}", X86::COND_O)
57343 .Case("{@ccp}", X86::COND_P)
57344 .Case("{@ccs}", X86::COND_S)
57345 .Default(X86::COND_INVALID);
57346 return Cond;
57347}
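// Illustrative note (annotation, not part of the upstream source): these
// entries back GCC-style flag-output constraints, e.g.
//   asm("cmpq %2, %1" : "=@ccbe"(be) : "r"(a), "r"(b));
// where the IR-level constraint "{@ccbe}" maps to X86::COND_BE and the
// result is materialized from EFLAGS via SETCC in
// LowerAsmOutputForConstraint below.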
57348
57349/// Given a constraint letter, return the type of constraint for this target.
57350 X86TargetLowering::ConstraintType
57351 X86TargetLowering::getConstraintType(StringRef Constraint) const {
57352 if (Constraint.size() == 1) {
57353 switch (Constraint[0]) {
57354 case 'R':
57355 case 'q':
57356 case 'Q':
57357 case 'f':
57358 case 't':
57359 case 'u':
57360 case 'y':
57361 case 'x':
57362 case 'v':
57363 case 'l':
57364 case 'k': // AVX512 masking registers.
57365 return C_RegisterClass;
57366 case 'a':
57367 case 'b':
57368 case 'c':
57369 case 'd':
57370 case 'S':
57371 case 'D':
57372 case 'A':
57373 return C_Register;
57374 case 'I':
57375 case 'J':
57376 case 'K':
57377 case 'N':
57378 case 'G':
57379 case 'L':
57380 case 'M':
57381 return C_Immediate;
57382 case 'C':
57383 case 'e':
57384 case 'Z':
57385 return C_Other;
57386 default:
57387 break;
57388 }
57389 }
57390 else if (Constraint.size() == 2) {
57391 switch (Constraint[0]) {
57392 default:
57393 break;
57394 case 'W':
57395 if (Constraint[1] != 's')
57396 break;
57397 return C_Other;
57398 case 'Y':
57399 switch (Constraint[1]) {
57400 default:
57401 break;
57402 case 'z':
57403 return C_Register;
57404 case 'i':
57405 case 'm':
57406 case 'k':
57407 case 't':
57408 case '2':
57409 return C_RegisterClass;
57410 }
57411 }
57412 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57413 return C_Other;
57414 return TargetLowering::getConstraintType(Constraint);
57415}
57416
57417/// Examine constraint type and operand type and determine a weight value.
57418/// This object must already have been set up with the operand type
57419/// and the current alternative constraint selected.
57420 TargetLowering::ConstraintWeight
57421 X86TargetLowering::getSingleConstraintMatchWeight(
57422 AsmOperandInfo &Info, const char *Constraint) const {
57423 ConstraintWeight Wt = CW_Invalid;
57424 Value *CallOperandVal = Info.CallOperandVal;
57425 // If we don't have a value, we can't do a match,
57426 // but allow it at the lowest weight.
57427 if (!CallOperandVal)
57428 return CW_Default;
57429 Type *Ty = CallOperandVal->getType();
57430 // Look at the constraint type.
57431 switch (*Constraint) {
57432 default:
57433 Wt = TargetLowering::getSingleConstraintMatchWeight(Info, Constraint);
57434 [[fallthrough]];
57435 case 'R':
57436 case 'q':
57437 case 'Q':
57438 case 'a':
57439 case 'b':
57440 case 'c':
57441 case 'd':
57442 case 'S':
57443 case 'D':
57444 case 'A':
57445 if (CallOperandVal->getType()->isIntegerTy())
57446 Wt = CW_SpecificReg;
57447 break;
57448 case 'f':
57449 case 't':
57450 case 'u':
57451 if (Ty->isFloatingPointTy())
57452 Wt = CW_SpecificReg;
57453 break;
57454 case 'y':
57455 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57456 Wt = CW_SpecificReg;
57457 break;
57458 case 'Y':
57459 if (StringRef(Constraint).size() != 2)
57460 break;
57461 switch (Constraint[1]) {
57462 default:
57463 return CW_Invalid;
57464 // XMM0
57465 case 'z':
57466 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57467 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
57468 ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
57469 return CW_SpecificReg;
57470 return CW_Invalid;
57471 // Conditional OpMask regs (AVX512)
57472 case 'k':
57473 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57474 return CW_Register;
57475 return CW_Invalid;
57476 // Any MMX reg
57477 case 'm':
57478 if (Ty->isX86_MMXTy() && Subtarget.hasMMX())
57479 return Wt;
57480 return CW_Invalid;
57481 // Any SSE reg when ISA >= SSE2, same as 'x'
57482 case 'i':
57483 case 't':
57484 case '2':
57485 if (!Subtarget.hasSSE2())
57486 return CW_Invalid;
57487 break;
57488 }
57489 break;
57490 case 'v':
57491 if ((Ty->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
57492 Wt = CW_Register;
57493 [[fallthrough]];
57494 case 'x':
57495 if (((Ty->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
57496 ((Ty->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
57497 Wt = CW_Register;
57498 break;
57499 case 'k':
57500 // Enable conditional vector operations using %k<#> registers.
57501 if ((Ty->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
57502 Wt = CW_Register;
57503 break;
57504 case 'I':
57505 if (auto *C = dyn_cast<ConstantInt>(Info.CallOperandVal))
57506 if (C->getZExtValue() <= 31)
57507 Wt = CW_Constant;
57508 break;
57509 case 'J':
57510 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57511 if (C->getZExtValue() <= 63)
57512 Wt = CW_Constant;
57513 break;
57514 case 'K':
57515 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57516 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
57517 Wt = CW_Constant;
57518 break;
57519 case 'L':
57520 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57521 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
57522 Wt = CW_Constant;
57523 break;
57524 case 'M':
57525 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57526 if (C->getZExtValue() <= 3)
57527 Wt = CW_Constant;
57528 break;
57529 case 'N':
57530 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57531 if (C->getZExtValue() <= 0xff)
57532 Wt = CW_Constant;
57533 break;
57534 case 'G':
57535 case 'C':
57536 if (isa<ConstantFP>(CallOperandVal))
57537 Wt = CW_Constant;
57538 break;
57539 case 'e':
57540 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57541 if ((C->getSExtValue() >= -0x80000000LL) &&
57542 (C->getSExtValue() <= 0x7fffffffLL))
57543 Wt = CW_Constant;
57544 break;
57545 case 'Z':
57546 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal))
57547 if (C->getZExtValue() <= 0xffffffff)
57548 Wt = CW_Constant;
57549 break;
57550 }
57551 return Wt;
57552}
57553
57554/// Try to replace an X constraint, which matches anything, with another that
57555/// has more specific requirements based on the type of the corresponding
57556/// operand.
57557 const char *X86TargetLowering::
57558 LowerXConstraint(EVT ConstraintVT) const {
57559 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57560 // 'f' like normal targets.
57561 if (ConstraintVT.isFloatingPoint()) {
57562 if (Subtarget.hasSSE1())
57563 return "x";
57564 }
57565
57566 return TargetLowering::LowerXConstraint(ConstraintVT);
57567}
57568
57569// Lower @cc targets via setcc.
57570 SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57571 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
57572 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57573 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57574 if (Cond == X86::COND_INVALID)
57575 return SDValue();
57576 // Check that return type is valid.
57577 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57578 OpInfo.ConstraintVT.getSizeInBits() < 8)
57579 report_fatal_error("Glue output operand is of invalid type");
57580
57581 // Get EFLAGS register. Only update chain when copyfrom is glued.
57582 if (Glue.getNode()) {
57583 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
57584 Chain = Glue.getValue(1);
57585 } else
57586 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57587 // Extract CC code.
57588 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
57589 // Extend to 32-bits
57590 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57591
57592 return Result;
57593}
57594
57595/// Lower the specified operand into the Ops vector.
57596/// If it is invalid, don't add anything to Ops.
57597 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57598 StringRef Constraint,
57599 std::vector<SDValue> &Ops,
57600 SelectionDAG &DAG) const {
57601 SDValue Result;
57602 char ConstraintLetter = Constraint[0];
57603 switch (ConstraintLetter) {
57604 default: break;
57605 case 'I':
57606 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57607 if (C->getZExtValue() <= 31) {
57608 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57609 Op.getValueType());
57610 break;
57611 }
57612 }
57613 return;
57614 case 'J':
57615 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57616 if (C->getZExtValue() <= 63) {
57617 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57618 Op.getValueType());
57619 break;
57620 }
57621 }
57622 return;
57623 case 'K':
57624 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57625 if (isInt<8>(C->getSExtValue())) {
57626 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57627 Op.getValueType());
57628 break;
57629 }
57630 }
57631 return;
57632 case 'L':
57633 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57634 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57635 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57636 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57637 Op.getValueType());
57638 break;
57639 }
57640 }
57641 return;
57642 case 'M':
57643 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57644 if (C->getZExtValue() <= 3) {
57645 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57646 Op.getValueType());
57647 break;
57648 }
57649 }
57650 return;
57651 case 'N':
57652 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57653 if (C->getZExtValue() <= 255) {
57654 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57655 Op.getValueType());
57656 break;
57657 }
57658 }
57659 return;
57660 case 'O':
57661 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57662 if (C->getZExtValue() <= 127) {
57663 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57664 Op.getValueType());
57665 break;
57666 }
57667 }
57668 return;
57669 case 'e': {
57670 // 32-bit signed value
57671 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57672 if (ConstantSDNode::isValueValidForType(MVT::i32,
57673 C->getSExtValue())) {
57674 // Widen to 64 bits here to get it sign extended.
57675 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57676 break;
57677 }
57678 // FIXME gcc accepts some relocatable values here too, but only in certain
57679 // memory models; it's complicated.
57680 }
57681 return;
57682 }
57683 case 'W': {
57684 assert(Constraint[1] == 's');
57685 // Op is a BlockAddressSDNode or a GlobalAddressSDNode with an optional
57686 // offset.
57687 if (const auto *BA = dyn_cast<BlockAddressSDNode>(Op)) {
57688 Ops.push_back(DAG.getTargetBlockAddress(BA->getBlockAddress(),
57689 BA->getValueType(0)));
57690 } else {
57691 int64_t Offset = 0;
57692 if (Op->getOpcode() == ISD::ADD &&
57693 isa<ConstantSDNode>(Op->getOperand(1))) {
57694 Offset = cast<ConstantSDNode>(Op->getOperand(1))->getSExtValue();
57695 Op = Op->getOperand(0);
57696 }
57697 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57698 Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
57699 GA->getValueType(0), Offset));
57700 }
57701 return;
57702 }
57703 case 'Z': {
57704 // 32-bit unsigned value
57705 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57706 if (ConstantSDNode::isValueValidForType(MVT::i32,
57707 C->getZExtValue())) {
57708 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57709 Op.getValueType());
57710 break;
57711 }
57712 }
57713 // FIXME gcc accepts some relocatable values here too, but only in certain
57714 // memory models; it's complicated.
57715 return;
57716 }
57717 case 'i': {
57718 // Literal immediates are always ok.
57719 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57720 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57721 BooleanContent BCont = getBooleanContents(MVT::i64);
57722 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57723 : ISD::SIGN_EXTEND;
57724 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57725 : CST->getSExtValue();
57726 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57727 break;
57728 }
57729
57730 // In any sort of PIC mode addresses need to be computed at runtime by
57731 // adding in a register or some sort of table lookup. These can't
57732 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57733 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57734 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57735 return;
57736
57737 // If we are in non-pic codegen mode, we allow the address of a global (with
57738 // an optional displacement) to be used with 'i'.
57739 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57740 // If we require an extra load to get this address, as in PIC mode, we
57741 // can't accept it.
57742 if (isGlobalStubReference(
57743 Subtarget.classifyGlobalReference(GA->getGlobal())))
57744 return;
57745 break;
57746 }
57747 }
57748
57749 if (Result.getNode()) {
57750 Ops.push_back(Result);
57751 return;
57752 }
57753 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57754}
57755
57756/// Check if \p RC is a general purpose register class.
57757/// I.e., GR* or one of their variant.
57758static bool isGRClass(const TargetRegisterClass &RC) {
57759 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57760 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57761 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57762 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57763 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57764}
57765
57766/// Check if \p RC is a vector register class.
57767/// I.e., FR* / VR* or one of their variant.
57768static bool isFRClass(const TargetRegisterClass &RC) {
57769 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57770 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57771 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57772 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57773 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57774 RC.hasSuperClassEq(&X86::VR512RegClass);
57775}
57776
57777/// Check if \p RC is a mask register class.
57778/// I.e., VK* or one of their variant.
57779static bool isVKClass(const TargetRegisterClass &RC) {
57780 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57781 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57782 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57783 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57784 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57785 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57786 RC.hasSuperClassEq(&X86::VK64RegClass);
57787}
57788
57789std::pair<unsigned, const TargetRegisterClass *>
57790 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57791 StringRef Constraint,
57792 MVT VT) const {
57793 // First, see if this is a constraint that directly corresponds to an LLVM
57794 // register class.
57795 if (Constraint.size() == 1) {
57796 // GCC Constraint Letters
57797 switch (Constraint[0]) {
57798 default: break;
57799 // 'A' means [ER]AX + [ER]DX.
57800 case 'A':
57801 if (Subtarget.is64Bit())
57802 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57803 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57804 "Expecting 64, 32 or 16 bit subtarget");
57805 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57806
57807 // TODO: Slight differences here in allocation order and leaving
57808 // RIP in the class. Do they matter any more here than they do
57809 // in the normal allocation?
57810 case 'k':
57811 if (Subtarget.hasAVX512()) {
57812 if (VT == MVT::v1i1 || VT == MVT::i1)
57813 return std::make_pair(0U, &X86::VK1RegClass);
57814 if (VT == MVT::v8i1 || VT == MVT::i8)
57815 return std::make_pair(0U, &X86::VK8RegClass);
57816 if (VT == MVT::v16i1 || VT == MVT::i16)
57817 return std::make_pair(0U, &X86::VK16RegClass);
57818 }
57819 if (Subtarget.hasBWI()) {
57820 if (VT == MVT::v32i1 || VT == MVT::i32)
57821 return std::make_pair(0U, &X86::VK32RegClass);
57822 if (VT == MVT::v64i1 || VT == MVT::i64)
57823 return std::make_pair(0U, &X86::VK64RegClass);
57824 }
57825 break;
57826 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57827 if (Subtarget.is64Bit()) {
57828 if (VT == MVT::i8 || VT == MVT::i1)
57829 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57830 if (VT == MVT::i16)
57831 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57832 if (VT == MVT::i32 || VT == MVT::f32)
57833 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57834 if (VT != MVT::f80 && !VT.isVector())
57835 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57836 break;
57837 }
57838 [[fallthrough]];
57839 // 32-bit fallthrough
57840 case 'Q': // Q_REGS
57841 if (VT == MVT::i8 || VT == MVT::i1)
57842 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57843 if (VT == MVT::i16)
57844 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57845 if (VT == MVT::i32 || VT == MVT::f32 ||
57846 (!VT.isVector() && !Subtarget.is64Bit()))
57847 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57848 if (VT != MVT::f80 && !VT.isVector())
57849 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57850 break;
57851 case 'r': // GENERAL_REGS
57852 case 'l': // INDEX_REGS
57853 if (VT == MVT::i8 || VT == MVT::i1)
57854 return std::make_pair(0U, &X86::GR8_NOREX2RegClass);
57855 if (VT == MVT::i16)
57856 return std::make_pair(0U, &X86::GR16_NOREX2RegClass);
57857 if (VT == MVT::i32 || VT == MVT::f32 ||
57858 (!VT.isVector() && !Subtarget.is64Bit()))
57859 return std::make_pair(0U, &X86::GR32_NOREX2RegClass);
57860 if (VT != MVT::f80 && !VT.isVector())
57861 return std::make_pair(0U, &X86::GR64_NOREX2RegClass);
57862 break;
57863 case 'R': // LEGACY_REGS
57864 if (VT == MVT::i8 || VT == MVT::i1)
57865 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57866 if (VT == MVT::i16)
57867 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57868 if (VT == MVT::i32 || VT == MVT::f32 ||
57869 (!VT.isVector() && !Subtarget.is64Bit()))
57870 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57871 if (VT != MVT::f80 && !VT.isVector())
57872 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57873 break;
57874 case 'f': // FP Stack registers.
57875 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57876 // value to the correct fpstack register class.
57877 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57878 return std::make_pair(0U, &X86::RFP32RegClass);
57879 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57880 return std::make_pair(0U, &X86::RFP64RegClass);
57881 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57882 return std::make_pair(0U, &X86::RFP80RegClass);
57883 break;
57884 case 'y': // MMX_REGS if MMX allowed.
57885 if (!Subtarget.hasMMX()) break;
57886 return std::make_pair(0U, &X86::VR64RegClass);
57887 case 'v':
57888 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57889 if (!Subtarget.hasSSE1()) break;
57890 bool VConstraint = (Constraint[0] == 'v');
57891
57892 switch (VT.SimpleTy) {
57893 default: break;
57894 // Scalar SSE types.
57895 case MVT::f16:
57896 if (VConstraint && Subtarget.hasFP16())
57897 return std::make_pair(0U, &X86::FR16XRegClass);
57898 break;
57899 case MVT::f32:
57900 case MVT::i32:
57901 if (VConstraint && Subtarget.hasVLX())
57902 return std::make_pair(0U, &X86::FR32XRegClass);
57903 return std::make_pair(0U, &X86::FR32RegClass);
57904 case MVT::f64:
57905 case MVT::i64:
57906 if (VConstraint && Subtarget.hasVLX())
57907 return std::make_pair(0U, &X86::FR64XRegClass);
57908 return std::make_pair(0U, &X86::FR64RegClass);
57909 case MVT::i128:
57910 if (Subtarget.is64Bit()) {
57911 if (VConstraint && Subtarget.hasVLX())
57912 return std::make_pair(0U, &X86::VR128XRegClass);
57913 return std::make_pair(0U, &X86::VR128RegClass);
57914 }
57915 break;
57916 // Vector types and fp128.
57917 case MVT::v8f16:
57918 if (!Subtarget.hasFP16())
57919 break;
57920 if (VConstraint)
57921 return std::make_pair(0U, &X86::VR128XRegClass);
57922 return std::make_pair(0U, &X86::VR128RegClass);
57923 case MVT::v8bf16:
57924 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57925 break;
57926 if (VConstraint)
57927 return std::make_pair(0U, &X86::VR128XRegClass);
57928 return std::make_pair(0U, &X86::VR128RegClass);
57929 case MVT::f128:
57930 case MVT::v16i8:
57931 case MVT::v8i16:
57932 case MVT::v4i32:
57933 case MVT::v2i64:
57934 case MVT::v4f32:
57935 case MVT::v2f64:
57936 if (VConstraint && Subtarget.hasVLX())
57937 return std::make_pair(0U, &X86::VR128XRegClass);
57938 return std::make_pair(0U, &X86::VR128RegClass);
57939 // AVX types.
57940 case MVT::v16f16:
57941 if (!Subtarget.hasFP16())
57942 break;
57943 if (VConstraint)
57944 return std::make_pair(0U, &X86::VR256XRegClass);
57945 return std::make_pair(0U, &X86::VR256RegClass);
57946 case MVT::v16bf16:
57947 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
57948 break;
57949 if (VConstraint)
57950 return std::make_pair(0U, &X86::VR256XRegClass);
57951 return std::make_pair(0U, &X86::VR256RegClass);
57952 case MVT::v32i8:
57953 case MVT::v16i16:
57954 case MVT::v8i32:
57955 case MVT::v4i64:
57956 case MVT::v8f32:
57957 case MVT::v4f64:
57958 if (VConstraint && Subtarget.hasVLX())
57959 return std::make_pair(0U, &X86::VR256XRegClass);
57960 if (Subtarget.hasAVX())
57961 return std::make_pair(0U, &X86::VR256RegClass);
57962 break;
57963 case MVT::v32f16:
57964 if (!Subtarget.hasFP16())
57965 break;
57966 if (VConstraint)
57967 return std::make_pair(0U, &X86::VR512RegClass);
57968 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57969 case MVT::v32bf16:
57970 if (!Subtarget.hasBF16())
57971 break;
57972 if (VConstraint)
57973 return std::make_pair(0U, &X86::VR512RegClass);
57974 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57975 case MVT::v64i8:
57976 case MVT::v32i16:
57977 case MVT::v8f64:
57978 case MVT::v16f32:
57979 case MVT::v16i32:
57980 case MVT::v8i64:
57981 if (!Subtarget.hasAVX512()) break;
57982 if (VConstraint)
57983 return std::make_pair(0U, &X86::VR512RegClass);
57984 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57985 }
57986 break;
57987 }
57988 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
57989 switch (Constraint[1]) {
57990 default:
57991 break;
57992 case 'i':
57993 case 't':
57994 case '2':
57995 return getRegForInlineAsmConstraint(TRI, "x", VT);
57996 case 'm':
57997 if (!Subtarget.hasMMX()) break;
57998 return std::make_pair(0U, &X86::VR64RegClass);
57999 case 'z':
58000 if (!Subtarget.hasSSE1()) break;
58001 switch (VT.SimpleTy) {
58002 default: break;
58003 // Scalar SSE types.
58004 case MVT::f16:
58005 if (!Subtarget.hasFP16())
58006 break;
58007 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58008 case MVT::f32:
58009 case MVT::i32:
58010 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58011 case MVT::f64:
58012 case MVT::i64:
58013 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58014 case MVT::v8f16:
58015 if (!Subtarget.hasFP16())
58016 break;
58017 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58018 case MVT::v8bf16:
58019 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58020 break;
58021 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58022 case MVT::f128:
58023 case MVT::v16i8:
58024 case MVT::v8i16:
58025 case MVT::v4i32:
58026 case MVT::v2i64:
58027 case MVT::v4f32:
58028 case MVT::v2f64:
58029 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58030 // AVX types.
58031 case MVT::v16f16:
58032 if (!Subtarget.hasFP16())
58033 break;
58034 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58035 case MVT::v16bf16:
58036 if (!Subtarget.hasBF16() || !Subtarget.hasVLX())
58037 break;
58038 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58039 case MVT::v32i8:
58040 case MVT::v16i16:
58041 case MVT::v8i32:
58042 case MVT::v4i64:
58043 case MVT::v8f32:
58044 case MVT::v4f64:
58045 if (Subtarget.hasAVX())
58046 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58047 break;
58048 case MVT::v32f16:
58049 if (!Subtarget.hasFP16())
58050 break;
58051 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58052 case MVT::v32bf16:
58053 if (!Subtarget.hasBF16())
58054 break;
58055 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58056 case MVT::v64i8:
58057 case MVT::v32i16:
58058 case MVT::v8f64:
58059 case MVT::v16f32:
58060 case MVT::v16i32:
58061 case MVT::v8i64:
58062 if (Subtarget.hasAVX512())
58063 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58064 break;
58065 }
58066 break;
58067 case 'k':
58068 // This register class doesn't allocate k0 for masked vector operation.
58069 if (Subtarget.hasAVX512()) {
58070 if (VT == MVT::v1i1 || VT == MVT::i1)
58071 return std::make_pair(0U, &X86::VK1WMRegClass);
58072 if (VT == MVT::v8i1 || VT == MVT::i8)
58073 return std::make_pair(0U, &X86::VK8WMRegClass);
58074 if (VT == MVT::v16i1 || VT == MVT::i16)
58075 return std::make_pair(0U, &X86::VK16WMRegClass);
58076 }
58077 if (Subtarget.hasBWI()) {
58078 if (VT == MVT::v32i1 || VT == MVT::i32)
58079 return std::make_pair(0U, &X86::VK32WMRegClass);
58080 if (VT == MVT::v64i1 || VT == MVT::i64)
58081 return std::make_pair(0U, &X86::VK64WMRegClass);
58082 }
58083 break;
58084 }
58085 }
58086
58087 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58088 return std::make_pair(0U, &X86::GR32RegClass);
58089
58090 // Use the default implementation in TargetLowering to convert the register
58091 // constraint into a member of a register class.
58092 std::pair<Register, const TargetRegisterClass*> Res;
58093 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58094
58095 // Not found as a standard register?
58096 if (!Res.second) {
58097 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58098 // to/from f80.
58099 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58100 // Map st(0) -> st(7) -> ST0
58101 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58102 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58103 Constraint[3] == '(' &&
58104 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58105 Constraint[5] == ')' && Constraint[6] == '}') {
58106 // st(7) is not allocatable and thus not a member of RFP80. Return
58107 // singleton class in cases where we have a reference to it.
58108 if (Constraint[4] == '7')
58109 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58110 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58111 &X86::RFP80RegClass);
58112 }
58113
58114 // GCC allows "st(0)" to be called just plain "st".
58115 if (StringRef("{st}").equals_insensitive(Constraint))
58116 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58117 }
58118
58119 // flags -> EFLAGS
58120 if (StringRef("{flags}").equals_insensitive(Constraint))
58121 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58122
58123 // dirflag -> DF
58124 // Only allow for clobber.
58125 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58126 VT == MVT::Other)
58127 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58128
58129 // fpsr -> FPSW
58130 // Only allow for clobber.
58131 if (StringRef("{fpsr}").equals_insensitive(Constraint) && VT == MVT::Other)
58132 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58133
58134 return Res;
58135 }
58136
58137 // Make sure it isn't a register that requires 64-bit mode.
58138 if (!Subtarget.is64Bit() &&
58139 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58140 TRI->getEncodingValue(Res.first) >= 8) {
58141 // Register requires REX prefix, but we're in 32-bit mode.
58142 return std::make_pair(0, nullptr);
58143 }
58144
58145 // Make sure it isn't a register that requires AVX512.
58146 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58147 TRI->getEncodingValue(Res.first) & 0x10) {
58148 // Register requires EVEX prefix.
58149 return std::make_pair(0, nullptr);
58150 }
58151
58152 // Otherwise, check to see if this is a register class of the wrong value
58153 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58154 // turn into {ax},{dx}.
58155 // MVT::Other is used to specify clobber names.
58156 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58157 return Res; // Correct type already, nothing to do.
58158
58159 // Get a matching integer of the correct size. i.e. "ax" with MVT::32 should
58160 // return "eax". This should even work for things like getting 64bit integer
58161 // registers when given an f64 type.
58162 const TargetRegisterClass *Class = Res.second;
58163 // The generic code will match the first register class that contains the
58164 // given register. Thus, based on the ordering of the tablegened file,
58165 // the "plain" GR classes might not come first.
58166 // Therefore, use a helper method.
58167 if (isGRClass(*Class)) {
58168 unsigned Size = VT.getSizeInBits();
58169 if (Size == 1) Size = 8;
58170 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58171 return std::make_pair(0, nullptr);
58172 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58173 if (DestReg.isValid()) {
58174 bool is64Bit = Subtarget.is64Bit();
58175 const TargetRegisterClass *RC =
58176 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58177 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58178 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58179 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58180 if (Size == 64 && !is64Bit) {
58181 // Model GCC's behavior here and select a fixed pair of 32-bit
58182 // registers.
58183 switch (DestReg) {
58184 case X86::RAX:
58185 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58186 case X86::RDX:
58187 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58188 case X86::RCX:
58189 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58190 case X86::RBX:
58191 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58192 case X86::RSI:
58193 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58194 case X86::RDI:
58195 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58196 case X86::RBP:
58197 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58198 default:
58199 return std::make_pair(0, nullptr);
58200 }
58201 }
58202 if (RC && RC->contains(DestReg))
58203 return std::make_pair(DestReg, RC);
58204 return Res;
58205 }
58206 // No register found/type mismatch.
58207 return std::make_pair(0, nullptr);
58208 } else if (isFRClass(*Class)) {
58209 // Handle references to XMM physical registers that got mapped into the
58210 // wrong class. This can happen with constraints like {xmm0} where the
58211 // target independent register mapper will just pick the first match it can
58212 // find, ignoring the required type.
58213
58214 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58215 if (VT == MVT::f16)
58216 Res.second = &X86::FR16XRegClass;
58217 else if (VT == MVT::f32 || VT == MVT::i32)
58218 Res.second = &X86::FR32XRegClass;
58219 else if (VT == MVT::f64 || VT == MVT::i64)
58220 Res.second = &X86::FR64XRegClass;
58221 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58222 Res.second = &X86::VR128XRegClass;
58223 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58224 Res.second = &X86::VR256XRegClass;
58225 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58226 Res.second = &X86::VR512RegClass;
58227 else {
58228 // Type mismatch and not a clobber: Return an error;
58229 Res.first = 0;
58230 Res.second = nullptr;
58231 }
58232 } else if (isVKClass(*Class)) {
58233 if (VT == MVT::v1i1 || VT == MVT::i1)
58234 Res.second = &X86::VK1RegClass;
58235 else if (VT == MVT::v8i1 || VT == MVT::i8)
58236 Res.second = &X86::VK8RegClass;
58237 else if (VT == MVT::v16i1 || VT == MVT::i16)
58238 Res.second = &X86::VK16RegClass;
58239 else if (VT == MVT::v32i1 || VT == MVT::i32)
58240 Res.second = &X86::VK32RegClass;
58241 else if (VT == MVT::v64i1 || VT == MVT::i64)
58242 Res.second = &X86::VK64RegClass;
58243 else {
58244 // Type mismatch and not a clobber: Return an error;
58245 Res.first = 0;
58246 Res.second = nullptr;
58247 }
58248 }
58249
58250 return Res;
58251}
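// Illustrative note (annotation, not part of the upstream source): the
// fix-ups above are what make, e.g., "={ax},i32" resolve to EAX rather than
// AX, and an "{xmm0}" operand of type <4 x float> land in a VR128 class
// instead of whichever register class the generic matcher happened to find
// first for XMM0.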
58252
58253 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58254 // Integer division on x86 is expensive. However, when aggressively optimizing
58255 // for code size, we prefer to use a div instruction, as it is usually smaller
58256 // than the alternative sequence.
58257 // The exception to this is vector division. Since x86 doesn't have vector
58258 // integer division, leaving the division as-is is a loss even in terms of
58259 // size, because it will have to be scalarized, while the alternative code
58260 // sequence can be performed in vector form.
58261 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58262 return OptSize && !VT.isVector();
58263}
58264
58265void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58266 if (!Subtarget.is64Bit())
58267 return;
58268
58269 // Update IsSplitCSR in X86MachineFunctionInfo.
58270 X86MachineFunctionInfo *AFI =
58271 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58272 AFI->setIsSplitCSR(true);
58273}
58274
58275void X86TargetLowering::insertCopiesSplitCSR(
58276 MachineBasicBlock *Entry,
58277 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58278 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58279 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58280 if (!IStart)
58281 return;
58282
58283 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58284 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58285 MachineBasicBlock::iterator MBBI = Entry->begin();
58286 for (const MCPhysReg *I = IStart; *I; ++I) {
58287 const TargetRegisterClass *RC = nullptr;
58288 if (X86::GR64RegClass.contains(*I))
58289 RC = &X86::GR64RegClass;
58290 else
58291 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58292
58293 Register NewVR = MRI->createVirtualRegister(RC);
58294 // Create copy from CSR to a virtual register.
58295 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58296 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58297 // nounwind. If we want to generalize this later, we may need to emit
58298 // CFI pseudo-instructions.
58299 assert(
58300 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58301 "Function should be nounwind in insertCopiesSplitCSR!");
58302 Entry->addLiveIn(*I);
58303 BuildMI(*Entry, MBBI, MIMetadata(), TII->get(TargetOpcode::COPY), NewVR)
58304 .addReg(*I);
58305
58306 // Insert the copy-back instructions right before the terminator.
58307 for (auto *Exit : Exits)
58308 BuildMI(*Exit, Exit->getFirstTerminator(), MIMetadata(),
58309 TII->get(TargetOpcode::COPY), *I)
58310 .addReg(NewVR);
58311 }
58312}
58313
58314 bool X86TargetLowering::supportSwiftError() const {
58315 return Subtarget.is64Bit();
58316}
58317
58318 MachineInstr *
58319 X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
58320 MachineBasicBlock::iterator &MBBI,
58321 const TargetInstrInfo *TII) const {
58322 assert(MBBI->isCall() && MBBI->getCFIType() &&
58323 "Invalid call instruction for a KCFI check");
58324
58325 MachineFunction &MF = *MBB.getParent();
58326 // If the call target is a memory operand, unfold it and use R11 for the
58327 // call, so KCFI_CHECK won't have to recompute the address.
58328 switch (MBBI->getOpcode()) {
58329 case X86::CALL64m:
58330 case X86::CALL64m_NT:
58331 case X86::TAILJMPm64:
58332 case X86::TAILJMPm64_REX: {
58333 MachineBasicBlock::instr_iterator OrigCall = MBBI->getIterator();
58334 SmallVector<MachineInstr *, 2> NewMIs;
58335 if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true,
58336 /*UnfoldStore=*/false, NewMIs))
58337 report_fatal_error("Failed to unfold memory operand for a KCFI check");
58338 for (auto *NewMI : NewMIs)
58339 MBBI = MBB.insert(OrigCall, NewMI);
58340 assert(MBBI->isCall() &&
58341 "Unexpected instruction after memory operand unfolding");
58342 if (OrigCall->shouldUpdateCallSiteInfo())
58343 MF.moveCallSiteInfo(&*OrigCall, &*MBBI);
58344 MBBI->setCFIType(MF, OrigCall->getCFIType());
58345 OrigCall->eraseFromParent();
58346 break;
58347 }
58348 default:
58349 break;
58350 }
58351
58352 MachineOperand &Target = MBBI->getOperand(0);
58353 Register TargetReg;
58354 switch (MBBI->getOpcode()) {
58355 case X86::CALL64r:
58356 case X86::CALL64r_NT:
58357 case X86::TAILJMPr64:
58358 case X86::TAILJMPr64_REX:
58359 assert(Target.isReg() && "Unexpected target operand for an indirect call");
58360 Target.setIsRenamable(false);
58361 TargetReg = Target.getReg();
58362 break;
58363 case X86::CALL64pcrel32:
58364 case X86::TAILJMPd64:
58365 assert(Target.isSymbol() && "Unexpected target operand for a direct call");
58366 // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for
58367 // 64-bit indirect thunk calls.
58368 assert(StringRef(Target.getSymbolName()).ends_with("_r11") &&
58369 "Unexpected register for an indirect thunk call");
58370 TargetReg = X86::R11;
58371 break;
58372 default:
58373 llvm_unreachable("Unexpected CFI call opcode");
58374 break;
58375 }
58376
58377 return BuildMI(MBB, MBBI, MIMetadata(*MBBI), TII->get(X86::KCFI_CHECK))
58378 .addReg(TargetReg)
58379 .addImm(MBBI->getCFIType())
58380 .getInstr();
58381}
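// Illustrative note (annotation, not part of the upstream source): for a
// memory-operand call such as "callq *8(%rbx)" the code above first unfolds
// the load into R11 and then emits KCFI_CHECK on R11, so the later expansion
// can compare the expected type hash recorded at this call site against the
// hash stored in front of the callee's entry before the indirect call runs.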
58382
58383/// Returns true if stack probing through a function call is requested.
58384 bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58385 return !getStackProbeSymbolName(MF).empty();
58386}
58387
58388/// Returns true if stack probing through inline assembly is requested.
58389 bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58390
58391 // No inline stack probe for Windows, they have their own mechanism.
58392 if (Subtarget.isOSWindows() ||
58393 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58394 return false;
58395
58396 // If the function specifically requests inline stack probes, emit them.
58397 if (MF.getFunction().hasFnAttribute("probe-stack"))
58398 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58399 "inline-asm";
58400
58401 return false;
58402}
58403
58404/// Returns the name of the symbol used to emit stack probes or the empty
58405/// string if not applicable.
58406 StringRef
58407 X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58408 // Inline Stack probes disable stack probe call
58409 if (hasInlineStackProbe(MF))
58410 return "";
58411
58412 // If the function specifically requests stack probes, emit them.
58413 if (MF.getFunction().hasFnAttribute("probe-stack"))
58414 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58415
58416 // Generally, if we aren't on Windows, the platform ABI does not include
58417 // support for stack probes, so don't emit them.
58418 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58419 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58420 return "";
58421
58422 // We need a stack probe to conform to the Windows ABI. Choose the right
58423 // symbol.
58424 if (Subtarget.is64Bit())
58425 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58426 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58427}
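// Illustrative note (annotation, not part of the upstream source): on
// x86_64-pc-windows-msvc a function whose frame exceeds the guard page size
// therefore ends up calling __chkstk, while marking a function with
// "probe-stack"="inline-asm" makes hasInlineStackProbe() return true and
// suppresses the probe symbol entirely in favor of inline probing.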
58428
58429 unsigned
58430 X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58431 // The default stack probe size is 4096 if the function has no stackprobesize
58432 // attribute.
58433 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58434 4096);
58435}
58436
58437 Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58438 if (ML && ML->isInnermost() &&
58439 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58440 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58441 return TargetLowering::getPrefLoopAlignment(ML);
58442}
Live Register Matrix
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc)
Return true if node is an ISD::AND or ISD::OR of two M68k::SETcc nodes each of which has no other use...
static bool hasNonFlagsUse(SDValue Op)
return true if Op has a use that doesn't just read flags.
static bool isCMOVPseudo(MachineInstr &MI)
static SDValue combineCarryThroughADD(SDValue CCR)
static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG)
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
mir Rename Register Operands
unsigned const TargetRegisterInfo * TRI
#define R2(n)
#define T1
uint64_t High
LLVMContext & Context
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
PowerPC Reduce CR logical Operation
PowerPC TLS Dynamic Call Fixup
if(VerifyEach)
const char LLVMTargetMachineRef TM
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget)
static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc)
const SmallVectorImpl< MachineOperand > & Cond
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimple(Instruction *I)
unsigned OpIndex
static StringRef substr(StringRef Str, uint64_t Len)
This file implements the SmallBitVector class.
This file defines the SmallSet class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
This file describes how to lower LLVM code to machine code.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &DL, unsigned VectorWidth)
static bool is64Bit(const char *name)
#define GET_EGPR_IF_ENABLED(OPC)
static unsigned getSUBriOpcode(bool IsLP64)
static bool isNoopOrBroadcastShuffleMask(ArrayRef< int > Mask)
static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask)
static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget)
Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer t...
static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::ANDNP nodes.
static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, const SDValue &Zext1, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, SDValue X, SDValue Y, SelectionDAG &DAG, bool ZeroSecondOpOnly=false)
If this is an add or subtract where one operand is produced by a cmp+setcc, then try to convert it to...
static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, SmallVectorImpl< SDValue > &SrcOps, SmallVectorImpl< APInt > *SrcMask=nullptr)
Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) style scalarized (associative) ...
static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG)
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, SDValue &Op1, bool &IsAlwaysSignaling)
Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC)
static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If a value is a scalar FP zero or a vector FP zero (potentially including undefined elements),...
static bool matchBinaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static SDValue combineSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isGRClass(const TargetRegisterClass &RC)
Check if RC is a general purpose register class.
static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero, SmallVectorImpl< SDValue > &Ops, SmallVectorImpl< int > &Mask, bool &IsUnary)
Calculates the shuffle mask corresponding to the target-specific opcode.
static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast operation that is extracted from a vector, try to vectorize the cast op followed ...
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG)
static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable)
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode, const SDLoc &DL, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1)
This is a helper function of LowerToHorizontalOp().
static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, const SDLoc &dl, SelectionDAG &DAG)
static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, ArrayRef< int > HalfMask, int HalfIdx1, int HalfIdx2, bool UndefLower, SelectionDAG &DAG, bool UseConcat=false)
Given the output values from getHalfShuffleMask(), create a half width shuffle of extracted vectors f...
static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle vector element shifts by a splat shift amount.
@ ConstantBit
@ NotConstantBit
@ NotShiftBit
@ ShiftBit
@ UndefBit
static SDValue combineZext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, bool NSW)
Given a buildvector constant, return a new vector constant with each element incremented or decrement...
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, unsigned &NumExtracts, bool &IsSubAdd)
Returns true iff BV builds a vector with the result equivalent to the result of ADDSUB/SUBADD operati...
static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode)
static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane 32-bit floating point shuffles.
static MachineBasicBlock * emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, const TargetInstrInfo *TII)
Utility function to emit xbegin specifying the start of an RTM region.
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef< SDValue > Elts, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
Given the initializing elements 'Elts' of a vector of type 'VT', see if the elements can be replaced ...
static bool scaleShuffleElements(ArrayRef< int > Mask, unsigned NumDstElts, SmallVectorImpl< int > &ScaledMask)
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation of 'extended sign-bits' or 'extended zero-bits' values.
static APInt getExtractedDemandedElts(SDNode *N)
static SDValue combineBitOpWithPACK(SDNode *N, SelectionDAG &DAG)
static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit integer shuffles.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we are inverting an PTEST/TESTP operand, attempt to adjust the CC to avoid the inversion.
static unsigned getAltBitOpcode(unsigned Opcode)
static Constant * getConstantVector(MVT VT, ArrayRef< APInt > Bits, const APInt &Undefs, LLVMContext &C)
static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert i1-subvector to i1-vector.
static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Create a vector constant without a load.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsDecomposedShuffleMerge(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic routine to decompose a shuffle and blend into independent blends and permutes.
static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
64-bit unsigned integer to double expansion.
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT, const X86Subtarget &Subtarget)
static bool isX86CCSigned(unsigned X86CC)
Return true if the condition is an signed comparison operation.
static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a 128-bit shuffles.
static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG)
static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on SELECT and VSELECT nodes.
static bool isUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is undef or ...
static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsAfterLegalize)
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getConstVector(ArrayRef< int > Values, MVT VT, SelectionDAG &DAG, const SDLoc &dl, bool IsMask=false)
static MachineInstrBuilder createPHIsForCMOVsInSinkBB(MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, MachineBasicBlock *SinkMBB)
static SDValue combineCMP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to put 128-bits into a vector > 128 bits.
static bool onlyZeroFlagUsed(SDValue Flags)
static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 256-bits from a 512-bit vector.
static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Either split a vector in halves or decompose the shuffles and the blend/unpack.
static SDValue combineMulToPMADDWD(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsLanePermuteAndShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one source with a lane permutatio...
static bool isFoldableUseOfShuffle(SDNode *N)
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return (and Op, Mask) for compare instructions or (vselect Mask, Op, PreservedSrc) for others along w...
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg sign extension and X86ISD::PACKSS.
static SDValue combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isShuffleMaskInputInPlace(int Input, ArrayRef< int > Mask)
Test whether the specified input (0 or 1) is in-place blended by the given mask.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether elements in each LaneSizeInBits lane in this shuffle mask come from multiple lanes - thi...
static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, ISD::CondCode Cond, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
As another special case, use PSUBUS[BW] when it's profitable.
static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 128-bit lane.
static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineADC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
static bool isShuffleFoldableLoad(SDValue V)
Helper to test for a load that can be folded with x86 shuffles.
static SDValue lowerShuffleAsElementInsertion(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower insertion of a single element into a zero vector.
static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnpackWdShuffleMask(ArrayRef< int > Mask, MVT VT, const SelectionDAG &DAG)
static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into X86ISD::PACKUS/X86ISD::P...
static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle case where shuffle sources are coming from the same 128-bit lane and every lane can be represe...
static void computeKnownBitsForPSADBW(SDValue LHS, SDValue RHS, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth)
static int getSEHRegistrationNodeSize(const Function *Fn)
static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, SDValue PreservedSrc, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Creates an SDNode for a predicated scalar operation.
static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL, SelectionDAG &DAG)
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
If a BUILD_VECTOR's source elements all apply the same bit operation and one of their operands is con...
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Dispatching routine to lower various 128-bit x86 vector shuffles.
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG)
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth=0)
Returns the negated value if the node N flips sign of FP value.
static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 16-bit integer shuffles.
static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 8-bit integer shuffles.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG)
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single - truncated - integer element, coming from a scalar_to_vector/buil...
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, const SDLoc &DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI)
Emit a sequence of two 128-bit horizontal add/sub followed by a concat_vector.
static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg, unsigned char OperandFlags, bool LocalDynamic=false)
static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to fold: and (vector_shuffle<Z,...,Z> (insert_vector_elt undef, (xor X, -1), Z),...
static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a bitmask instruction for a shuffle.
static bool is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each 256-bit lane.
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, SDValue V1, SDValue V2, ArrayRef< int > Mask)
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
32-bit unsigned integer to float expansion.
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static cl::opt< int > ExperimentalPrefInnermostLoopAlignment("x86-experimental-pref-innermost-loop-alignment", cl::init(4), cl::desc("Sets the preferable loop alignment for experiments (as log2 bytes) " "for innermost loops only. If specified, this option overrides " "alignment set by x86-experimental-pref-loop-alignment."), cl::Hidden)
static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute from a vector of source v...
static SDValue getHopForBuildVector(const BuildVectorSDNode *BV, const SDLoc &DL, SelectionDAG &DAG, unsigned HOpcode, SDValue V0, SDValue V1)
static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle as a zero or any extension.
static bool needCarryOrOverflowFlag(SDValue Flags)
static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
Returns a vector of specified type with all bits set.
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefLowerHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose lower half is undefined.
static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineRedundantDWordShuffle(SDValue N, MutableArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
Search for a combinable shuffle across a chain ending in pshufd.
static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, SDValue OpMustEq, SDValue Op, unsigned Depth)
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, SDValue SrcOp, uint64_t ShiftAmt, SelectionDAG &DAG)
Handle vector element shifts where the shift amount is a constant.
static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, bool PackHiHalf=false)
Returns a node that packs the LHS + RHS nodes together at half width.
static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG)
static bool matchUnaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue V1, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT)
static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Given a scalar cast to FP with a cast to integer operand (almost an ftrunc), try to vectorize the cas...
static bool getHalfShuffleMask(ArrayRef< int > Mask, MutableArrayRef< int > HalfMask, int &HalfIdx1, int &HalfIdx2)
If the input shuffle mask results in a vector that is undefined in all upper or lower half elements a...
static cl::opt< int > BrMergingBaseCostThresh("x86-br-merging-base-cost", cl::init(2), cl::desc("Sets the cost threshold for when multiple conditionals will be merged " "into one branch versus be split in multiple branches. Merging " "conditionals saves branches at the cost of additional instructions. " "This value sets the instruction cost limit, below which conditionals " "will be merged, and above which conditionals will be split. Set to -1 " "to never merge branches."), cl::Hidden)
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, const SelectionDAG &DAG, unsigned Depth, bool ResolveKnownElts)
static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT)
static SDValue emitLockedStackOp(SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue Chain, const SDLoc &DL)
Emit a locked operation on a stack location which does not change any memory location,...
static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, unsigned &ShuffleImm, ArrayRef< int > Mask, const APInt &Zeroable)
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 8-lane 16-bit floating point shuffles.
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle using bit math.
static SDValue reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-extending masked load, it is a scalar load and ve...
static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned TargetOpcode, unsigned SrcReg, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics with chain that return their value into registers EDX:EAX.
static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef< int > Mask, const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm)
static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI)
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, unsigned EltSizeInBits, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a target shuffle mask is equivalent within each sub-lane.
static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to map a 128-bit or larger integer comparison to vector instructions before type legalization spl...
static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
This function detects the AVG pattern between vectors of unsigned i8/i16, which is c = (a + b + 1) / ...
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits, unsigned ScalarSizeInBits, ArrayRef< int > Mask)
Test whether there are elements crossing LaneSizeInBits lanes in this shuffle mask.
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, X86::CondCode &X86CC)
Result of 'and' is compared against zero.
static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsZeroOrAnyExtend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a zero extension on any microarch.
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Compute the horizontal sum of bytes in V for the elements of VT.
static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 32-lane 16-bit integer shuffles.
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG)
static void computeInLaneShuffleMask(const ArrayRef< int > &Mask, int LaneSize, SmallVector< int > &InLaneMask)
Helper to get compute inlane shuffle mask for a complete shuffle mask.
static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG)
static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, EVT MemVT, MemSDNode *Mem, unsigned Offset, SelectionDAG &DAG)
static bool isUndefUpperHalf(ArrayRef< int > Mask)
Return true if the mask creates a vector whose upper half is undefined.
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG)
Lower SRA_PARTS and friends, which return two i32 values and take a 2 x i32 value to shift plus a shi...
static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode)
static std::pair< SDValue, SDValue > getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG)
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT, const X86Subtarget &Subtarget)
static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs reference the same FP CMP,...
static bool isVKClass(const TargetRegisterClass &RC)
Check if RC is a mask register class.
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If a vector select has an operand that is -1 or 0, try to simplify the select to a bitwise logic oper...
static int canLowerByDroppingElements(ArrayRef< int > Mask, bool MatchEven, bool IsSingleInput)
Check whether a compaction lowering can be done by dropping even/odd elements and compute how many ti...
static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL)
Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower broadcast of a single element.
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void resolveTargetShuffleInputsAndMask(SmallVectorImpl< SDValue > &Inputs, SmallVectorImpl< int > &Mask)
Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 64-lane 8-bit integer shuffles.
static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to combine a shuffle into a target-specific add-sub or mul-add-sub node.
static SDValue lowerShuffleAsLanePermuteAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector shuffle crossing multiple 128-bit lanes as a lane permutation followed by a per-lane p...
static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG)
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of 8-lane i16 shuffles.
static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue In, SelectionDAG &DAG)
static bool canonicalizeShuffleMaskWithCommute(ArrayRef< int > Mask)
Helper function that returns true if the shuffle mask should be commuted to improve canonicalization.
static bool matchAsm(StringRef S, ArrayRef< const char * > Pieces)
static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getV4X86ShuffleImm8ForMask(ArrayRef< int > Mask, const SDLoc &DL, SelectionDAG &DAG)
static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG)
Change a vector store into a pair of half-size vector stores.
static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a vector to a larger size with the same scalar type, with the new elements either zero or undef...
static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static bool isUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FANDN nodes.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, TLSModel::Model model, bool is64Bit, bool isPIC)
static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, SDValue And1_L, SDValue And1_R, const SDLoc &DL, SelectionDAG &DAG)
static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, unsigned Opcode)
static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendBoolVectorInReg(unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a binary integer operation into 2 half sized ops and then concatenate the result back.
static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue createSetFPEnvNodes(SDValue Ptr, SDValue Chain, SDLoc DL, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static const char * getIndirectThunkSymbol(const X86Subtarget &Subtarget, unsigned Reg)
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static unsigned getV4X86ShuffleImm(ArrayRef< int > Mask)
Get a 4-lane 8-bit shuffle immediate for a mask.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl< int > &Mask, const APInt &KnownUndef, const APInt &KnownZero, bool ResolveKnownZeros=true)
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Insert one bit to mask vector, like v16i1 or v8i1.
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower a vector shuffle by first fixing the 128-bit lanes and then shuffling each lane.
static bool isSoftF16(T VT, const X86Subtarget &Subtarget)
static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit integer shuffles.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Detect vector gather/scatter index generation and convert it from being a bunch of shuffles and extra...
static bool isSingleSHUFPSMask(ArrayRef< int > Mask)
Test whether this can be lowered with a single SHUFPS instruction.
static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, X86::CondCode &CC1, SDValue &Flags, bool &isAnd)
Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
static bool isX86LogicalCmp(SDValue Op)
Return true if opcode is a X86 logical comparison.
static bool isAnyInRange(ArrayRef< int > Mask, int Low, int Hi)
Return true if the value of any element in Mask falls within the specified range (L,...
static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG)
static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT, SelectionDAG &DAG, unsigned Depth)
static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS=false)
Detect patterns of truncation with signed saturation: (truncate (smin ((smax (x, signed_min_of_dest_t...
const unsigned FPStateSize
static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, unsigned &UnpackOpcode, bool IsUnary, ArrayRef< int > TargetMask, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating point negations.
static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl, unsigned vectorWidth)
static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If both input operands of a logic op are being cast from floating-point types or FP compares,...
static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG, unsigned &HOpcode, SDValue &V0, SDValue &V1)
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool createShuffleMaskFromVSELECT(SmallVectorImpl< int > &Mask, SDValue Cond, bool IsBLENDV=false)
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, bool AllowTruncate)
static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to determine if In truncated to DstVT has the necessary signbits / leading zero bits to be tru...
static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Return Mask with the necessary casting or extending for Mask according to MaskVT when lowering maskin...
static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit floating point shuffles.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Horizontal vector math instructions may be slower than normal math with shuffles.
static bool isFRClass(const TargetRegisterClass &RC)
Check if RC is a vector register class.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool SimpleOnly)
Generic routine to split vector shuffle into half-sized shuffles.
static SDValue LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue IsNOT(SDValue V, SelectionDAG &DAG)
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG)
Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, const SDLoc &dl)
Return a vector logical shift node.
static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG)
static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower 4-lane i32 vector shuffles.
static SDValue combineX86ShuffleChain(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine an arbitrary chain of shuffles into a single instruction if possible.
static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements, const X86Subtarget &Subtarget, SelectionDAG &DAG, const SDLoc &dl)
Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and bitcast with integer types.
static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isInRange(int Val, int Low, int Hi)
Return true if Val falls within the specified range (L, H].
static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Try to combine x86 target specific shuffles.
static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static std::pair< SDValue, SDValue > splitVector(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG)
Helper for attempting to create a X86ISD::BT node.
static SDValue EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Truncating Store with signed or unsigned saturation.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, bool FillWithZeroes=false)
Widen a vector input to a vector of NVT.
static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG)
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG, bool ImmBlends=false)
Try to lower as a blend of elements from two inputs followed by a single-input permutation.
static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx, const APInt &Zeroable)
const unsigned X87StateSize
static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 64-bit integer shuffles.
static bool isLegalConversion(MVT VT, bool IsSigned, const X86Subtarget &Subtarget)
static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static bool isUndefOrEqual(int Val, int CmpVal)
Val is the undef sentinel value or equal to the specified value.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffle(unsigned Opcode)
static bool isSingleElementRepeatedMask(ArrayRef< int > Mask)
Check if the Mask consists of the same element repeated multiple times.
static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG)
static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG)
Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0, SDValue N1, ArrayRef< int > Mask, SelectionDAG &DAG)
If we are extracting two 128-bit halves of a vector and shuffling the result, match that to a 256-bit...
static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit floating point shuffles.
static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or 'fsubadd' operation accordingly...
static SDValue lowerV8I16GeneralSingleInputShuffle(const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 shuffle lowering,...
static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 256-bit x86 vector shuffles.
static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG)
Try to turn tests against the signbit in the form of: XOR(TRUNCATE(SRL(X, size(X)-1)),...
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit floating point shuffles.
static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, SelectionDAG &DAG)
static SDValue isUpperSubvectorUndef(SDValue V, const SDLoc &DL, SelectionDAG &DAG)
static cl::opt< int > BrMergingLikelyBias("x86-br-merging-likely-bias", cl::init(0), cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "likely, then it is likely that if the conditionals are split " "both sides will be executed, so it may be desirable to increase " "the instruction cost threshold. Set to -1 to never merge likely " "branches."), cl::Hidden)
static bool clobbersFlagRegisters(const SmallVector< StringRef, 4 > &AsmPieces)
static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG)
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp, int Idx, int ExpectedIdx)
Checks whether the vector elements referenced by two shuffle masks are equivalent.
static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to match a vector shuffle as an element rotation.
static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi)
Return true if Val is undef, zero or if its value falls within the specified range (L,...
static const Constant * getTargetConstantFromBasePtr(SDValue Ptr)
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL)
static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Original, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to emit a blend instruction for a shuffle.
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset)
static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, ArrayRef< SDValue > Ops, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Helper that combines an array of subvector ops as if they were the operands of a ISD::CONCAT_VECTORS ...
static bool isUndefOrInRange(int Val, int Low, int Hi)
Return true if Val is undef or if its value falls within the half-open range [Low, Hi).
static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, const SDLoc &DL, EVT VT)
static bool collectConcatOps(SDNode *N, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG)
static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If both arms of a vector select are concatenated vectors, split the select, and concatenate the resul...
static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue combineSBB(SDNode *N, SelectionDAG &DAG)
static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static std::pair< Value *, BitTestKind > FindSingleBitChange(Value *V)
static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG)
If we are converting a value to floating-point, try to replace scalar truncate of an extracted vector...
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef< int > Mask)
Test whether there are elements crossing 128-bit lanes in this shuffle mask.
static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 4-lane 64-bit integer shuffles.
static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG)
const unsigned FPStateSizeInBits
static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If exactly one element of the mask is set for a non-truncating masked store, it is a vector extract a...
static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode)
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue narrowExtractedVectorSelect(SDNode *Ext, const SDLoc &DL, SelectionDAG &DAG)
If we are extracting a subvector of a vector select and the select condition is composed of concatena...
static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNoopShuffleMask(ArrayRef< int > Mask)
Tiny helper function to identify a no-op mask.
static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackh operation.
static SDValue combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const SDLoc &dl, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If this is a zero/all-bits result that is bitwise-anded with a low bits mask.
static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a byte shift sequence.
static SDValue combineX86ShuffleChainWithExtract(ArrayRef< SDValue > Inputs, SDValue Root, ArrayRef< int > BaseMask, int Depth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isTargetShuffleVariableMask(unsigned Opcode)
static bool isLogicOp(unsigned Opcode)
static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool BitwiseOnly)
static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v8i16.
static bool matchBinaryShuffle(MVT MaskVT, ArrayRef< int > Mask, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary)
static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
Try to lower as an unpack of elements from two inputs followed by a single-input permutation.
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG)
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, bool IsZero, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Return a vector_shuffle of the specified vector with a zero or undef vector.
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, const SDLoc &dl, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Attempt to use the vbroadcast instruction to generate a splat value from a splat BUILD_VECTOR which u...
static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineX86ShufflesConstants(ArrayRef< SDValue > Ops, ArrayRef< int > Mask, SDValue Root, bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 16-lane 32-bit floating point shuffles.
static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG)
static bool getTargetShuffleMaskIndices(SDValue MaskNode, unsigned MaskEltSizeInBits, SmallVectorImpl< uint64_t > &RawMask, APInt &UndefElts)
static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, const X86Subtarget &Subtarget)
sext(add_nsw(x, C)) --> add(sext(x), C_sext); zext(add_nuw(x, C)) --> add(zext(x), C_zext). Promoting a...
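A minimal scalar sketch of the identity (illustration only, not the combine itself); the concrete values are arbitrary and chosen so the narrow add does not overflow, which is what the nsw/nuw flags guarantee:

#include <cassert>
#include <cstdint>

int main() {
  // sext(add_nsw(x, C)) == add(sext(x), sext(C)) when the 8-bit add cannot
  // overflow (nsw).
  int8_t X = 100, C = 20;                       // 100 + 20 fits in i8
  assert(int32_t(int8_t(X + C)) == int32_t(X) + int32_t(C));

  // zext(add_nuw(x, C)) == add(zext(x), zext(C)) when the 8-bit add cannot
  // wrap (nuw).
  uint8_t UX = 200, UC = 50;                    // 200 + 50 fits in u8
  assert(uint32_t(uint8_t(UX + UC)) == uint32_t(UX) + uint32_t(UC));
}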
static const Constant * getTargetConstantFromNode(LoadSDNode *Load)
static bool canCombineAsMaskOperation(SDValue V, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a vector shuffle as a dword/qword rotation.
static bool isProfitableToUseFlagOp(SDValue Op)
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG)
ISD::FROUND is defined to round to nearest with ties rounding away from 0.
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, const SDLoc &DL)
Detect patterns of truncation with unsigned saturation:
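For reference, "truncation with unsigned saturation" clamps the value to the destination type's unsigned maximum before narrowing. A minimal scalar sketch for an assumed i32 -> i8 case (illustrative only; the helper name is hypothetical):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative scalar version of an unsigned saturating i32 -> i8 truncate:
// clamp to the destination's unsigned max, then truncate.
static uint8_t truncUSat8(uint32_t X) {
  return static_cast<uint8_t>(std::min<uint32_t>(X, 255u));
}

int main() {
  assert(truncUSat8(42) == 42);    // in range: plain truncation
  assert(truncUSat8(300) == 255);  // out of range: saturates to UINT8_MAX
}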
static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG)
If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the low half of each source v...
static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG)
Do a one-to-one translation of a ISD::CondCode to the X86-specific condition code,...
static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, unsigned ScalarSizeInBits, ArrayRef< int > Mask, int MaskOffset, const APInt &Zeroable, const X86Subtarget &Subtarget)
Try to lower a vector shuffle as a bit shift (shifts in zeros).
static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG)
static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef< int > Mask, MVT VT, SDValue V1, SDValue V2, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
High-level routine to lower various 512-bit x86 vector shuffles.
static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL, const APInt &NonZeroMask, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v16i8.
static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, APInt &UndefElts, SmallVectorImpl< APInt > &EltBits, bool AllowWholeUndefs=true, bool AllowPartialUndefs=false)
static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, SDValue &Op1)
static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SelectionDAG &DAG, const SDLoc &dl)
Break a 256-bit integer VSETCC into two new 128-bit ones and then concatenate the result back.
static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit) followed by a 256-bit unpack.
static SDValue combineLRINT_LLRINT(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerAddSubToHorizontalOp(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Depending on uarch and/or optimizing for size, we might prefer to use a vector operation in place of ...
static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, SelectionDAG &DAG, SDValue &Addr, SDValue &Index, Align &Alignment, unsigned &Offset)
Given a masked memory load/store operation, return true if it has one mask bit set.
static SDValue reduceVMULWidth(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
When the operands of vector mul are extended from smaller size values, like i8 and i16,...
static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode)
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG)
static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, unsigned ExpectedUses)
Returns true if it is possible to fold MUL and an idiom that has already been recognized as ADDSUB/SUBAD...
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG)
static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, unsigned &LogBias, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute)
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG)
The only differences between FABS and FNEG are the mask and the logic op.
ShrinkMode
Different mul shrinking modes.
static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, const SDLoc &dl)
static SDValue canonicalizeShuffleMaskWithHorizOp(MutableArrayRef< SDValue > Ops, MutableArrayRef< int > Mask, unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void computeZeroableShuffleElements(ArrayRef< int > Mask, SDValue V1, SDValue V2, APInt &KnownUndef, APInt &KnownZero)
Compute whether each element of a shuffle is zeroable.
static SDValue EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &DL, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, SelectionDAG &DAG)
Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a VSELECT instruction to a vector shuffle.
static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, MutableArrayRef< int > Mask, const APInt &Zeroable, bool &ForceV1Zero, bool &ForceV2Zero, uint64_t &BlendMask)
static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, const SDLoc &DL)
static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
CMOV of constants requires materializing constant operands in registers.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, EVT VT, const SDLoc &DL)
static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, const SDLoc &DL, const X86Subtarget &Subtarget)
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue V1, SDValue V2)
Returns a vector_shuffle node for an unpackl operation.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG)
Try to get a scalar value for a specific element of a vector.
static unsigned getOpcodeForIndirectThunk(unsigned RPOpc)
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Generic lowering of v16i8 shuffles.
static bool isNullFPScalarOrVectorConst(SDValue V)
static bool hasIdenticalHalvesShuffleMask(ArrayRef< int > Mask)
Return true if a shuffle mask chooses elements identically in its top and bottom halves.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, unsigned &PackOpcode, ArrayRef< int > TargetMask, const SelectionDAG &DAG, const X86Subtarget &Subtarget, unsigned MaxStages=1)
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget)
static SDValue combineBITREVERSE(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Try to convert a vector reduction sequence composed of binops and shuffles into horizontal ops.
static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Lower shuffle using X86ISD::VROTLI rotations.
static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT)
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Combine: (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) to: (brcond/cmov/setcc ....
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize an EFLAGS definition used according to the condition code CC into a simpler EFLAGS value,...
static bool isBroadcastShuffleMask(ArrayRef< int > Mask)
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const X86Subtarget &Subtarget, bool IsCommutative, SmallVectorImpl< int > &PostShuffleMask)
Return 'true' if this vector operation is "horizontal" and return the operands for the horizontal ope...
static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, const SDLoc &DL)
static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extracting a scalar FP value from vector element 0 is free, so extract each operand first,...
static SDValue combineX86ShufflesRecursively(ArrayRef< SDValue > SrcOps, int SrcOpIndex, SDValue Root, ArrayRef< int > RootMask, ArrayRef< const SDNode * > SrcNodes, unsigned Depth, unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Fully generic combining of x86 shuffle instructions.
static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static StringRef getInstrStrFromOpNo(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo)
static bool isSequentialOrUndefOrZeroInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size,...
static bool canWidenShuffleElements(ArrayRef< int > Mask, SmallVectorImpl< int > &WidenedMask)
Helper function to test whether a shuffle mask could be simplified by widening the elements being shu...
static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break a unary integer operation into two half-sized ops and then concatenate the result back.
static SDValue combineSext(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 2-lane 64-bit integer shuffles.
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineLogicBlendIntoConditionalNegate(EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, SelectionDAG &DAG, unsigned Depth)
Returns the scalar element that will make up the i'th element of the result of the vector shuffle.
static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable)
static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG)
Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
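Scalar sketch of the identity: xor'ing a boolean setcc result with 1 is the same as evaluating the inverted condition. This is only an illustration of the equivalence, not the DAG code:

#include <cassert>

int main() {
  for (int a = -2; a <= 2; ++a)
    for (int b = -2; b <= 2; ++b)
      // xor(setcc(a < b), 1)  ==  setcc(a >= b), i.e. the inverted condition.
      assert((static_cast<int>(a < b) ^ 1) == static_cast<int>(a >= b));
}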
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2, unsigned &InsertPSMask, const APInt &Zeroable, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isNonZeroElementsInOrder(const APInt &Zeroable, ArrayRef< int > Mask, const EVT &VectorType, bool &IsZeroSideLeft)
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, EVT VecVT, EVT CmpVT, bool HasPT, F SToV)
Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp expansion.
static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just pre-promote its result type since...
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask)
Try to lower a vector shuffle as a byte rotation.
static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Try to lower a shuffle as a permute of the inputs followed by an UNPCK instruction.
static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, SelectionDAG &DAG)
static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static bool isAddSubOrSubAddMask(ArrayRef< int > Mask, bool &Op0Even)
Checks if the shuffle mask takes subsequent elements alternately from two vectors.
static bool isCompletePermute(ArrayRef< int > Mask)
Return true if every element of a single input is referenced by the shuffle mask.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, SDValue EntryEBP)
When the MSVC runtime transfers control to us, either to an outlined function or when returning to a ...
static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, SelectionDAG &DAG, const X86Subtarget &Subtarget, SmallVectorImpl< SDValue > &Results)
Handles the lowering of builtin intrinsics that read the time stamp counter (x86_rdtsc and x86_rdtscp...
static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, ISD::CondCode CC, const APInt &OriginalMask, const X86Subtarget &Subtarget, SelectionDAG &DAG, X86::CondCode &X86CC)
static bool is128BitUnpackShuffleMask(ArrayRef< int > Mask, const SelectionDAG &DAG)
static bool isOrXorXorTree(SDValue X, bool Root=true)
Recursive helper for combineVectorSizedSetCCEquality() to see if we have a recognizable memcmp expans...
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static ConstantPoolSDNode * getTargetConstantPoolFromBasePtr(SDValue Ptr)
static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, SelectionDAG &DAG, const SDLoc &DL)
Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
static bool isShuffleEquivalent(ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a shuffle mask is equivalent to an explicit list of arguments.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, const APInt &Zeroable, ArrayRef< int > Mask, SDValue &V1, SDValue &V2, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef< int > Mask, const APInt &Zeroable, SDValue V1, SDValue V2, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Handle lowering of 8-lane 32-bit floating point shuffles.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerShuffleAsByteRotateAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then permuting the elements of th...
static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTPOP(SDValue Op, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, const X86Subtarget &Subtarget, SelectionDAG &DAG, bool ZeroUppers)
static void createPackShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Unary, unsigned NumStages=1)
Create a shuffle mask that matches the PACKSS/PACKUS truncation.
static bool isUndefOrEqualInRange(ArrayRef< int > Mask, int CmpVal, unsigned Pos, unsigned Size)
Return true if every element in Mask, beginning from position Pos and ending in Pos+Size is the undef...
static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Do target-specific dag combines on floating-point adds/subs.
static SDValue LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT)
static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl)
Break an operation into two half-sized ops and then concatenate the results.
static cl::opt< bool > MulConstantOptimization("mul-constant-optimization", cl::init(true), cl::desc("Replace 'mul x, Const' with more effective instructions like " "SHIFT, LEA, etc."), cl::Hidden)
static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld)
static bool isAnyZero(ArrayRef< int > Mask)
Return true if the value of any element in Mask is the zero sentinel value.
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl< int > &Mask, APInt &KnownUndef, APInt &KnownZero)
static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, SDValue Index, SDValue Base, SDValue Scale, SelectionDAG &DAG)
static SmallVector< int, 4 > getPSHUFShuffleMask(SDValue N)
Get the PSHUF-style mask from PSHUF node.
static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, SelectionDAG &DAG)
Scalarize a vector store, bitcasting to TargetVT to determine the scalar type.
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineShuffleToFMAddSub(SDNode *N, const SDLoc &DL, const X86Subtarget &Subtarget, SelectionDAG &DAG)
Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Src, SDValue Mask, SDValue Base, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget)
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, SelectionDAG &DAG)
static bool isUndefOrZero(int Val)
Val is either the undef or zero sentinel value.
static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
If this is a dynamic select (non-constant condition) and we can match this node with one of the varia...
SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops, F Builder, bool CheckBWI=true)
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL].
static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, MachineBasicBlock *BB)
static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, const SDLoc &dl)
Generate a DAG to grab 128 bits from a vector wider than 128 bits.
static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget, SDValue &X86CC)
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, ArrayRef< int > Mask, SDValue V1, SDValue V2, SelectionDAG &DAG)
Lower a vector shuffle using the SHUFPS instruction.
static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static LLVM_ATTRIBUTE_UNUSED bool isHorizOp(unsigned Opcode)
static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Lower a vector CTLZ using the natively supported vector CTLZ instruction.
static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Extract one bit from a mask vector, like v16i1 or v8i1.
static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, MVT VT, bool IsSigned, const X86Subtarget &Subtarget, SelectionDAG &DAG, SDValue *Low=nullptr)
static SDValue lowerShuffleAsBlendOfPSHUFBs(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse)
Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the blend if only one input i...
static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, ArrayRef< int > Mask, uint64_t &BitLen, uint64_t &BitIdx)
static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS, SDValue Mask, SelectionDAG &DAG)
static bool isSequentialOrUndefInRange(ArrayRef< int > Mask, unsigned Pos, unsigned Size, int Low, int Step=1)
Return true if every element in Mask, beginning from position Pos and ending in Pos + Size,...
static cl::opt< int > BrMergingUnlikelyBias("x86-br-merging-unlikely-bias", cl::init(-1), cl::desc("Decreases 'x86-br-merging-base-cost' in cases that it is unlikely " "that all conditionals will be executed. For example for merging " "the conditionals (a == b && c > d), if its known that a == b is " "unlikely, then it is unlikely that if the conditionals are split " "both sides will be executed, so it may be desirable to decrease " "the instruction cost threshold. Set to -1 to never merge unlikely " "branches."), cl::Hidden)
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl< int > &Mask, SmallVectorImpl< SDValue > &Ops, APInt &KnownUndef, APInt &KnownZero)
Decode a target shuffle mask and inputs and see if any values are known to be undef or zero from thei...
static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Custom lower build_vector of v4i32 or v4f32.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef< int > Mask, ArrayRef< int > ExpectedMask, const SelectionDAG &DAG, SDValue V1=SDValue(), SDValue V2=SDValue())
Checks whether a target shuffle mask is equivalent to an explicit pattern.
static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG)
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG)
static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG)
Fold "masked merge" expressions like (m & x) | (~m & y) into the equivalent ((x ^ y) & m) ^ y pattern...
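The two forms compute the same bitwise select; a minimal scalar check of the identity (illustration only; m, x and y are arbitrary values):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t m = 0x0F0F0F0Fu, x = 0xDEADBEEFu, y = 0x12345678u;
  // "Masked merge": take bits of x where m is set, bits of y elsewhere.
  uint32_t byOrAnd = (m & x) | (~m & y);
  // Equivalent xor/and/xor form (uses m once and needs no NOT of the mask).
  uint32_t byXorAnd = ((x ^ y) & m) ^ y;
  assert(byOrAnd == byXorAnd);
}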
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1)
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, bool Is64Bit, bool Is64BitLP64)
static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG)
Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
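Since xor with -1 is a bitwise NOT, the whole expression is the and-not that a single ANDNP/ANDN instruction computes; a tiny scalar check of the equivalence (illustrative, arbitrary values):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t x = 0xA5A5A5A5u, y = 0x0FF00FF0u;
  // (and (xor X, -1), Y) is just ~X & Y.
  assert(((x ^ 0xFFFFFFFFu) & y) == (~x & y));
}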
static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI)
static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, SelectionDAG &DAG, const X86Subtarget &Subtarget)
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx)
For an EXTRACT_VECTOR_ELT with a constant index return the real underlying vector and index.
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget)
static bool isUnaryOp(unsigned Opcode)
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT, ArrayRef< int > Mask, SmallVectorImpl< int > &RepeatedMask)
Test whether a shuffle mask is equivalent within each sub-lane.
static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget)
Optimize branch condition evaluation.
static bool hasFPCMov(unsigned X86CC)
Is there a floating point cmov for the specific X86 condition code? Current x86 isa includes the foll...
static int getOneTrueElt(SDValue V)
If V is a build vector of boolean constants and exactly one of those constants is true,...
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef< int > Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG)
static constexpr int Concat[]
Value * RHS
Value * LHS
auto IsFreeTruncation
static const unsigned FramePtr
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
static APFloat getAllOnesValue(const fltSemantics &Semantics)
Returns a float which is bitcast from an all-ones integer value.
Definition: APFloat.cpp:5221
void clearSign()
Definition: APFloat.h:1159
opStatus next(bool nextDown)
Definition: APFloat.h:1115
void changeSign()
Definition: APFloat.h:1158
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1385
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:207
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:401
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1491
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1620
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition: APInt.h:1364
uint64_t extractBitsAsZExtValue(unsigned numBits, unsigned bitPosition) const
Definition: APInt.cpp:489
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition: APInt.h:1463
APInt trunc(unsigned width) const
Truncate to new width.
Definition: APInt.cpp:906
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition: APInt.h:184
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt abs() const
Get the absolute value.
Definition: APInt.h:1737
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:349
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:358
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
void setSignBit()
Set the sign bit to 1.
Definition: APInt.h:1318
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition: APInt.h:1089
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:187
bool isMinValue() const
Determine if this is the smallest unsigned value.
Definition: APInt.h:395
static APInt getMinValue(unsigned numBits)
Gets minimum unsigned value of APInt for a specific bit width.
Definition: APInt.h:194
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:307
bool intersects(const APInt &RHS) const
This operation tests if there are any pairs of corresponding bits between this APInt and RHS that are...
Definition: APInt.h:1227
bool eq(const APInt &RHS) const
Equality comparison.
Definition: APInt.h:1057
int32_t exactLogBase2() const
Definition: APInt.h:1725
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1375
void ashrInPlace(unsigned ShiftAmt)
Arithmetic right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:812
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition: APInt.h:413
unsigned getNumSignBits() const
Computes the number of leading bits of this APInt that are equal to its sign bit.
Definition: APInt.h:1578
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition: APInt.h:1548
static APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition: APInt.cpp:620
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:197
unsigned getSignificantBits() const
Get the minimum bit size for this signed APInt.
Definition: APInt.h:1482
void flipAllBits()
Toggle every bit to its opposite value.
Definition: APInt.h:1405
unsigned countl_one() const
Count the number of leading one bits.
Definition: APInt.h:1565
void insertBits(const APInt &SubBits, unsigned bitPosition)
Insert the bits from a smaller APInt starting at bitPosition.
Definition: APInt.cpp:368
void clearLowBits(unsigned loBits)
Set bottom loBits bits to 0.
Definition: APInt.h:1395
unsigned logBase2() const
Definition: APInt.h:1703
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1297
bool getBoolValue() const
Convert APInt to a boolean value.
Definition: APInt.h:449
bool isMask(unsigned numBits) const
Definition: APInt.h:466
bool isMaxSignedValue() const
Determine if this is the largest signed value.
Definition: APInt.h:383
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:312
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition: APInt.h:1128
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1345
APInt shl(unsigned shiftAmt) const
Left-shift function.
Definition: APInt.h:851
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1235
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:284
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition: APInt.h:319
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition: APInt.h:1367
APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition: APInt.cpp:453
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition: APInt.h:410
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:367
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:217
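A small usage sketch of the APInt bit-pattern factories listed above; the expected values in the comments follow from the documented bit positions, and only the llvm/ADT/APInt.h header is assumed:

#include "llvm/ADT/APInt.h"
#include <cassert>

using llvm::APInt;

int main() {
  assert(APInt::getAllOnes(8).getZExtValue() == 0xFFu);           // all bits set
  assert(APInt::getSignMask(8).getZExtValue() == 0x80u);          // only the sign bit
  assert(APInt::getLowBitsSet(16, 4).getZExtValue() == 0x000Fu);  // bottom 4 bits
  assert(APInt::getHighBitsSet(16, 4).getZExtValue() == 0xF000u); // top 4 bits
  assert(APInt::getBitsSet(8, 2, 5).getZExtValue() == 0x1Cu);     // bits [2, 5)
  assert(APInt::getOneBitSet(8, 3).getZExtValue() == 0x08u);      // single bit 3
}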
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition: APInt.h:836
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition: APInt.h:829
unsigned countr_one() const
Count the number of trailing one bits.
Definition: APInt.h:1606
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
bool isMaxValue() const
Determine if this is the largest unsigned value.
Definition: APInt.h:377
APInt truncSSat(unsigned width) const
Truncate to new width with signed saturation.
Definition: APInt.cpp:942
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
Value * getPointerOperand()
Definition: Instructions.h:910
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
bool none() const
none - Returns true if none of the bits are set.
Definition: BitVector.h:188
The address of a basic block.
Definition: Constants.h:889
A "pseudo-class" with methods for operating on BUILD_VECTORs.
bool getRepeatedSequence(const APInt &DemandedElts, SmallVectorImpl< SDValue > &Sequence, BitVector *UndefElements=nullptr) const
Find the shortest repeating sequence of values in the build vector.
SDValue getSplatValue(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted value or a null value if this is not a splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
Value * getCalledOperand() const
Definition: InstrTypes.h:1735
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static Constant * get(LLVMContext &Context, ArrayRef< uint8_t > Elts)
get() constructors - Return a constant with vector type with an element count and element type matchi...
Definition: Constants.cpp:2897
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static bool isValueValidForType(Type *Ty, uint64_t V)
This static method returns true if the type Ty is big enough to represent the value V.
Definition: Constants.cpp:1588
bool isMachineConstantPoolEntry() const
const Constant * getConstVal() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
Definition: Constants.cpp:400
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Constant * getAggregateElement(unsigned Elt) const
For aggregates (struct/array/vector) return the constant that corresponds to the specified element if...
Definition: Constants.cpp:432
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
iterator begin()
Definition: DenseMap.h:75
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
Tagged union holding either a T or a Error.
Definition: Error.h:474
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
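A short usage sketch for the two type factories documented above, ArrayType::get and FixedVectorType::get; the element types and counts are arbitrary illustration choices:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"

int main() {
  llvm::LLVMContext Ctx;
  // [16 x i8] -- the primary way to construct an ArrayType.
  llvm::ArrayType *AT = llvm::ArrayType::get(llvm::Type::getInt8Ty(Ctx), 16);
  // <4 x i32> -- a fixed-width SIMD vector type.
  llvm::FixedVectorType *VT =
      llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
  (void)AT;
  (void)VT;
}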
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type::subtype_iterator param_iterator
Definition: DerivedTypes.h:126
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:685
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:703
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:715
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:682
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition: Function.h:264
bool hasPersonalityFn() const
Check whether this function has a personality function.
Definition: Function.h:855
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1909
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:677
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
const GlobalValue * getGlobal() const
static StringRef dropLLVMManglingEscape(StringRef Name)
If the given string begins with the GlobalValue name mangling escape character '\1',...
Definition: GlobalValue.h:567
bool isAbsoluteSymbolRef() const
Returns whether this is a reference to an absolute symbol.
Definition: Globals.cpp:380
ThreadLocalMode getThreadLocalMode() const
Definition: GlobalValue.h:271
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
This instruction compares its operands according to the predicate given to the constructor.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
std::vector< ConstraintInfo > ConstraintInfoVector
Definition: InlineAsm.h:121
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:83
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
Definition: Instruction.h:149
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
Class to represent integer types.
Definition: DerivedTypes.h:40
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
Definition: DerivedTypes.h:72
static bool LowerToByteSwap(CallInst *CI)
Try to replace a call instruction with a call to a bswap intrinsic.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
bool usesWindowsCFI() const
Definition: MCAsmInfo.h:799
MCSymbol * getOrCreateParentFrameOffsetSymbol(const Twine &FuncName)
Definition: MCContext.cpp:220
MCSymbol * getOrCreateLSDASymbol(const Twine &FuncName)
Definition: MCContext.cpp:225
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
Set of metadata that should be preserved when using BuildMI().
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
bool is32BitVector() const
Return true if this is a 32-bit vector type.
MVT changeTypeToInteger()
Return the type converted to an equivalently sized integer or vector with integer element type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
bool is512BitVector() const
Return true if this is a 512-bit vector type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool bitsGT(MVT VT) const
Return true if this has more bits than VT.
bool is256BitVector() const
Return true if this is a 256-bit vector type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
MVT getDoubleNumVectorElementsVT() const
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
MVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
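As an illustration of how these MVT helpers compose, the sketch below widens the integer elements of a fixed vector type while keeping the lane count; the transformation itself is an example, not code from this file:

  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;

  // e.g. v4i16 -> v4i32: double the element width, keep the element count.
  static MVT widenIntElements(MVT VT) {
    MVT EltVT = MVT::getIntegerVT(unsigned(VT.getScalarSizeInBits()) * 2);
    return MVT::getVectorVT(EltVT, VT.getVectorNumElements());
  }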
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
void push_back(MachineInstr *MI)
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
bool isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
unsigned succ_size() const
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
Instructions::iterator instr_iterator
succ_reverse_iterator succ_rbegin()
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator insertAfter(iterator I, MachineInstr *MI)
Insert MI into the instruction list after I.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
succ_reverse_iterator succ_rend()
void setMachineBlockAddressTaken()
Set this block to indicate that its address is used as something other than the target of a terminato...
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
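A minimal sketch of creating a statically sized stack slot through MachineFrameInfo; the 16-byte size and alignment are illustrative assumptions:

  #include "llvm/CodeGen/MachineFrameInfo.h"
  #include "llvm/CodeGen/MachineFunction.h"
  using namespace llvm;

  // Returns the frame index of a fresh 16-byte, 16-byte-aligned slot.
  static int createScratchSlot(MachineFunction &MF) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    return MFI.CreateStackObject(/*Size=*/16, Align(16), /*isSpillSlot=*/false);
  }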
void setFrameAddressIsTaken(bool T)
void setReturnAddressIsTaken(bool s)
void setHasCopyImplyingStackAdjustment(bool B)
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
const WinEHFuncInfo * getWinEHFuncInfo() const
getWinEHFuncInfo - Return information about how the current function uses Windows exception handling.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
MachineModuleInfo & getMMI() const
bool shouldSplitStack() const
Should we be emitting segmented stack stuff for the function.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
void moveCallSiteInfo(const MachineInstr *Old, const MachineInstr *New)
Move the call site info from Old to New.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
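The custom-inserter paths in this file rely on allocating and linking new machine basic blocks with these helpers. A minimal, generic sketch of that pattern (not the exact control-flow surgery performed here):

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include <iterator>
  using namespace llvm;

  // Create a block after Pred, insert it into the function, and make it a successor.
  static MachineBasicBlock *appendBlockAfter(MachineFunction &MF,
                                             MachineBasicBlock *Pred) {
    MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(Pred->getBasicBlock());
    MF.insert(std::next(Pred->getIterator()), NewMBB);
    Pred->addSuccessor(NewMBB);
    return NewMBB;
  }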
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDisp(const MachineOperand &Disp, int64_t off, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
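These builder methods are normally chained off BuildMI(). A minimal sketch of the chaining style; the instruction description, register, and immediate are placeholders:

  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  using namespace llvm;

  // Emit Desc at I, defining Dst and taking one immediate operand.
  static void emitWithImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                          const DebugLoc &DL, const MCInstrDesc &Desc,
                          Register Dst, int64_t Imm) {
    BuildMI(MBB, I, DL, Desc, Dst) // Dst is added as a register def operand.
        .addImm(Imm);              // Append the immediate operand.
  }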
Representation of each machine instruction.
Definition: MachineInstr.h:69
bool killsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr kills the specified register.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_LabelDifference32
EK_LabelDifference32 - Each entry is the address of the block minus the address of the jump table.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
const Value * getValue() const
Return the base address of the memory access.
const MCContext & getContext() const
const Module * getModule() const
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
This class is used to represent an MGATHER node.
This is a base class used to represent MGATHER and MSCATTER nodes.
This class is used to represent an MLOAD node.
This base class is used to represent MLOAD and MSTORE nodes.
const SDValue & getMask() const
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
This class is used to represent an MSTORE node.
bool isCompressingStore() const
Returns true if the op does a compression to the vector before storing.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID for this memory operation.
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
bool readMem() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isNonTemporal() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition: Module.cpp:333
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
MVT getSimpleValueType(unsigned ResNo) const
Return the type of a specified result as a simple type.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
static bool areOnlyUsersOf(ArrayRef< const SDNode * > Nodes, const SDNode *N)
Return true if all the users of N are contained in Nodes.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node is an UNDEF node.
bool hasNUsesOfValue(unsigned NUses, unsigned Value) const
Return true if there are exactly NUSES uses of the indicated value.
void setFlags(SDNodeFlags NewFlags)
op_iterator op_end() const
op_iterator op_begin() const
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
unsigned getNumOperands() const
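A minimal sketch of the kind of pattern matching these SDValue accessors enable; the specific pattern matched is an arbitrary example:

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // Matches (add x, constant).
  static bool isAddWithConstantRHS(SDValue V) {
    return V.getOpcode() == ISD::ADD && V.getNumOperands() == 2 &&
           isa<ConstantSDNode>(V.getOperand(1));
  }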
Help to insert SDNodeFlags automatically in transforming.
Definition: SelectionDAG.h:361
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
bool willNotOverflowAdd(bool IsSigned, SDValue N0, SDValue N1) const
Determine if the result of the addition of 2 nodes can never overflow.
static unsigned getOpcode_EXTEND_VECTOR_INREG(unsigned Opcode)
Convert *_EXTEND to *_EXTEND_VECTOR_INREG opcode.
Definition: SelectionDAG.h:924
SDValue getShiftAmountOperand(EVT LHSTy, SDValue Op)
Return the specified value casted to the target's desired shift amount type.
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
SDValue getSplatSourceVector(SDValue V, int &SplatIndex)
If V is a splatted value, return the source vector and its splat index.
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:474
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offs=0, bool isT=false, unsigned TargetFlags=0)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
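A minimal sketch combining getSetCC with getSelect to build a max-style select; using getSetCCResultType for the comparison type follows the usual convention, but this is illustrative rather than a lowering taken from this file:

  #include "llvm/CodeGen/SelectionDAG.h"
  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  // Returns the signed maximum of A and B as a setcc feeding a select.
  static SDValue emitSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    EVT VT = A.getValueType();
    EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    SDValue Cmp = DAG.getSetCC(DL, CCVT, A, B, ISD::SETGT);
    return DAG.getSelect(DL, VT, Cmp, A, B);
  }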
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
bool isEqualTo(SDValue A, SDValue B) const
Test whether two SDValues are known to compare equal.
static constexpr unsigned MaxRecursionDepth
Definition: SelectionDAG.h:448
SDValue expandVACopy(SDNode *Node)
Expand the specified ISD::VACOPY node as the Legalize pass would.
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:732
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
SDValue simplifySelect(SDValue Cond, SDValue TVal, SDValue FVal)
Try to simplify a select/vselect into 1 of its operands or a constant.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue expandVAArg(SDNode *Node)
Expand the specified ISD::VAARG node as the Legalize pass would.
bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:659
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getCommutedVectorShuffle(const ShuffleVectorSDNode &SV)
Returns an ISD::VECTOR_SHUFFLE node semantically equivalent to the shuffle node in input but with swa...
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
const APInt * getValidShiftAmountConstant(SDValue V, const APInt &DemandedElts) const
If a SHL/SRA/SRL node V has a constant or splat constant shift amount that is less than the element b...
bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, bool PoisonOnly=false, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
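A minimal sketch pairing getLoad (listed above) with getStore to move a value through memory, threading the chain through the load's chain result; the empty MachinePointerInfo is an assumption made for illustration:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Load VT from SrcPtr and store it to DstPtr; returns the store's chain.
  static SDValue copyThroughMemory(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                                   SDValue Chain, SDValue SrcPtr, SDValue DstPtr) {
    SDValue Val = DAG.getLoad(VT, DL, Chain, SrcPtr, MachinePointerInfo());
    return DAG.getStore(Val.getValue(1), DL, Val, DstPtr, MachinePointerInfo());
  }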
MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
std::pair< SDValue, SDValue > getStrictFPExtendOrRound(SDValue Op, SDValue Chain, const SDLoc &DL, EVT VT)
Convert Op, which must be a STRICT operation of float type, to the float type VT, by either extending...
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
bool isKnownNeverZeroFloat(SDValue Op) const
Test whether the given floating point SDValue is known to never be positive or negative zero.
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
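A minimal sketch of the basic node-building idiom that getNode and getConstant describe; lowering Op to Op + 1 is purely illustrative:

  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // Builds (add Op, 1) in Op's value type.
  static SDValue addOne(SelectionDAG &DAG, SDValue Op) {
    SDLoc DL(Op);
    EVT VT = Op.getValueType();
    return DAG.getNode(ISD::ADD, DL, VT, Op, DAG.getConstant(1, DL, VT));
  }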
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool MaskedVectorIsZero(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
Return true if 'Op' is known to be zero in DemandedElts.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:768
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDValue > Ops)
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
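A minimal sketch of a known-bits query built on MaskedValueIsZero; the 8-bit mask is an arbitrary example:

  #include "llvm/ADT/APInt.h"
  #include "llvm/CodeGen/SelectionDAG.h"
  using namespace llvm;

  // True if the low 8 bits of Op are provably zero.
  static bool lowByteIsZero(const SelectionDAG &DAG, SDValue Op) {
    unsigned BitWidth = Op.getScalarValueSizeInBits();
    if (BitWidth < 8)
      return false;
    return DAG.MaskedValueIsZero(Op, APInt::getLowBitsSet(BitWidth, 8));
  }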
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getMCSymbol(MCSymbol *Sym, EVT VT)
SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:739
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
static unsigned getOpcode_EXTEND(unsigned Opcode)
Convert *_EXTEND_VECTOR_INREG to *_EXTEND opcode.
Definition: SelectionDAG.h:908
SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp, ArrayRef< ISD::NodeType > CandidateBinOps, bool AllowPartials=false)
Match a binop + shuffle pyramid that represents a horizontal reduction over the elements of a vector ...
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
static bool isBitRotateMask(ArrayRef< int > Mask, unsigned EltSizeInBits, unsigned MinSubElts, unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt)
Checks if the shuffle is a bit rotation of the first operand across multiple subelements,...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static void commuteMask(MutableArrayRef< int > Mask)
Change values in a shuffle permute mask assuming the two vector operands have swapped position.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
size_type count() const
Returns the number of bits which are set.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
iterator erase(const_iterator CI)
Definition: SmallVector.h:750
typename SuperClass::const_iterator const_iterator
Definition: SmallVector.h:591
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
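A minimal sketch of the SmallVector operations listed above; the inline capacity of 8 is chosen arbitrarily:

  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  // Collect 0..N-1; spills to the heap only if N exceeds the inline capacity.
  static SmallVector<int, 8> firstN(int N) {
    SmallVector<int, 8> V;
    for (int I = 0; I < N; ++I)
      V.push_back(I);
    return V;
  }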
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition: StringRef.h:563
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
size_t size_type
Definition: StringRef.h:56
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
static constexpr size_t npos
Definition: StringRef.h:52
bool equals_insensitive(StringRef RHS) const
Check for string equality, ignoring case.
Definition: StringRef.h:170
size_t find_first_not_of(char C, size_t From=0) const
Find the first character in the string that is not C or npos if not found.
Definition: StringRef.cpp:251
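A minimal sketch using the StringRef queries above to recognize a register-like name; the "ymm" prefix is just an example:

  #include "llvm/ADT/StringRef.h"
  using namespace llvm;

  // Accepts names of the form "ymm<digits>".
  static bool looksLikeYMMName(StringRef Name) {
    return Name.starts_with("ymm") && Name.size() > 3 &&
           Name.substr(3).find_first_not_of("0123456789") == StringRef::npos;
  }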
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
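A minimal sketch of the StringSwitch pattern; the mapping itself is illustrative:

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  // Map a textual condition-code name onto a small integer, -1 if unknown.
  static int parseCondName(StringRef Name) {
    return StringSwitch<int>(Name)
        .Case("eq", 0)
        .Case("ne", 1)
        .Default(-1);
  }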
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
Information about stack frame layout on the target.
virtual bool hasFP(const MachineFunction &MF) const =0
hasFP - Return true if the specified function should have a dedicated frame pointer register.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
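These configuration hooks are protected members of TargetLoweringBase, so they are invoked from a target's lowering constructor. A minimal sketch for a hypothetical target; the opcode/type/action pairs are illustrative, not the choices X86 actually makes:

  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  class MyTargetLowering : public TargetLowering {
  public:
    explicit MyTargetLowering(const TargetMachine &TM) : TargetLowering(TM) {
      // Expand 64-bit divisions into a library call / longer instruction sequence.
      setOperationAction(ISD::SDIV, MVT::i64, Expand);
      // Route v4i32 shuffles through target-specific custom lowering.
      setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
    }
  };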
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC)
Set the CallingConv that should be used for the specified libcall.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp convert the backend supports.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
virtual bool isCommutativeBinOp(unsigned Opcode) const
Returns true if the opcode is a commutative binary operation.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
bool isOperationCustom(unsigned Op, EVT VT) const
Return true if the operation uses custom lowering, regardless of whether the type is legal or not.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const
Return true if it is profitable to fold a pair of shifts into a mask.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
BooleanContent getBooleanContents(bool isVec, bool isFloat) const
For targets without i1 registers, this gives the nature of the high-bits of boolean values held in ty...
virtual MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const
Returns preferred type for switch condition.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
BooleanContent
Enum that describes how the target represents true/false values.
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
AndOrSETCCFoldKind
Enum of different potentially desirable ways to fold (and/or (setcc ...), (setcc ....
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
NegatibleCost
Enum that specifies when a float negation is beneficial.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual bool shouldConvertPhiType(Type *From, Type *To) const
Given a set in interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
static ISD::NodeType getExtendForContent(BooleanContent Content)
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const
Expands target specific indirect branch for the case of JumpTable expansion.
SDValue getCheaperNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, unsigned Depth=0) const
This is the helper function to return the newly negated expression only when the cost is cheaper.
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue SimplifyMultipleUseDemandedVectorElts(SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
Helper wrapper around SimplifyMultipleUseDemandedBits, demanding all bits from only some vector eleme...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual const char * LowerXConstraint(EVT ConstraintVT) const
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const
virtual bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth=0) const
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const
Get a pointer to vector element Idx located in memory for a vector of type VecVT starting at a base a...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
bool useTLSDESC() const
Returns true if this target uses TLS Descriptors.
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
unsigned NoSignedZerosFPMath
NoSignedZerosFPMath - This flag is enabled when the -enable-no-signed-zeros-fp-math is specified on t...
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetInstrInfo * getInstrInfo() const
Target - Wrapper for Target specific information.
bool isOSMSVCRT() const
Is this a "Windows" OS targeting a "MSVCRT.dll" environment.
Definition: Triple.h:662
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:553
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
Type * getArrayElementType() const
Definition: Type.h:404
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
uint64_t getArrayNumElements() const
bool isX86_MMXTy() const
Return true if this is X86 MMX.
Definition: Type.h:201
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
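A minimal sketch of the IR Type queries listed above; the particular predicate is arbitrary:

  #include "llvm/IR/Type.h"
  using namespace llvm;

  // True for a scalar 'float' or 'double', but not for vectors of them.
  static bool isScalarF32OrF64(Type *Ty) {
    return !Ty->isVectorTy() && (Ty->isFloatTy() || Ty->isDoubleTy());
  }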
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
use_iterator use_begin()
Definition: Value.h:360
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Definition: Type.cpp:683
Type * getElementType() const
Definition: DerivedTypes.h:436
bool has128ByteRedZone(const MachineFunction &MF) const
Return true if the function has a redzone (accessible bytes past the frame of the top of stack functi...
bool Uses64BitFramePtr
True if the 64-bit frame or stack pointer should be used.
unsigned getGlobalBaseReg(MachineFunction *MF) const
getGlobalBaseReg - Return a virtual register initialized with the global base register value.
X86MachineFunctionInfo - This class is derived from MachineFunction and contains private X86 target-s...
ArrayRef< size_t > getPreallocatedArgOffsets(const size_t Id)
void setRestoreBasePointer(const MachineFunction *MF)
size_t getPreallocatedStackSize(const size_t Id)
unsigned getPtrSizedFrameRegister(const MachineFunction &MF) const
bool hasBasePointer(const MachineFunction &MF) const
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getDarwinTLSCallPreservedMask() const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
Register getStackRegister() const
unsigned getSlotSize() const
Register getBaseRegister() const
const uint32_t * getNoPreservedMask() const override
bool canExtendTo512BW() const
Definition: X86Subtarget.h:250
bool hasAnyFMA() const
Definition: X86Subtarget.h:213
bool isOSWindows() const
Definition: X86Subtarget.h:336
bool isTargetMachO() const
Definition: X86Subtarget.h:302
bool useIndirectThunkBranches() const
Definition: X86Subtarget.h:235
bool hasSSE1() const
Definition: X86Subtarget.h:200
bool hasThreeDNow() const
Definition: X86Subtarget.h:211
bool isPICStyleGOT() const
Definition: X86Subtarget.h:342
bool hasSSE42() const
Definition: X86Subtarget.h:205
const X86TargetLowering * getTargetLowering() const override
Definition: X86Subtarget.h:125
bool hasMFence() const
Use mfence if we have SSE2 or we're on x86-64 (even if we asked for no-sse2).
Definition: X86Subtarget.h:290
bool canUseCMOV() const
Definition: X86Subtarget.h:199
bool isPICStyleStubPIC() const
Definition: X86Subtarget.h:345
bool isTargetWindowsMSVC() const
Definition: X86Subtarget.h:314
bool canUseCMPXCHG8B() const
Definition: X86Subtarget.h:192
bool isTargetDarwin() const
Definition: X86Subtarget.h:294
bool isTargetWin64() const
Definition: X86Subtarget.h:338
bool isTarget64BitLP64() const
Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
Definition: X86Subtarget.h:185
const Triple & getTargetTriple() const
Definition: X86Subtarget.h:292
const X86InstrInfo * getInstrInfo() const override
Definition: X86Subtarget.h:129
bool useAVX512Regs() const
Definition: X86Subtarget.h:267
bool hasSSE3() const
Definition: X86Subtarget.h:202
bool isCallingConvWin64(CallingConv::ID CC) const
Definition: X86Subtarget.h:351
bool hasAVX512() const
Definition: X86Subtarget.h:208
bool canExtendTo512DQ() const
Definition: X86Subtarget.h:246
bool hasSSE41() const
Definition: X86Subtarget.h:204
bool hasMMX() const
Definition: X86Subtarget.h:210
bool isTargetELF() const
Definition: X86Subtarget.h:300
bool hasSSEPrefetch() const
Definition: X86Subtarget.h:221
bool canUseCMPXCHG16B() const
Definition: X86Subtarget.h:193
unsigned char classifyGlobalReference(const GlobalValue *GV, const Module &M) const
bool hasSSE2() const
Definition: X86Subtarget.h:201
bool hasSSSE3() const
Definition: X86Subtarget.h:203
bool hasInt256() const
Definition: X86Subtarget.h:209
bool isPICStyleRIPRel() const
Definition: X86Subtarget.h:343
bool isTargetCygMing() const
Definition: X86Subtarget.h:334
unsigned char classifyLocalReference(const GlobalValue *GV) const
Classify a global variable reference for the current subtarget according to how we should reference i...
unsigned char classifyBlockAddressReference() const
Classify a blockaddress reference for the current subtarget according to how we should reference it i...
bool isTargetPS() const
Definition: X86Subtarget.h:298
const X86RegisterInfo * getRegisterInfo() const override
Definition: X86Subtarget.h:139
bool hasAVX() const
Definition: X86Subtarget.h:206
bool isTargetWindowsGNU() const
Definition: X86Subtarget.h:326
unsigned getPreferVectorWidth() const
Definition: X86Subtarget.h:239
bool isTargetWindowsItanium() const
Definition: X86Subtarget.h:330
bool isTargetNaCl64() const
Definition: X86Subtarget.h:310
const X86FrameLowering * getFrameLowering() const override
Definition: X86Subtarget.h:131
bool useBWIRegs() const
Definition: X86Subtarget.h:276
unsigned char classifyGlobalFunctionReference(const GlobalValue *GV, const Module &M) const
Classify a global function reference for the current subtarget.
bool hasAVX2() const
Definition: X86Subtarget.h:207
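A hedged sketch of how the X86Subtarget feature predicates above are typically used to gate a lowering decision; the helper and the policy it encodes are illustrative, not taken from this file.
#include "X86Subtarget.h"

static bool preferWideVectorLowering(const llvm::X86Subtarget &ST) {
  // Hypothetical gate: require 512-bit register usage plus BWI for
  // byte/word element support.
  return ST.useAVX512Regs() && ST.useBWIRegs();
}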
bool shouldFormOverflowOp(unsigned Opcode, EVT VT, bool MathUsed) const override
Overflow nodes should get combined/lowered to optimal instructions (they should allow eliminating exp...
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
bool isLegalAddImmediate(int64_t Imm) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool preferSextInRegOfTruncate(EVT TruncVT, EVT VT, EVT ExtVT) const override
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool preferABDSToABSWithNSW(EVT VT) const override
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
std::pair< SDValue, SDValue > BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const
bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded vector elements, returning true on success...
SDValue LowerAsmOutputForConstraint(SDValue &Chain, SDValue &Flag, const SDLoc &DL, const AsmOperandInfo &Constraint, SelectionDAG &DAG) const override
Handle Lowering flag assembly outputs.
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
SDValue SimplifyMultipleUseDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth) const override
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
bool isSplatValueForTargetNode(SDValue Op, const APInt &DemandedElts, APInt &UndefElts, const SelectionDAG &DAG, unsigned Depth) const override
Return true if vector Op has the same value across all DemandedElts, indicating any elements which ma...
bool convertSelectOfConstantsToMath(EVT VT) const override
Return true if a select of constants (select Cond, C1, C2) should be transformed into simple math ops...
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint letter, return the type of constraint for this target.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool isVectorShiftByScalarCheap(Type *Ty) const override
This is used to enable splatted operand transforms for vector shifts and vector funnel shifts.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool isLegalStoreImmediate(int64_t Imm) const override
Return true if the specified immediate is legal for the value input of a store instruction.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isCtlzFast() const override
Return true if ctlz instruction is fast.
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
bool supportSwiftError() const override
Return true if the target supports swifterror attribute.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
bool shouldSplatInsEltVarIndex(EVT VT) const override
Return true if inserting a scalar into a variable element of an undef vector is more efficiently hand...
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Return true if sinking I's operands to the same basic block as I is profitable, e....
bool isInlineAsmTargetBranch(const SmallVectorImpl< StringRef > &AsmStrs, unsigned OpNo) const override
On x86, return true if the operand with index OpNo is a CALL or JUMP instruction, which can use eithe...
MVT hasFastEqualityCompare(unsigned NumBits) const override
Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, TargetLoweringOpt &TLO, unsigned Depth) const
bool isLegalICmpImmediate(int64_t Imm) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool hasInlineStackProbe(const MachineFunction &MF) const override
Returns true if stack probing through inline assembly is requested.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
unsigned preferedOpcodeForCmpEqPiecesOfOperand(EVT VT, unsigned ShiftOpc, bool MayTransformRotate, const APInt &ShiftOrRotateAmt, const std::optional< APInt > &AndMask) const override
bool isXAndYEqZeroPreferableToXAndYEqY(ISD::CondCode Cond, EVT VT) const override
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
bool ExpandInlineAsm(CallInst *CI) const override
This hook allows the target to expand an inline asm call to be explicit llvm code if it wants to.
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B This can be used to simplify se...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if we believe it is correct and profitable to reduce the load node to a smaller type.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferScalarizeSplat(SDNode *N) const override
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const override
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool hasAndNotCompare(SDValue Y) const override
Return true if the target should transform: (X & Y) == Y -> (~X & Y) == 0 (X & Y) !...
bool reduceSelectOfFPConstantLoads(EVT CmpOpVT) const override
Return true if it is profitable to convert a select of FP constants into a constant pool load whose a...
StringRef getStackProbeSymbolName(const MachineFunction &MF) const override
Returns the name of the symbol used to emit stack probes or the empty string if not applicable.
bool hasBitTest(SDValue X, SDValue Y) const override
Return true if the target has a bit-test instruction: (X & (1 << Y)) ==/!= 0 This knowledge can be us...
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
bool isShuffleMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
bool useStackGuardXorFP() const override
If this function returns true, stack protection checks should XOR the frame pointer (or whichever poi...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine the number of bits in the operation that are sign bits.
bool shouldScalarizeBinop(SDValue) const override
Scalar ops always have equal or better analysis/performance/power than the vector equivalent,...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool decomposeMulByConstant(LLVMContext &Context, EVT VT, SDValue C) const override
Return true if it is profitable to transform an integer multiplication-by-constant into simpler opera...
bool areJTsAllowed(const Function *Fn) const override
Returns true if lowering to a jump table is allowed.
bool isCommutativeBinOp(unsigned Opcode) const override
Returns true if the opcode is a commutative binary operation.
bool isScalarFPTypeInSSEReg(EVT VT) const
Return true if the specified scalar FP type is computed in an SSE register, not on the X87 floating p...
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
MVT getPreferredSwitchConditionType(LLVMContext &Context, EVT ConditionVT) const override
Returns preferred type for switch condition.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isVectorClearMaskLegal(ArrayRef< int > Mask, EVT VT) const override
Similar to isShuffleMaskLegal.
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info, const char *Constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Customize the preferred legalization strategy for certain types.
bool shouldConvertPhiType(Type *From, Type *To) const override
Given a set of interconnected phis of type 'From' that are loaded/stored or bitcast to type 'To',...
bool hasStackProbeSymbol(const MachineFunction &MF) const override
Returns true if stack probing through a function call is requested.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type Ty1 implicit zero-extends the valu...
bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
This function returns true if the memory access is aligned or if the target allows this specific unal...
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const override
TargetLowering::AndOrSETCCFoldKind isDesirableToCombineLogicOpOfSETCC(const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const override
Return the preferred fold type: Abs if this is a vector, AddAnd if it is an integer, None otherwise.
bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override
There are two ways to clear extreme bits (either low or high): Mask: x & (-1 << y) (the instcombine c...
bool addressingModeSupportsTLS(const GlobalValue &GV) const override
Returns true if the targets addressing mode can target thread local storage (TLS).
SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value, SDValue Addr, int JTI, SelectionDAG &DAG) const override
Expands target specific indirect branch for the case of JumpTable expansion.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isBinOp(unsigned Opcode) const override
Add x86-specific opcodes to the default list.
bool isGuaranteedNotToBeUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, unsigned Depth) const override
Return true if this function can prove that Op is never poison and, if PoisonOnly is false,...
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDValue unwrapAddress(SDValue N) const override
CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, const Value *Rhs) const override
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the value type to use for ISD::SETCC.
X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI)
bool isVectorLoadExtDesirable(SDValue) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
const Constant * getTargetConstantFromLoad(LoadSDNode *LD) const override
This method returns the constant pool value that will be loaded by LD.
EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const override
For types supported by the target, this is an identity function.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
unsigned getStackProbeSize(const MachineFunction &MF) const
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
Replace the results of node with an illegal result type with new values built out of custom code.
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
bool needsFixedCatchObjects() const override
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
self_iterator getIterator()
Definition: ilist_node.h:109
#define INT64_MIN
Definition: DataTypes.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
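A small sketch of the ScaleBitMask helper declared above, assuming the usual APInt header; widenNibbleMask is a hypothetical helper.
#include "llvm/ADT/APInt.h"

static llvm::APInt widenNibbleMask() {
  llvm::APInt Narrow(4, 0b1010);
  // Widening 4 -> 8 bits splats each source bit across two destination bits,
  // so 0b1010 becomes 0b11001100.
  return llvm::APIntOps::ScaleBitMask(Narrow, 8);
}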
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ X86_ThisCall
Similar to X86_StdCall.
Definition: CallingConv.h:122
@ X86_StdCall
stdcall is mostly used by the Win32 API.
Definition: CallingConv.h:99
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ X86_FastCall
'fast' analog of X86_StdCall.
Definition: CallingConv.h:103
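A minimal sketch of checking a call site against the X86 calling conventions listed above; usesLegacyX86Convention is a hypothetical helper.
#include "llvm/IR/InstrTypes.h"

static bool usesLegacyX86Convention(const llvm::CallBase &CB) {
  llvm::CallingConv::ID CC = CB.getCallingConv();
  return CC == llvm::CallingConv::X86_StdCall ||
         CC == llvm::CallingConv::X86_FastCall ||
         CC == llvm::CallingConv::X86_ThisCall;
}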
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1133
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:477
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition: ISDOpcodes.h:147
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition: ISDOpcodes.h:498
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1276
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1162
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1278
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ STRICT_FCEIL
Definition: ISDOpcodes.h:427
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1279
@ FRAME_TO_ARGS_OFFSET
FRAME_TO_ARGS_OFFSET - This node represents offset from frame pointer to first (possible) on-stack ar...
Definition: ISDOpcodes.h:124
@ RESET_FPENV
Set floating-point environment to default state.
Definition: ISDOpcodes.h:1009
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition: ISDOpcodes.h:151
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1261
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:437
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:689
@ MEMBARRIER
MEMBARRIER - Compiler barrier only; generate a no-op.
Definition: ISDOpcodes.h:1235
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
Definition: ISDOpcodes.h:1240
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition: ISDOpcodes.h:821
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:478
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ STRICT_FLOG2
Definition: ISDOpcodes.h:422
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1206
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:412
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ EH_LABEL
EH_LABEL - Represents a label in mid basic block used to track locations needed for debug and excepti...
Definition: ISDOpcodes.h:1109
@ EH_RETURN
OUTCHAIN = EH_RETURN(INCHAIN, OFFSET, HANDLER) - This node represents 'eh_return' gcc dwarf builtin,...
Definition: ISDOpcodes.h:135
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:451
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:101
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1054
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:759
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:328
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1277
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ GC_TRANSITION_START
GC_TRANSITION_START/GC_TRANSITION_END - These operators mark the beginning and end of GC transition s...
Definition: ISDOpcodes.h:1320
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:628
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1158
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:209
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:324
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:431
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:881
@ STRICT_FP_TO_FP16
Definition: ISDOpcodes.h:917
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ STRICT_FP16_TO_FP
Definition: ISDOpcodes.h:916
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:436
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:425
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:857
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:426
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1280
@ LOCAL_RECOVER
LOCAL_RECOVER - Represents the llvm.localrecover intrinsic.
Definition: ISDOpcodes.h:114
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1222
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ ConstantPool
Definition: ISDOpcodes.h:82
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition: ISDOpcodes.h:810
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ STRICT_FROUND
Definition: ISDOpcodes.h:429
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:450
@ STRICT_BF16_TO_FP
Definition: ISDOpcodes.h:925
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:428
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:430
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:923
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:94
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:444
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:443
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:158
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ GET_FPENV_MEM
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1014
@ STRICT_FP_TO_BF16
Definition: ISDOpcodes.h:926
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:401
@ STRICT_FLOG10
Definition: ISDOpcodes.h:421
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_FEXP2
Definition: ISDOpcodes.h:419
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ ExternalSymbol
Definition: ISDOpcodes.h:83
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:415
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:832
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:908
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:424
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:856
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition: ISDOpcodes.h:141
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1153
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ GC_TRANSITION_END
Definition: ISDOpcodes.h:1321
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:423
@ SET_FPENV_MEM
Sets the current floating point environment.
Definition: ISDOpcodes.h:1019
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1212
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:314
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
bool isExtVecInRegOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1606
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isBuildVectorOfConstantSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantSDNode or undef.
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool matchUnaryPredicate(SDValue Op, std::function< bool(ConstantSDNode *)> Match, bool AllowUndefs=false)
Hook for matching ConstantSDNode predicate.
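A hedged sketch of the matchUnaryPredicate hook above; allElementsPow2 is a hypothetical helper and assumes the SelectionDAG node headers are available.
#include "llvm/CodeGen/SelectionDAGNodes.h"

static bool allElementsPow2(llvm::SDValue Op) {
  // True when Op is a constant (or a build vector / splat of constants)
  // and every element is a power of two.
  return llvm::ISD::matchUnaryPredicate(Op, [](llvm::ConstantSDNode *C) {
    return C->getAPIntValue().isPowerOf2();
  });
}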
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1601
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isBitwiseLogicOp(unsigned Opcode)
Whether this is bitwise logic opcode.
Definition: ISDOpcodes.h:1422
bool isTrueWhenEqual(CondCode Cond)
Return true if the specified condition returns true if the two operands to the condition are equal.
Definition: ISDOpcodes.h:1588
bool isUNINDEXEDLoad(const SDNode *N)
Returns true if the specified node is an unindexed load.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
bool isFreezeUndef(const SDNode *N)
Return true if the specified node is FREEZE(UNDEF).
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
Definition: ISDOpcodes.h:1563
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
bool isBuildVectorOfConstantFPSDNodes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR node of all ConstantFPSDNode or undef.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
bool isUnsignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs an unsigned comparison when used with intege...
Definition: ISDOpcodes.h:1569
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
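A short, purely illustrative sketch of the SetCC helpers above (getSetCCInverse, getSetCCSwappedOperands).
#include "llvm/CodeGen/SelectionDAGNodes.h"

static void setCCHelperExample() {
  // !(X < Y) is (X >= Y); (X < Y) with swapped operands is (Y > X).
  llvm::ISD::CondCode Inv =
      llvm::ISD::getSetCCInverse(llvm::ISD::SETLT, llvm::MVT::i32); // SETGE
  llvm::ISD::CondCode Swapped =
      llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETLT);         // SETGT
  (void)Inv;
  (void)Swapped;
}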
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
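A minimal sketch of Intrinsic::getDeclaration above: materialize a declaration of the overloaded llvm.ctlz intrinsic at i32. declareCtlzI32 is a hypothetical helper.
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

static llvm::Function *declareCtlzI32(llvm::Module &M) {
  // llvm.ctlz is overloaded on its operand type, so one type is supplied.
  return llvm::Intrinsic::getDeclaration(
      &M, llvm::Intrinsic::ctlz, {llvm::Type::getInt32Ty(M.getContext())});
}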
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
Definition: PatternMatch.h:518
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
cst_pred_ty< is_sign_mask > m_SignMask()
Match an integer or vector with only the sign bit(s) set.
Definition: PatternMatch.h:658
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:966
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:869
BinaryOp_match< LHS, RHS, Instruction::Xor, true > m_c_Xor(const LHS &L, const RHS &R)
Matches an Xor with LHS and RHS in either order.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:593
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate > m_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
CmpClass_match< LHS, RHS, ICmpInst, ICmpInst::Predicate, true > m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
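A hedged sketch using the PatternMatch helpers listed above to recognize an and-not written as (X ^ -1) & Y on IR values; matchAndOfNot is a hypothetical helper.
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool matchAndOfNot(llvm::Value *V, llvm::Value *&X, llvm::Value *&Y) {
  using namespace llvm::PatternMatch;
  // Both the And and the Xor are matched commutatively, so operand order is free.
  return match(V, m_c_And(m_c_Xor(m_Value(X), m_AllOnes()), m_Value(Y)));
}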
Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition: LLVMContext.h:54
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
@ GeneralDynamic
Definition: CodeGen.h:46
@ X86
Windows x64, Windows Itanium (IA-64)
@ PTR32_UPTR
Definition: X86.h:209
@ FS
Definition: X86.h:206
@ PTR64
Definition: X86.h:210
@ PTR32_SPTR
Definition: X86.h:208
@ GS
Definition: X86.h:205
Reg
All possible values of the reg field in the ModR/M byte.
@ MO_TLSLD
MO_TLSLD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:425
@ MO_GOTPCREL_NORELAX
MO_GOTPCREL_NORELAX - Same as MO_GOTPCREL except that R_X86_64_GOTPCREL relocations are guaranteed to...
Definition: X86BaseInfo.h:405
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
Definition: X86BaseInfo.h:502
@ MO_NTPOFF
MO_NTPOFF - On a symbol operand this indicates that the immediate is the negative thread-pointer offs...
Definition: X86BaseInfo.h:464
@ MO_INDNTPOFF
MO_INDNTPOFF - On a symbol operand this indicates that the immediate is the absolute address of the G...
Definition: X86BaseInfo.h:446
@ MO_GOTNTPOFF
MO_GOTNTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry w...
Definition: X86BaseInfo.h:470
@ MO_TPOFF
MO_TPOFF - On a symbol operand this indicates that the immediate is the thread-pointer offset for the...
Definition: X86BaseInfo.h:452
@ MO_TLVP_PIC_BASE
MO_TLVP_PIC_BASE - On a symbol operand this indicates that the immediate is some TLS offset from the ...
Definition: X86BaseInfo.h:490
@ MO_TLSGD
MO_TLSGD - On a symbol operand this indicates that the immediate is the offset of the GOT entry with ...
Definition: X86BaseInfo.h:417
@ MO_NO_FLAG
MO_NO_FLAG - No flag for the operand.
Definition: X86BaseInfo.h:377
@ MO_TLVP
MO_TLVP - On a symbol operand this indicates that the immediate is some TLS offset.
Definition: X86BaseInfo.h:486
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the reference is actually to the "__imp...
Definition: X86BaseInfo.h:474
@ MO_GOTTPOFF
MO_GOTTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry wi...
Definition: X86BaseInfo.h:439
@ MO_SECREL
MO_SECREL - On a symbol operand this indicates that the immediate is the offset from beginning of sec...
Definition: X86BaseInfo.h:494
@ MO_DTPOFF
MO_DTPOFF - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:458
@ MO_TLSLDM
MO_TLSLDM - On a symbol operand this indicates that the immediate is the offset of the GOT entry with...
Definition: X86BaseInfo.h:433
@ MO_GOTPCREL
MO_GOTPCREL - On a symbol operand this indicates that the immediate is offset to the GOT entry for th...
Definition: X86BaseInfo.h:401
@ FST
This instruction implements a truncating store from FP stack slots.
@ CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FMAX
Floating point max and min.
@ BT
X86 bit-test instructions.
@ HADD
Integer horizontal add/sub.
@ MOVQ2DQ
Copies a 64-bit value from an MMX vector to the low word of an XMM vector, with the high word zero fi...
@ BLENDI
Blend where the selector is an immediate.
@ CMP
X86 compare and logical compare instructions.
@ BLENDV
Dynamic (non-constant condition) vector blend where only the sign bits of the condition elements are ...
@ ADDSUB
Combined add and sub on an FP vector.
@ STRICT_FCMP
X86 strict FP compare instructions.
@ STRICT_CMPM
Vector comparison generating mask bits for fp and integer signed and unsigned data types.
@ FHADD
Floating point horizontal add/sub.
@ BSR
Bit scan reverse.
@ SETCC
X86 SetCC.
@ NT_BRIND
BRIND node with NoTrack prefix.
@ SELECTS
X86 Select.
@ FSETCCM
X86 FP SETCC, similar to above, but with output as an i1 mask and a version with SAE.
@ PEXTRB
Extract an 8-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRB.
@ FXOR
Bitwise logical XOR of floating point values.
@ BRCOND
X86 conditional branches.
@ FSETCC
X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
@ PINSRB
Insert the lower 8-bits of a 32-bit value to a vector, corresponds to X86::PINSRB.
@ INSERTPS
Insert any element of a 4 x float vector into any element of a destination 4 x float vector.
@ PSHUFB
Shuffle 16 8-bit values within a vector.
@ PEXTRW
Extract a 16-bit value from a vector and zero extend it to i32, corresponds to X86::PEXTRW.
@ AADD
RAO arithmetic instructions.
@ FANDN
Bitwise logical ANDNOT of floating point values.
@ GlobalBaseReg
On Darwin, this node represents the result of the popl at function entry, used for PIC code.
@ FMAXC
Commutative FMIN and FMAX.
@ EXTRQI
SSE4A Extraction and Insertion.
@ FLD
This instruction implements an extending load to FP stack slots.
@ PSADBW
Compute Sum of Absolute Differences.
@ FOR
Bitwise logical OR of floating point values.
@ FIST
This instruction implements a fp->int store from FP stack slots.
@ FP_TO_INT_IN_MEM
This instruction implements FP_TO_SINT with the integer destination in memory and a FP reg source.
@ LADD
LOCK-prefixed arithmetic read-modify-write instructions.
@ MMX_MOVW2D
Copies a GPR into the low 32-bit word of an MMX vector and zeroes out the high word.
@ Wrapper
A wrapper node for TargetConstantPool, TargetJumpTable, TargetExternalSymbol, TargetGlobalAddress,...
@ PINSRW
Insert the lower 16-bits of a 32-bit value to a vector, corresponds to X86::PINSRW.
@ CMPCCXADD
Compare and Add if Condition is Met.
@ MMX_MOVD2W
Copies a 32-bit value from the low word of a MMX vector to a GPR.
@ FILD
This instruction implements SINT_TO_FP with the integer source in memory and FP reg result.
@ MOVDQ2Q
Copies a 64-bit value from the low word of an XMM vector to an MMX vector.
@ ANDNP
Bitwise Logical AND NOT of Packed FP values.
@ BSF
Bit scan forward.
@ VAARG_64
These instructions grab the address of the next argument from a va_list.
@ FAND
Bitwise logical AND of floating point values.
@ CMOV
X86 conditional moves.
@ WrapperRIP
Special wrapper used under X86-64 PIC mode for RIP relative displacements.
@ FSHL
X86 funnel/double shift i16 instructions.
@ FRSQRT
Floating point reciprocal-sqrt and reciprocal approximation.
@ TO_NEAREST_INT
Definition: X86BaseInfo.h:42
@ CUR_DIRECTION
Definition: X86BaseInfo.h:46
bool mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into a vector splat instruction as a memory oper...
bool isZeroNode(SDValue Elt)
Returns true if Elt is a constant zero or floating point constant +0.0.
CondCode GetOppositeBranchCondition(CondCode CC)
GetOppositeBranchCondition - Return the inverse of the specified cond, e.g.
bool mayFoldIntoZeroExtend(SDValue Op)
Check if Op is an operation that could be folded into a zero extend x86 instruction.
@ AddrNumOperands
Definition: X86BaseInfo.h:36
bool mayFoldIntoStore(SDValue Op)
Check if Op is a value that could be used to fold a store into some other x86 instruction as a memory...
bool isExtendedSwiftAsyncFrameSupported(const X86Subtarget &Subtarget, const MachineFunction &MF)
True if the target supports the extended frame for async Swift functions.
bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, bool AssumeSingleUse=false)
Check if Op is a load operation that could be folded into some other x86 instruction as a memory oper...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement)
Returns true if the given offset can fit into the displacement field of the instruction.
bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs)
If Op is a constant whose elements are all the same constant or undefined, return true and return the...
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
constexpr double e
Definition: MathExtras.h:31
NodeAddr< FuncNode * > Func
Definition: RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits, unsigned NumDstElts, bool IsAnyExtend, SmallVectorImpl< int > &ShuffleMask)
Decode a zero extension instruction as a shuffle mask.
IterT next_nodbg(IterT It, IterT End, bool SkipPseudoOp=true)
Increment It, then continue incrementing it while it points to a debug instruction.
static bool isGlobalStubReference(unsigned char TargetFlag)
isGlobalStubReference - Return true if the specified TargetFlag operand is a reference to a stub for ...
Definition: X86InstrInfo.h:109
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
@ Length
Definition: DWP.cpp:456
void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVHLPS instruction as a v2f64/v4f32 shuffle mask.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
static bool isGlobalRelativeToPICBase(unsigned char TargetFlag)
isGlobalRelativeToPICBase - Return true if the specified global value reference is relative to a 32-b...
Definition: X86InstrInfo.h:127
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
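A small sketch of the range helpers nearby (drop_begin, all_of); tailIsNonNegative is a hypothetical helper.
#include "llvm/ADT/STLExtras.h"
#include <vector>

static bool tailIsNonNegative(const std::vector<int> &Values) {
  // Skip the first element, then test the remainder with a predicate.
  return llvm::all_of(llvm::drop_begin(Values),
                      [](int V) { return V >= 0; });
}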
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void DecodeZeroMoveLowMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decode a move lower and zero upper instruction as a shuffle mask.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void DecodeVPERMILPMask(unsigned NumElts, unsigned ScalarBits, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants.
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isAllOnesOrAllOnesSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant -1 integer or a splatted vector of a constant -1 integer (with no undefs).
Definition: Utils.cpp:1527
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshuflw.
static const IntrinsicData * getIntrinsicWithChain(unsigned IntNo)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition: STLExtras.h:2406
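The STLExtras range helpers listed in this index (all_of, any_of, count_if, find_if, enumerate, and friends) are thin wrappers around the standard algorithms that take a whole range instead of a begin/end pair; the lowering code leans on them whenever it scans shuffle masks or build-vector operands. The following standalone sketch is not code from this file and only assumes the LLVM ADT headers are on the include path:

// Standalone illustration of the range helpers listed in this index.
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<int, 8> Mask = {0, 2, 4, 6};

  // all_of / count_if replace explicit begin()/end() algorithm calls.
  bool AllEven = llvm::all_of(Mask, [](int M) { return (M % 2) == 0; });
  auto NumLow = llvm::count_if(Mask, [](int M) { return M < 4; });

  // enumerate pairs each element with its zero-based index.
  for (auto [Idx, Elt] : llvm::enumerate(Mask))
    std::printf("Mask[%zu] = %d\n", Idx, Elt);

  std::printf("AllEven = %d, NumLow = %lld\n", AllEven, (long long)NumLow);
  return 0;
}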
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:343
MCRegister getX86SubSuperRegister(MCRegister Reg, unsigned Size, bool High=false)
AddressSpace
Definition: NVPTXBaseInfo.h:21
@ SjLj
setjmp/longjmp based exceptions
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
static void setDirectAddressInInstr(MachineInstr *MI, unsigned Operand, unsigned Reg)
Replace the address used in the instruction with the direct memory reference.
void DecodeVPERMV3Mask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMT2 W/D/Q/PS/PD mask from a raw array of constants.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition: STLExtras.h:656
void DecodeBLENDMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a BLEND immediate mask into a shuffle mask.
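The various Decode*Mask helpers in this index translate an instruction immediate into the generic shuffle-mask convention, where indices 0..NumElts-1 select from the first source and NumElts..2*NumElts-1 from the second. DecodeBLENDMask itself lives in the target-private X86ShuffleDecode library, so the sketch below reimplements just the BLEND convention locally; decodeBlendLocal is a hypothetical stand-in, and it ignores the immediate wrap-around that the real helper applies to vectors wider than eight elements:

// Standalone sketch of the BLEND immediate-to-shuffle-mask convention.
// decodeBlendLocal is a local stand-in for DecodeBLENDMask, which lives in
// the target-private X86ShuffleDecode.h and is not an installed header.
#include <cstdio>
#include <vector>

static void decodeBlendLocal(unsigned NumElts, unsigned Imm,
                             std::vector<int> &ShuffleMask) {
  for (unsigned I = 0; I != NumElts; ++I)
    // Bit I of the immediate selects element I from the second source
    // (index NumElts + I); a clear bit keeps element I of the first source.
    ShuffleMask.push_back(((Imm >> I) & 1) ? int(NumElts + I) : int(I));
}

int main() {
  std::vector<int> Mask;
  decodeBlendLocal(/*NumElts=*/4, /*Imm=*/0b0101, Mask); // e.g. BLENDPS with imm 0x5
  for (int M : Mask)
    std::printf("%d ", M); // expected: 4 1 6 3
  std::printf("\n");
  return 0;
}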
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a shuffle of packed values at 128-bit granularity (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) from its immediate mask into a shuffle mask.
void DecodeVPERMMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for VPERMQ/VPERMPD.
static const MachineInstrBuilder & addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset=0, bool mem=true)
addFrameReference - This function is used to add a reference to the base of an abstract object on the stack frame of the current function.
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A EXTRQ instruction as a shuffle mask.
static const MachineInstrBuilder & addFullAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM)
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
static const IntrinsicData * getIntrinsicWithoutChain(unsigned IntNo)
@ SM_SentinelUndef
@ SM_SentinelZero
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with no undefs).
Definition: Utils.cpp:1509
void DecodePSRLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1768
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:330
bool isMinSignedConstant(SDValue V)
Returns true if V is a constant min signed integer value.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
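popcount, countr_zero, Log2_64, isPowerOf2_64, and PowerOf2Ceil from llvm/ADT/bit.h and llvm/Support/MathExtras.h appear throughout the lowering code when sizing vectors and decomposing constants. A standalone sketch of what they compute (not code from this file; it assumes only those headers):

// Standalone illustration of the bit-manipulation helpers listed above.
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdio>

int main() {
  uint64_t V = 48; // 0b110000
  std::printf("popcount      = %d\n", llvm::popcount(V));      // 2
  std::printf("countr_zero   = %d\n", llvm::countr_zero(V));   // 4
  std::printf("Log2_64       = %u\n", llvm::Log2_64(V));       // 5 (floor)
  std::printf("isPowerOf2_64 = %d\n", llvm::isPowerOf2_64(V)); // 0
  std::printf("PowerOf2Ceil  = %llu\n",
              (unsigned long long)llvm::PowerOf2Ceil(V));      // 64
  return 0;
}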
void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
unsigned M1(unsigned Val)
Definition: VE.h:376
void DecodeVPERMIL2PMask(unsigned NumElts, unsigned ScalarBits, unsigned M2Z, ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERMIL2PD/VPERMIL2PS variable mask from a raw array of constants.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl< int > &ShuffleMask)
Decode a MOVLHPS instruction as a v2f64/v4f32 shuffle mask.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition: bit.h:281
bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
static const MachineInstrBuilder & addRegOffset(const MachineInstrBuilder &MIB, unsigned Reg, bool isKill, int Offset)
addRegOffset - This function is used to add a memory reference of the form [Reg + Offset], i.e., one with no scale or index, but with a displacement.
void createUnpackShuffleMask(EVT VT, SmallVectorImpl< int > &Mask, bool Lo, bool Unary)
Generate unpacklo/unpackhi shuffle mask.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
void DecodeINSERTQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx, SmallVectorImpl< int > &ShuffleMask)
Decode a SSE4A INSERTQ instruction as a shuffle mask.
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
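Hi_32 and Lo_32 split a 64-bit value into its two 32-bit halves, the usual first step when a 64-bit immediate has to be handled with 32-bit operations; Make_64 is the matching MathExtras helper for the reverse direction. A minimal standalone sketch:

// Splitting and recombining a 64-bit value with Hi_32 / Lo_32 / Make_64.
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdio>

int main() {
  uint64_t Imm = 0x0123456789ABCDEFULL;
  uint32_t Hi = llvm::Hi_32(Imm); // 0x01234567
  uint32_t Lo = llvm::Lo_32(Imm); // 0x89ABCDEF
  assert(llvm::Make_64(Hi, Lo) == Imm && "round-trips losslessly");
  std::printf("Hi = 0x%08X, Lo = 0x%08X\n", Hi, Lo);
  return 0;
}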
void DecodeVPERMVMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPERM W/D/Q/PS/PD mask from a raw array of constants.
static void verifyIntrinsicTables()
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Mod
The access may modify the value stored in memory.
void createSplat2ShuffleMask(MVT VT, SmallVectorImpl< int > &Mask, bool Lo)
Similar to unpacklo/unpackhi, but without the 128-bit lane limitation imposed by AVX and specific to the unary pattern.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
bool isFuncletEHPersonality(EHPersonality Pers)
Returns true if this is a personality function that invokes handler funclets (which must return to it).
void DecodeVALIGNMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
CombineLevel
Definition: DAGCombine.h:15
auto lower_bound(R &&Range, T &&Value)
Provide wrappers to std::lower_bound which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1954
void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed elements.
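narrowShuffleMaskElts expands a mask over wide elements into an equivalent mask over elements that are Scale times narrower: each index M becomes the run M*Scale .. M*Scale+Scale-1, and an undef (-1) entry becomes Scale undef entries. A standalone sketch, assuming the LLVM Analysis library is available to link against:

// Standalone illustration of narrowShuffleMaskElts (llvm/Analysis/VectorUtils.h).
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include <cstdio>

int main() {
  int WideMask[] = {1, -1, 0};           // -1 is the "undef" sentinel
  llvm::SmallVector<int, 8> NarrowMask;
  llvm::narrowShuffleMaskElts(/*Scale=*/2, WideMask, NarrowMask);
  for (int M : NarrowMask)
    std::printf("%d ", M);               // expected: 2 3 -1 -1 0 1
  std::printf("\n");
  return 0;
}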
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ Add
Sum of integers.
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad, SmallVectorImpl< int > &ShuffleMask)
Decode a scalar float move instruction as a shuffle mask.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
Definition: STLExtras.h:1914
static X86AddressMode getAddressFromInstr(const MachineInstr *MI, unsigned Operand)
Compute the addressing mode from a machine instruction, starting with the given operand.
void DecodeVPPERMMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a VPPERM mask from a raw array of constants such as from BUILD_VECTOR.
DWARFExpression::Operation Op
void DecodePALIGNRMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
void DecodeMOVSLDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
RoundingMode
Rounding mode.
@ TowardZero
roundTowardZero.
@ NearestTiesToEven
roundTiesToEven.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
void DecodeINSERTPSMask(unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decode a 128-bit INSERTPS instruction as a v4f32 shuffle mask.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
void DecodePSLLDQMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given predicate occurs in a range.
Definition: STLExtras.h:1921
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
@ TRUNCATE_TO_MEM_VI16
@ INTR_TYPE_SCALAR_MASK_SAE
@ INTR_TYPE_1OP_SAE
@ TRUNCATE_TO_MEM_VI32
@ INTR_TYPE_2OP_SAE
@ TRUNCATE_TO_REG
@ INTR_TYPE_3OP_SCALAR_MASK_SAE
@ INTR_TYPE_3OP_MASK_SAE
@ INTR_TYPE_2OP_MASK
@ TRUNCATE_TO_MEM_VI8
@ CVTNEPS2BF16_MASK
@ CMP_MASK_SCALAR_CC
@ INTR_TYPE_1OP_MASK_SAE
@ FIXUPIMM_MASKZ
@ INTR_TYPE_SCALAR_MASK
@ INTR_TYPE_3OP_IMM8
@ INTR_TYPE_2OP_MASK_SAE
@ INTR_TYPE_SCALAR_MASK_RND
@ INTR_TYPE_1OP_MASK
@ COMPRESS_EXPAND_IN_REG
@ INTR_TYPE_4OP_IMM8
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
void DecodeMOVDDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
void array_pod_sort(IteratorTy Start, IteratorTy End)
array_pod_sort - This sorts an array with the specified start and end extent.
Definition: STLExtras.h:1607
void DecodeVectorBroadcast(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
Decodes a broadcast of the first element of a vector.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for shufp*.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm, SmallVectorImpl< int > &ShuffleMask)
Decodes the shuffle masks for pshufhw.
void DecodeMOVSHDUPMask(unsigned NumElts, SmallVectorImpl< int > &ShuffleMask)
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:203
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
void DecodePSHUFBMask(ArrayRef< uint64_t > RawMask, const APInt &UndefElts, SmallVectorImpl< int > &ShuffleMask)
Decode a PSHUFB mask from a raw array of constants such as from BUILD_VECTOR.
int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
static const MachineInstrBuilder & addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg)
addDirectMem - This function is used to add a direct memory reference to the current instruction, that is, a dereference of an address in a register with no scale, index, or displacement.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
#define EQ(a, b)
Definition: regexec.c:112
This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and the bit indexes (Mask) needed by a masked compare.
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static constexpr roundingMode rmTowardZero
Definition: APFloat.h:234
static const fltSemantics & x87DoubleExtended() LLVM_READNONE
Definition: APFloat.cpp:263
static const fltSemantics & IEEEquad() LLVM_READNONE
Definition: APFloat.cpp:251
static const fltSemantics & IEEEdouble() LLVM_READNONE
Definition: APFloat.cpp:250
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
static const fltSemantics & BFloat() LLVM_READNONE
Definition: APFloat.cpp:248
opStatus
IEEE-754R 7: Default exception handling.
Definition: APFloat.h:246
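The fltSemantics accessors and rounding-mode constants above are what the lowering code hands to APFloat when it constant-folds floating-point conversions. A standalone sketch of that pattern, assuming libLLVMSupport is available; the final convertToDouble is valid here because a half value is losslessly representable as a double:

// Converting an IEEE double constant to half precision with APFloat,
// using the fltSemantics and rounding-mode constants listed above.
#include "llvm/ADT/APFloat.h"
#include <cstdio>

int main() {
  llvm::APFloat Val(llvm::APFloat::IEEEdouble(), "3.14159265358979");
  bool LosesInfo = false;
  llvm::APFloat::opStatus St = Val.convert(
      llvm::APFloat::IEEEhalf(), llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
  std::printf("inexact = %d, losesInfo = %d, as double = %f\n",
              St == llvm::APFloat::opInexact, LosesInfo, Val.convertToDouble());
  return 0;
}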
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
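Align (above) and commonAlignment (listed earlier in this index) capture how much alignment survives after offsetting a pointer: commonAlignment(A, Offset) is the largest power of two that divides both. A minimal standalone sketch:

// commonAlignment: the alignment still guaranteed after adding Offset
// to a pointer that is Align-aligned.
#include "llvm/Support/Alignment.h"
#include <cstdio>

int main() {
  llvm::Align Base(16);                                  // pointer known 16-byte aligned
  llvm::Align AfterOfs = llvm::commonAlignment(Base, 4); // +4 bytes => only 4-byte aligned
  std::printf("base = %llu, after offset = %llu\n",
              (unsigned long long)Base.value(),
              (unsigned long long)AfterOfs.value());     // 16, 4
  return 0;
}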
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type with the same bitwidth.
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
bool is512BitVector() const
Return true if this is a 512-bit vector type.
Definition: ValueTypes.h:213
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
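The EVT queries above are how the lowering code inspects value types before picking a legalization or shuffle strategy. A standalone sketch that builds a couple of EVTs directly (not code from this file; it needs an LLVMContext and the usual LLVM libraries on the link line):

// Building and querying EVTs outside of a SelectionDAG.
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdio>

int main() {
  llvm::LLVMContext Ctx;
  llvm::EVT I32 = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT V8I32 = llvm::EVT::getVectorVT(Ctx, I32, 8);     // v8i32
  std::printf("bits = %u, elts = %u, is256 = %d\n",
              (unsigned)V8I32.getSizeInBits().getFixedValue(),
              V8I32.getVectorNumElements(),
              V8I32.is256BitVector());                       // 256, 8, 1
  std::printf("half = v%ui%u\n",
              V8I32.getHalfNumVectorElementsVT(Ctx).getVectorNumElements(),
              (unsigned)V8I32.getScalarSizeInBits());        // v4i32
  return 0;
}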
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:297
static std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
Definition: KnownBits.cpp:494
KnownBits anyextOrTrunc(unsigned BitWidth) const
Return known bits for an "any" extension or truncation of the value we're tracking.
Definition: KnownBits.h:182
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
bool isZero() const
Returns true if value is all zero.
Definition: KnownBits.h:77
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:238
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:157
bool hasConflict() const
Returns true if there is conflicting information.
Definition: KnownBits.h:47
unsigned countMaxPopulation() const
Returns the maximum number of bits that could be one.
Definition: KnownBits.h:285
void setAllZero()
Make all bits known to be zero and discard any previous information.
Definition: KnownBits.h:89
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition: KnownBits.h:168
bool isConstant() const
Returns true if we know the value of all bits.
Definition: KnownBits.h:50
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
static KnownBits abdu(const KnownBits &LHS, const KnownBits &RHS)
Compute known bits for abdu(LHS, RHS).
Definition: KnownBits.cpp:234
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition: KnownBits.h:221
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
Definition: KnownBits.h:292
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:307
KnownBits zextOrTrunc(unsigned BitWidth) const
Return known bits for a zero extension or truncation of the value we're tracking.
Definition: KnownBits.h:192
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition: KnownBits.h:141
static KnownBits computeForAddSub(bool Add, bool NSW, bool NUW, const KnownBits &LHS, const KnownBits &RHS)
Compute known bits resulting from adding LHS and RHS.
Definition: KnownBits.cpp:57
bool isNegative() const
Returns true if this value is known to be negative.
Definition: KnownBits.h:101
void setAllOnes()
Make all bits known to be one and discard any previous information.
Definition: KnownBits.h:95
static KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Definition: KnownBits.cpp:777
static std::optional< bool > sgt(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_SGT result.
Definition: KnownBits.cpp:532
bool isAllOnes() const
Returns true if value is all one bits.
Definition: KnownBits.h:83
const APInt & getConstant() const
Returns the value when all bits have a known value.
Definition: KnownBits.h:57
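KnownBits is the lattice that computeKnownBits-style analysis in this file operates on: its Zero and One APInt members record which bits are proven zero or one. A standalone sketch using the queries listed above, including the five-argument computeForAddSub shown in this index (assumes libLLVMSupport):

// Basic KnownBits manipulation: build constants, combine with an add,
// and query leading/trailing zero information.
#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cstdio>

int main() {
  using llvm::APInt;
  using llvm::KnownBits;

  KnownBits A = KnownBits::makeConstant(APInt(32, 0x10)); // fully known: 16
  KnownBits B(32);
  B.Zero.setHighBits(28);                                 // B is known to be < 16
  KnownBits Sum = KnownBits::computeForAddSub(/*Add=*/true, /*NSW=*/false,
                                              /*NUW=*/false, A, B);
  std::printf("A const = %d, Sum leading zeros >= %u, Sum trailing zeros >= %u\n",
              A.isConstant(), Sum.countMinLeadingZeros(),
              Sum.countMinTrailingZeros());
  return 0;
}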
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
MVT ConstraintVT
The ValueType for the operand value.
std::string ConstraintCode
This contains the actual string for the code, like "m".
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setChain(SDValue InChain)
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to its caller.
bool CombineTo(SDValue O, SDValue N)
X86AddressMode - This struct holds a generalized full x86 address mode.