1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
98 cl::Hidden);
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
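// Illustrative note on the option above (a sketch, not an exhaustive rule):
// with MulConstantOptimization enabled, a multiply by a convenient constant
// is rewritten into shift/LEA sequences, e.g. 'mul i32 %x, 9' is typically
// selected as a single
//   leal (%reg,%reg,8), %dst
// rather than an imull.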
128
129X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
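  // Illustrative sketch of the bypass above: a 'udiv i32 %x, %y' gets a
  // runtime check of the operands' upper bits; when both values fit in 8 bits
  // the much cheaper 8-bit DIV is used, otherwise the full 32-bit DIV runs.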
165
166 // Setup Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
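  // Illustrative sketch (32-bit MSVC-style target assumed): with the table
  // above, a 64-bit 'sdiv i64 %a, %b' is not expanded inline but lowered to a
  // stdcall invocation of the CRT helper _alldiv; udiv/srem/urem/mul of i64
  // likewise map to _aulldiv/_allrem/_aullrem/_allmul.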
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
249 // For slow shld targets we only lower for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
338 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
347 MVT::v4i64}) {
350 }
351 if (Subtarget.hasAVX10_2_512()) {
354 }
355 if (Subtarget.is64Bit()) {
358 }
359 }
360
361 // Handle address space casts between mixed sized pointers.
364
365 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
366 if (!Subtarget.hasSSE2()) {
371 if (Subtarget.is64Bit()) {
373 // Without SSE, i64->f64 goes through memory.
375 }
376 } else if (!Subtarget.is64Bit())
378
379 // Scalar integer divide and remainder are lowered to use operations that
380 // produce two results, to match the available instructions. This exposes
381 // the two-result form to trivial CSE, which is able to combine x/y and x%y
382 // into a single instruction.
383 //
384 // Scalar integer multiply-high is also lowered to use two-result
385 // operations, to match the available instructions. However, plain multiply
386 // (low) operations are left as Legal, as there are single-result
387 // instructions for this in x86. Using the two-result multiply instructions
388 // when both high and low results are needed must be arranged by dagcombine.
389 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
396 }
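  // Illustrative sketch of the CSE benefit described above: IR such as
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // is combined into one two-result divide node and selected as a single
  // IDIV, which already yields the quotient in EAX and the remainder in EDX.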
397
398 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
400 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
401 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
404 }
405 if (Subtarget.is64Bit())
410
411 setOperationAction(ISD::FREM , MVT::f32 , Expand);
412 setOperationAction(ISD::FREM , MVT::f64 , Expand);
413 setOperationAction(ISD::FREM , MVT::f80 , Expand);
414 setOperationAction(ISD::FREM , MVT::f128 , Expand);
415
416 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
422 }
423
424 // Promote the i8 variants and force them on up to i32 which has a shorter
425 // encoding.
426 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
428 // Promote i16: tzcntw has a false dependency on Intel CPUs. For BSF, we emit
429 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
430 // promote that too.
431 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
433
434 if (!Subtarget.hasBMI()) {
435 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
437 if (Subtarget.is64Bit()) {
438 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
440 }
441 }
442
443 if (Subtarget.hasLZCNT()) {
444 // When promoting the i8 variants, force them to i32 for a shorter
445 // encoding.
446 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
448 } else {
449 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
450 if (VT == MVT::i64 && !Subtarget.is64Bit())
451 continue;
454 }
455 }
456
459 // Special handling for half-precision floating point conversions.
460 // If we don't have F16C support, then lower half float conversions
461 // into library calls.
463 Op, MVT::f32,
464 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
465 // There's never any support for operations beyond MVT::f32.
466 setOperationAction(Op, MVT::f64, Expand);
467 setOperationAction(Op, MVT::f80, Expand);
468 setOperationAction(Op, MVT::f128, Expand);
469 }
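  // Illustrative sketch (no-F16C target assumed): an f16 -> f32 extension is
  // then emitted as a call to the conversion helper registered later in this
  // constructor (__extendhfsf2), and f32 -> f16 as a call to __truncsfhf2;
  // with F16C the same conversions select VCVTPH2PS / VCVTPS2PH instead.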
470
471 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
474 }
475
476 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
477 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
478 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
479 setTruncStoreAction(VT, MVT::f16, Expand);
480 setTruncStoreAction(VT, MVT::bf16, Expand);
481
484 }
485
489 if (Subtarget.is64Bit())
491 if (Subtarget.hasPOPCNT()) {
492 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
493 // popcntw is longer to encode than popcntl and also has a false dependency
494 // on the dest that popcntl hasn't had since Cannon Lake.
495 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
496 } else {
501 }
502
504
505 if (!Subtarget.hasMOVBE())
507
508 // X86 wants to expand cmov itself.
509 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
514 }
515 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
516 if (VT == MVT::i64 && !Subtarget.is64Bit())
517 continue;
520 }
521
522 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
525
527 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
528 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
532 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
533 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
534
535 // Darwin ABI issue.
536 for (auto VT : { MVT::i32, MVT::i64 }) {
537 if (VT == MVT::i64 && !Subtarget.is64Bit())
538 continue;
545 }
546
547 // 64-bit shl, sra, srl (iff 32-bit x86)
548 for (auto VT : { MVT::i32, MVT::i64 }) {
549 if (VT == MVT::i64 && !Subtarget.is64Bit())
550 continue;
554 }
555
556 if (Subtarget.hasSSEPrefetch())
558
560
561 // Expand certain atomics
562 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
570 }
571
572 if (!Subtarget.is64Bit())
574
575 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
576 // All CPUs supporting AVX will atomically load/store aligned 128-bit
577 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
580 }
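  // Illustrative sketch: on such a target an aligned
  //   %v = load atomic i128, ptr %p seq_cst, align 16
  // can be selected as a single (V)MOVDQA instead of a CMPXCHG16B loop.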
581
582 if (Subtarget.canUseCMPXCHG16B())
584
585 // FIXME - use subtarget debug flags
586 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
587 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
588 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
590 }
591
594
597
598 setOperationAction(ISD::TRAP, MVT::Other, Legal);
600 if (Subtarget.isTargetPS())
602 else
604
605 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
607 setOperationAction(ISD::VAEND , MVT::Other, Expand);
608 bool Is64Bit = Subtarget.is64Bit();
609 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
610 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
611
614
616
617 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
620
622
623 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
624 setOperationAction(ISD::FABS, VT, Action);
625 setOperationAction(ISD::FNEG, VT, Action);
627 setOperationAction(ISD::FREM, VT, Action);
628 setOperationAction(ISD::FMA, VT, Action);
629 setOperationAction(ISD::FMINNUM, VT, Action);
630 setOperationAction(ISD::FMAXNUM, VT, Action);
635 setOperationAction(ISD::FSIN, VT, Action);
636 setOperationAction(ISD::FCOS, VT, Action);
637 setOperationAction(ISD::FSINCOS, VT, Action);
638 setOperationAction(ISD::FTAN, VT, Action);
639 setOperationAction(ISD::FSQRT, VT, Action);
640 setOperationAction(ISD::FPOW, VT, Action);
641 setOperationAction(ISD::FPOWI, VT, Action);
642 setOperationAction(ISD::FLOG, VT, Action);
643 setOperationAction(ISD::FLOG2, VT, Action);
644 setOperationAction(ISD::FLOG10, VT, Action);
645 setOperationAction(ISD::FEXP, VT, Action);
646 setOperationAction(ISD::FEXP2, VT, Action);
647 setOperationAction(ISD::FEXP10, VT, Action);
648 setOperationAction(ISD::FCEIL, VT, Action);
649 setOperationAction(ISD::FFLOOR, VT, Action);
651 setOperationAction(ISD::FRINT, VT, Action);
652 setOperationAction(ISD::BR_CC, VT, Action);
653 setOperationAction(ISD::SETCC, VT, Action);
656 setOperationAction(ISD::FROUND, VT, Action);
658 setOperationAction(ISD::FTRUNC, VT, Action);
659 setOperationAction(ISD::FLDEXP, VT, Action);
660 };
661
662 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
663 // f16, f32 and f64 use SSE.
664 // Set up the FP register classes.
665 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
666 : &X86::FR16RegClass);
667 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
668 : &X86::FR32RegClass);
669 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
670 : &X86::FR64RegClass);
671
672 // Disable f32->f64 extload as we can only generate this in one instruction
673 // under optsize. So it's easier to pattern match (fpext (load)) for that
674 // case instead of needing to emit 2 instructions for extload in the
675 // non-optsize case.
676 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
677
678 for (auto VT : { MVT::f32, MVT::f64 }) {
679 // Use ANDPD to simulate FABS.
681
682 // Use XORP to simulate FNEG.
684
685 // Use ANDPD and ORPD to simulate FCOPYSIGN.
687
688 // These might be better off as horizontal vector ops.
691
692 // We don't support sin/cos/fmod
696 }
697
698 // Half type will be promoted by default.
699 setF16Action(MVT::f16, Promote);
707
738
739 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
740 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
741
742 // Lower this to MOVMSK plus an AND.
745
746 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
747 (UseX87 || Is64Bit)) {
748 // Use SSE for f32, x87 for f64.
749 // Set up the FP register classes.
750 addRegisterClass(MVT::f32, &X86::FR32RegClass);
751 if (UseX87)
752 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
753
754 // Use ANDPS to simulate FABS.
756
757 // Use XORP to simulate FNEG.
759
760 if (UseX87)
762
763 // Use ANDPS and ORPS to simulate FCOPYSIGN.
764 if (UseX87)
767
768 // We don't support sin/cos/fmod
772
773 if (UseX87) {
774 // Always expand sin/cos functions even though x87 has an instruction.
778 }
779 } else if (UseX87) {
780 // f32 and f64 in x87.
781 // Set up the FP register classes.
782 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
783 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
784
785 for (auto VT : { MVT::f32, MVT::f64 }) {
788
789 // Always expand sin/cos functions even though x87 has an instruction.
793 }
794 }
795
796 // Expand FP32 immediates into loads from the stack, save special cases.
797 if (isTypeLegal(MVT::f32)) {
798 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
799 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
800 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
801 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
802 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
803 } else // SSE immediates.
804 addLegalFPImmediate(APFloat(+0.0f)); // xorps
805 }
806 // Expand FP64 immediates into loads from the stack, save special cases.
807 if (isTypeLegal(MVT::f64)) {
808 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
809 addLegalFPImmediate(APFloat(+0.0)); // FLD0
810 addLegalFPImmediate(APFloat(+1.0)); // FLD1
811 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
812 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
813 } else // SSE immediates.
814 addLegalFPImmediate(APFloat(+0.0)); // xorpd
815 }
816 // Support fp16 0 immediate.
817 if (isTypeLegal(MVT::f16))
818 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
819
820 // Handle constrained floating-point operations of scalar.
833
834 // We don't support FMA.
837
838 // f80 always uses X87.
839 if (UseX87) {
840 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
843 {
845 addLegalFPImmediate(TmpFlt); // FLD0
846 TmpFlt.changeSign();
847 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
848
849 bool ignored;
850 APFloat TmpFlt2(+1.0);
852 &ignored);
853 addLegalFPImmediate(TmpFlt2); // FLD1
854 TmpFlt2.changeSign();
855 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
856 }
857
858 // Always expand sin/cos functions even though x87 has an instruction.
859 // clang-format off
871 // clang-format on
872
884
885 // Handle constrained floating-point operations of scalar.
892 if (isTypeLegal(MVT::f16)) {
895 } else {
897 }
898 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
899 // as Custom.
901 }
902
903 // f128 uses xmm registers, but most operations require libcalls.
904 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
905 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
906 : &X86::VR128RegClass);
907
908 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
909
920
924
925 // clang-format off
933 // clang-format on
934 // No STRICT_FSINCOS
937
940 // We need to custom handle any FP_ROUND with an f128 input, but
941 // LegalizeDAG uses the result type to know when to run a custom handler.
942 // So we have to list all legal floating point result types here.
943 if (isTypeLegal(MVT::f32)) {
946 }
947 if (isTypeLegal(MVT::f64)) {
950 }
951 if (isTypeLegal(MVT::f80)) {
955 }
956
958
959 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
960 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
961 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
962 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
963 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
964 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
965 }
966
967 // Always use a library call for pow.
968 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
969 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
970 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
971 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
972
981
982 // Some FP actions are always expanded for vector types.
983 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
984 MVT::v4f32, MVT::v8f32, MVT::v16f32,
985 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
986 // clang-format off
1000 // clang-format on
1001 }
1002
1003 // First set operation action for all vector types to either promote
1004 // (for widening) or expand (for scalarization). Then we will selectively
1005 // turn on ones that can be effectively codegen'd.
1045 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1046 setTruncStoreAction(InnerVT, VT, Expand);
1047
1048 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1049 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1050
1051 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1052 // types, we have to deal with them whether we ask for Expansion or not.
1053 // Setting Expand causes its own optimisation problems though, so leave
1054 // them legal.
1055 if (VT.getVectorElementType() == MVT::i1)
1056 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1057
1058 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1059 // split/scalarized right now.
1060 if (VT.getVectorElementType() == MVT::f16 ||
1061 VT.getVectorElementType() == MVT::bf16)
1062 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1063 }
1064 }
1065
1066 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1067 // with -msoft-float, disable use of MMX as well.
1068 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1069 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1070 // No operations on x86mmx supported, everything uses intrinsics.
1071 }
1072
1073 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1074 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1075 : &X86::VR128RegClass);
1076
1081
1082 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1083 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1091
1092 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1093 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1095
1101 }
1102
1103 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1104 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1105 : &X86::VR128RegClass);
1106
1107 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1108 // registers cannot be used even for integer operations.
1109 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1110 : &X86::VR128RegClass);
1111 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1112 : &X86::VR128RegClass);
1113 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1114 : &X86::VR128RegClass);
1115 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1116 : &X86::VR128RegClass);
1117 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1118 : &X86::VR128RegClass);
1119
1120 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1125 }
1126
1127 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1128 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1133 }
1134
1135 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1136 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1137 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1138
1139 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1140 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1141 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1142 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1143 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1144 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1145 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1146 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1147 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1148 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1151
1152 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1153 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1154 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1158 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1160
1161 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1162
1163 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1164 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1165 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1166 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1167 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1168 }
1169
1180
1185
1186 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1192
1193 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1194 // setcc all the way to isel and prefer SETGT in some isel patterns.
1197 }
1198
1199 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1200 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1205
1206 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1212 }
1213
1214 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1218
1219 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1220 continue;
1221
1224 }
1225 setF16Action(MVT::v8f16, Expand);
1226 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1227 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1228 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1229 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1230 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1231 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1233
1234 // Custom lower v2i64 and v2f64 selects.
1241
1248
1249 // Custom legalize these to avoid over promotion or custom promotion.
1250 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1255 }
1256
1261
1264
1267
1268 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1273
1278
1279 // We want to legalize this to an f64 load rather than an i64 load on
1280 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1281 // store.
1282 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1283 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1284 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1285 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1286 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1288
1289 // Add 32-bit vector stores to help vectorization opportunities.
1290 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1292
1296 if (!Subtarget.hasAVX512())
1298
1302
1304
1321
1322 // In the customized shift lowering, the legal v4i32/v2i64 cases
1323 // in AVX2 will be recognized.
1324 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1328 if (VT == MVT::v2i64) continue;
1333 }
1334
1340 }
1341
1342 if (Subtarget.hasGFNI()) {
1347 }
1348
1349 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1350 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1351 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1352 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1353
1354 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1357 }
1358
1359 // These might be better off as horizontal vector ops.
1364 }
1365
1366 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1367 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1370 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1374 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1380
1382 }
1383
1384 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1385 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1386 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1387 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1388 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1389 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1390 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1391 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1392
1396
1397 // FIXME: Do we need to handle scalar-to-vector here?
1398 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1399 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1400
1401 // We directly match byte blends in the backend as they match the VSELECT
1402 // condition form.
1404
1405 // SSE41 brings specific instructions for doing vector sign extend even in
1406 // cases where we don't have SRA.
1407 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1410 }
1411
1412 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1413 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1414 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1415 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1416 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1420 }
1421
1422 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1423 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1424 // do the pre and post work in the vector domain.
1427 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1428 // so that DAG combine doesn't try to turn it into uint_to_fp.
1431 }
1432 }
1433
1434 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1436 }
1437
1438 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1439 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1440 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1443 }
1444
1445 // XOP can efficiently perform BITREVERSE with VPPERM.
1446 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1448 }
1449
1450 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1451 bool HasInt256 = Subtarget.hasInt256();
1452
1453 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1454 : &X86::VR256RegClass);
1455 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1456 : &X86::VR256RegClass);
1457 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1458 : &X86::VR256RegClass);
1459 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1460 : &X86::VR256RegClass);
1461 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1462 : &X86::VR256RegClass);
1463 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1464 : &X86::VR256RegClass);
1465 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1466 : &X86::VR256RegClass);
1467
1468 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1481
1483
1487
1493 }
1494
1495 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1496 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1497
1498 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1499 // even though v8i16 is a legal type.
1500 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1501 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1502 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1507
1514
1526
1527 if (!Subtarget.hasAVX512())
1529
1530 // In the customized shift lowering, the legal v8i32/v4i64 cases
1531 // in AVX2 will be recognized.
1532 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1538 if (VT == MVT::v4i64) continue;
1543 }
1544
1545 // These types need custom splitting if their input is a 128-bit vector.
1550
1554 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1555 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1558
1559 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1563 }
1564
1569
1570 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1575
1576 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1577 // setcc all the way to isel and prefer SETGT in some isel patterns.
1580 }
1581
1582 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1583 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1588
1589 if (Subtarget.hasAnyFMA()) {
1590 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1591 MVT::v2f64, MVT::v4f64 }) {
1594 }
1595 }
1596
1597 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1598 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1599 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1600 }
1601
1602 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1603 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1604 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1605 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1606
1607 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1608 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1609 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1610 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1612 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1613 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1614 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1615
1616 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1617 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1618
1619 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1620 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1621 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1622 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1623 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1624
1625 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1626 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1632 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1637
1638 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1639 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1640 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1644 }
1645
1646 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1649 }
1650
1651 if (HasInt256) {
1652 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1653 // when we have a 256bit-wide blend with immediate.
1656
1657 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1658 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1659 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1660 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1661 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1662 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1663 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1664 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1665 }
1666 }
1667
1668 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1669 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1670 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1672 }
1673
1674 // Extract subvector is special because the value type
1675 // (result) is 128-bit but the source is 256-bit wide.
1676 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1677 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1679 }
1680
1681 // Custom lower several nodes for 256-bit types.
1682 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1683 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1693 }
1694 setF16Action(MVT::v16f16, Expand);
1695 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1696 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1698 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1699 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1700 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1701 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1702
1703 if (HasInt256) {
1705
1706 // Custom legalize 2x32 to get a little better code.
1709
1710 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1711 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1713 }
1714 }
1715
1716 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1717 Subtarget.hasF16C()) {
1718 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1721 }
1722 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1725 }
1726 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1727 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1728 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1729 }
1730 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1731 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1732 }
1733
1734 // This block controls legalization of the mask vector sizes that are
1735 // available with AVX512. 512-bit vectors are in a separate block controlled
1736 // by useAVX512Regs.
1737 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1738 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1739 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1740 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1741 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1742 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1743
1747
1748 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1749 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1750 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1751 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1752 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1753 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1754 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1755 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1763
1764 // There is no byte sized k-register load or store without AVX512DQ.
1765 if (!Subtarget.hasDQI()) {
1766 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1767 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1768 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1769 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1770
1775 }
1776
1777 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1778 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1782 }
1783
1784 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1786
1787 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1791
1798 }
1799
1800 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1802 }
1803 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1804 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1807 }
1808 }
1809
1810 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1811 // elements. 512-bits can be disabled based on prefer-vector-width and
1812 // required-vector-width function attributes.
1813 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1814 bool HasBWI = Subtarget.hasBWI();
1815
1816 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1817 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1818 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1819 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1820 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1821 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1822 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1823
1824 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1825 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1826 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1827 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1828 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1829 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1830 if (HasBWI)
1831 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1832 }
1833
1834 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1845 }
1846 setOperationAction(ISD::LRINT, MVT::v16f32,
1847 Subtarget.hasDQI() ? Legal : Custom);
1848 setOperationAction(ISD::LRINT, MVT::v8f64,
1849 Subtarget.hasDQI() ? Legal : Custom);
1850 if (Subtarget.hasDQI())
1851 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1852
1853 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1858 }
1859
1860 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1865 }
1866
1873
1885
1886 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1887 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1888 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1889 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1890 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1891 if (HasBWI)
1892 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1893
1894 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1895 // to 512-bit rather than use the AVX2 instructions so that we can use
1896 // k-masks.
1897 if (!Subtarget.hasVLX()) {
1898 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1899 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1902 }
1903 }
1904
1906 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1907 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1917
1918 if (HasBWI) {
1919 // Extends from v64i1 masks to 512-bit vectors.
1923 }
1924
1925 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1938
1940 }
1941
1942 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1945 }
1946
1947 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1948 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1951
1952 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1953 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1954 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1955 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1956
1957 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1958 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1959 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1960 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1962 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1963 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1964 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1965
1966 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1967 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1968
1969 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1979
1980 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1981 // setcc all the way to isel and prefer SETGT in some isel patterns.
1984 }
1985
1986 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1987 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1992
1993 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2000 }
2001
2002 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2003 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2004 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2006 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2007 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2014 }
2015
2016 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2017 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2018 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2019 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2020 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2021 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2022
2023 if (Subtarget.hasDQI()) {
2027 setOperationAction(Opc, MVT::v8i64, Custom);
2028 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2029 }
2030
2031 if (Subtarget.hasCDI()) {
2032 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2033 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2035 }
2036 } // Subtarget.hasCDI()
2037
2038 if (Subtarget.hasVPOPCNTDQ()) {
2039 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2041 }
2042
2043 // Extract subvector is special because the value type
2044 // (result) is 256-bit but the source is 512-bit wide.
2045 // 128-bit was made Legal under AVX1.
2046 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2047 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2049
2050 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2051 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2061 }
2062 setF16Action(MVT::v32f16, Expand);
2067 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2068 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2069 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2070
2071 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2076 }
2077 if (HasBWI) {
2078 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2081 }
2082 } else {
2083 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2084 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2085 }
2086
2087 if (Subtarget.hasVBMI2()) {
2088 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2091 }
2092
2093 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2094 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2095 }
2096
2097 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2098 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2100 }// useAVX512Regs
2101
2102 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2103 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2104 MVT::v4i64}) {
2107 }
2108 }
2109
2110 // This block controls legalization for operations that don't have
2111 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2112 // narrower widths.
2113 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2114 // These operations are handled on non-VLX by artificially widening in
2115 // isel patterns.
2116
2120
2121 if (Subtarget.hasDQI()) {
2122 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2123 // v2f32 UINT_TO_FP is already custom under SSE2.
2126 "Unexpected operation action!");
2127 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2132 }
2133
2134 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2140 }
2141
2142 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2145 }
2146
2147 // Custom legalize 2x32 to get a little better code.
2150
2151 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2152 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2154
2155 if (Subtarget.hasDQI()) {
2159 setOperationAction(Opc, MVT::v2i64, Custom);
2160 setOperationAction(Opc, MVT::v4i64, Custom);
2161 }
2162 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2163 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2164 }
2165
2166 if (Subtarget.hasCDI()) {
2167 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2169 }
2170 } // Subtarget.hasCDI()
2171
2172 if (Subtarget.hasVPOPCNTDQ()) {
2173 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2175 }
2176
2177 // We can try to convert vectors to different sizes to leverage legal
2178 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2179 // then specialize to Legal below.
2180 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2181 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2182 MVT::v16i16, MVT::v8i8})
2184
2185 // Legal vpcompress depends on various AVX512 extensions.
2186 // Legal in AVX512F
2187 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2189
2190 // Legal in AVX512F + AVX512VL
2191 if (Subtarget.hasVLX())
2192 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2193 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2195
2196 // Legal in AVX512F + AVX512VBMI2
2197 if (Subtarget.hasVBMI2())
2198 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2200
2201 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2202 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2203 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2205 }
2206
2207 // This block controls legalization of v32i1/v64i1, which are available with
2208 // AVX512BW.
2209 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2210 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2211 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2212
2213 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2224 }
2225
2226 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2228
2229 // Extends from v32i1 masks to 256-bit vectors.
2233
2234 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2235 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2236 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2237 }
2238
2239 // These operations are handled on non-VLX by artificially widening in
2240 // isel patterns.
2241 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2242
2243 if (Subtarget.hasBITALG()) {
2244 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2246 }
2247 }
2248
2249 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2250 auto setGroup = [&] (MVT VT) {
2261
2274
2276
2279
2285
2291
2295 };
2296
2297 // AVX512_FP16 scalar operations
2298 setGroup(MVT::f16);
2314
2317
2318 if (Subtarget.useAVX512Regs()) {
2319 setGroup(MVT::v32f16);
2325 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2332
2337 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2339 MVT::v32i16);
2340 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2342 MVT::v32i16);
2343 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2345 MVT::v32i16);
2346 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2348 MVT::v32i16);
2349
2353
2354 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2355 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2356
2361 }
2362
2363 if (Subtarget.hasVLX()) {
2364 setGroup(MVT::v8f16);
2365 setGroup(MVT::v16f16);
2366
2377
2388
2389 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2392
2396
2397 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2398 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2399 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2400 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2401
2402 // Need to custom widen these to prevent scalarization.
2403 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2404 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2405
2410
2415 }
2416 }
2417
2418 if (!Subtarget.useSoftFloat() &&
2419 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2420 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2421 : &X86::VR128RegClass);
2422 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2423 : &X86::VR256RegClass);
2424 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2425 // provide the method to promote BUILD_VECTOR and INSERT_VECTOR_ELT.
2426 // Set the operation action Custom to do the customization later.
2429 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2430 setF16Action(VT, Expand);
2431 if (!Subtarget.hasBF16())
2437 }
2438 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2439 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2440 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2441 }
2442 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2443 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2445 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2446 }
2447
2448 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2449 Subtarget.useAVX512Regs()) {
2450 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2451 setF16Action(MVT::v32bf16, Expand);
2452 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2453 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2454 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2456 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2460 }
2461
2462 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2463 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2475 }
2476 if (Subtarget.hasAVX10_2_512()) {
2477 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2478 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2481 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2482 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2483 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2484 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2485 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2488 }
2489 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2492 }
2493 }
2494
2495 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2496 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2497 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2498 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2499 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2500 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2501
2502 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2503 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2504 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2505 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2506 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2507
2508 if (Subtarget.hasBWI()) {
2509 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2510 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2511 }
2512
2513 if (Subtarget.hasFP16()) {
2514 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2523 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2532 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2537 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2542 }
2543 }
2544
2545 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2546 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2547 }
2548
2549 // We want to custom lower some of our intrinsics.
2553 if (!Subtarget.is64Bit()) {
2555 }
2556
2557 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2558 // handle type legalization for these operations here.
2559 //
2560 // FIXME: We really should do custom legalization for addition and
2561 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2562 // than generic legalization for 64-bit multiplication-with-overflow, though.
2563 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2564 if (VT == MVT::i64 && !Subtarget.is64Bit())
2565 continue;
2566 // Add/Sub/Mul with overflow operations are custom lowered.
2573
2574 // Support carry in as value rather than glue.
2580 }
2581
2582 // Combine sin / cos into _sincos_stret if it is available.
2583 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2584 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2587 }
2588
2589 if (Subtarget.isTargetWin64()) {
2590 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2591 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2592 setOperationAction(ISD::SREM, MVT::i128, Custom);
2593 setOperationAction(ISD::UREM, MVT::i128, Custom);
2602 }
2603
2604 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2605 // is. We should promote the value to 64-bits to solve this.
2606 // This is what the CRT headers do - `fmodf` is an inline header
2607 // function casting to f64 and calling `fmod`.
2608 if (Subtarget.is32Bit() &&
2609 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2610 // clang-format off
2611 for (ISD::NodeType Op :
2629 if (isOperationExpand(Op, MVT::f32))
2630 setOperationAction(Op, MVT::f32, Promote);
2631 // clang-format on
2632
2633 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2634 // it, but it's just a wrapper around ldexp.
2635 if (Subtarget.isOSWindows()) {
2637 if (isOperationExpand(Op, MVT::f32))
2638 setOperationAction(Op, MVT::f32, Promote);
2639 }
2640
2641 // We have target-specific dag combine patterns for the following nodes:
2652 ISD::SHL,
2653 ISD::SRA,
2654 ISD::SRL,
2655 ISD::OR,
2656 ISD::AND,
2662 ISD::ADD,
2663 ISD::FADD,
2664 ISD::FSUB,
2665 ISD::FNEG,
2666 ISD::FMA,
2670 ISD::SUB,
2671 ISD::LOAD,
2672 ISD::LRINT,
2674 ISD::MLOAD,
2675 ISD::STORE,
2691 ISD::SETCC,
2692 ISD::MUL,
2693 ISD::XOR,
2704
2706
2707 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2709 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2711 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2713
2714 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2715 // that needs to be benchmarked and balanced with the potential use of vector
2716 // load/store types (PR33329, PR33914).
2719
2720 // Default loop alignment, which can be overridden by -align-loops.
2722
2723 // An out-of-order CPU can speculatively execute past a predictable branch,
2724 // but a conditional move could be stalled by an expensive earlier operation.
2725 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2726 EnableExtLdPromotion = true;
2728
2730
2731 // Default to having -disable-strictnode-mutation on
2732 IsStrictFPEnabled = true;
2733}
2734
2735// This has so far only been implemented for 64-bit MachO.
2737 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2738}
2739
2741 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2742 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2743}
2744
2746 const SDLoc &DL) const {
2747 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2748 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2749 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2750 return SDValue(Node, 0);
2751}
2752
2755 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2756 !Subtarget.hasBWI())
2757 return TypeSplitVector;
2758
2759 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2760 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2761 return TypeSplitVector;
2762
2763 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2764 VT.getVectorElementType() != MVT::i1)
2765 return TypeWidenVector;
2766
2768}
2769
2770FastISel *
2772 const TargetLibraryInfo *libInfo) const {
2773 return X86::createFastISel(funcInfo, libInfo);
2774}
2775
2776//===----------------------------------------------------------------------===//
2777// Other Lowering Hooks
2778//===----------------------------------------------------------------------===//
2779
2781 bool AssumeSingleUse) {
2782 if (!AssumeSingleUse && !Op.hasOneUse())
2783 return false;
2784 if (!ISD::isNormalLoad(Op.getNode()))
2785 return false;
2786
2787 // If this is an unaligned vector, make sure the target supports folding it.
2788 auto *Ld = cast<LoadSDNode>(Op.getNode());
2789 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2790 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2791 return false;
2792
2793 // TODO: If this is a non-temporal load and the target has an instruction
2794 // for it, it should not be folded. See "useNonTemporalLoad()".
2795
2796 return true;
2797}
2798
2800 const X86Subtarget &Subtarget,
2801 bool AssumeSingleUse) {
2802 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2803 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2804 return false;
2805
2806 // We cannot replace a wide volatile load with a broadcast-from-memory,
2807 // because that would narrow the load, which isn't legal for volatiles.
2808 auto *Ld = cast<LoadSDNode>(Op.getNode());
2809 return !Ld->isVolatile() ||
2810 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2811}
2812
2814 if (!Op.hasOneUse())
2815 return false;
2816 // Peek through (oneuse) bitcast users
2817 SDNode *User = *Op->user_begin();
2818 while (User->getOpcode() == ISD::BITCAST) {
2819 if (!User->hasOneUse())
2820 return false;
2821 User = *User->user_begin();
2822 }
2823 return ISD::isNormalStore(User);
2824}
2825
2827 if (Op.hasOneUse()) {
2828 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2829 return (ISD::ZERO_EXTEND == Opcode);
2830 }
2831 return false;
2832}
2833
2834static bool isLogicOp(unsigned Opcode) {
2835 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2836 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2837}
2838
2839static bool isTargetShuffle(unsigned Opcode) {
2840 switch(Opcode) {
2841 default: return false;
2842 case X86ISD::BLENDI:
2843 case X86ISD::PSHUFB:
2844 case X86ISD::PSHUFD:
2845 case X86ISD::PSHUFHW:
2846 case X86ISD::PSHUFLW:
2847 case X86ISD::SHUFP:
2848 case X86ISD::INSERTPS:
2849 case X86ISD::EXTRQI:
2850 case X86ISD::INSERTQI:
2851 case X86ISD::VALIGN:
2852 case X86ISD::PALIGNR:
2853 case X86ISD::VSHLDQ:
2854 case X86ISD::VSRLDQ:
2855 case X86ISD::MOVLHPS:
2856 case X86ISD::MOVHLPS:
2857 case X86ISD::MOVSHDUP:
2858 case X86ISD::MOVSLDUP:
2859 case X86ISD::MOVDDUP:
2860 case X86ISD::MOVSS:
2861 case X86ISD::MOVSD:
2862 case X86ISD::MOVSH:
2863 case X86ISD::UNPCKL:
2864 case X86ISD::UNPCKH:
2865 case X86ISD::VBROADCAST:
2866 case X86ISD::VPERMILPI:
2867 case X86ISD::VPERMILPV:
2868 case X86ISD::VPERM2X128:
2869 case X86ISD::SHUF128:
2870 case X86ISD::VPERMIL2:
2871 case X86ISD::VPERMI:
2872 case X86ISD::VPPERM:
2873 case X86ISD::VPERMV:
2874 case X86ISD::VPERMV3:
2875 case X86ISD::VZEXT_MOVL:
2876 return true;
2877 }
2878}
2879
2880static bool isTargetShuffleVariableMask(unsigned Opcode) {
2881 switch (Opcode) {
2882 default: return false;
2883 // Target Shuffles.
2884 case X86ISD::PSHUFB:
2885 case X86ISD::VPERMILPV:
2886 case X86ISD::VPERMIL2:
2887 case X86ISD::VPPERM:
2888 case X86ISD::VPERMV:
2889 case X86ISD::VPERMV3:
2890 return true;
2891 // 'Faux' Target Shuffles.
2892 case ISD::OR:
2893 case ISD::AND:
2894 case X86ISD::ANDNP:
2895 return true;
2896 }
2897}
2898
2901 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2903 int ReturnAddrIndex = FuncInfo->getRAIndex();
2904
2905 if (ReturnAddrIndex == 0) {
2906 // Set up a frame object for the return address.
2907 unsigned SlotSize = RegInfo->getSlotSize();
2908 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2909 -(int64_t)SlotSize,
2910 false);
2911 FuncInfo->setRAIndex(ReturnAddrIndex);
2912 }
2913
2914 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2915}
2916
2918 bool HasSymbolicDisplacement) {
2919 // Offset should fit into 32 bit immediate field.
2920 if (!isInt<32>(Offset))
2921 return false;
2922
2923 // If we don't have a symbolic displacement - we don't have any extra
2924 // restrictions.
2925 if (!HasSymbolicDisplacement)
2926 return true;
2927
2928 // We can fold large offsets in the large code model because we always use
2929 // 64-bit offsets.
2930 if (CM == CodeModel::Large)
2931 return true;
2932
2933 // For the kernel code model we know that all objects reside in the negative
2934 // half of the 32-bit address space. We must not accept negative offsets, since
2935 // they could fall out of range, but we can accept fairly large positive ones.
2936 if (CM == CodeModel::Kernel)
2937 return Offset >= 0;
2938
2939 // For other non-large code models we assume that the last small object ends
2940 // 16MB before the 31-bit boundary. We can also accept fairly large negative
2941 // constants, knowing that all objects are in the positive half of the address
2942 // space.
2943 return Offset < 16 * 1024 * 1024;
2944}
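// Editorial note (not part of the original source): a worked example of the
// rules above, assuming a symbolic displacement is present. Under the kernel
// code model, Offset = 0x1000 is accepted (non-negative) while Offset = -8 is
// rejected; under the small code model, Offset = 8 * 1024 * 1024 is accepted
// (below 16MB) while Offset = 32 * 1024 * 1024 is rejected.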
2945
2946/// Return true if the condition is a signed comparison operation.
2947static bool isX86CCSigned(unsigned X86CC) {
2948 switch (X86CC) {
2949 default:
2950 llvm_unreachable("Invalid integer condition!");
2951 case X86::COND_E:
2952 case X86::COND_NE:
2953 case X86::COND_B:
2954 case X86::COND_A:
2955 case X86::COND_BE:
2956 case X86::COND_AE:
2957 return false;
2958 case X86::COND_G:
2959 case X86::COND_GE:
2960 case X86::COND_L:
2961 case X86::COND_LE:
2962 return true;
2963 }
2964}
2965
2967 switch (SetCCOpcode) {
2968 // clang-format off
2969 default: llvm_unreachable("Invalid integer condition!");
2970 case ISD::SETEQ: return X86::COND_E;
2971 case ISD::SETGT: return X86::COND_G;
2972 case ISD::SETGE: return X86::COND_GE;
2973 case ISD::SETLT: return X86::COND_L;
2974 case ISD::SETLE: return X86::COND_LE;
2975 case ISD::SETNE: return X86::COND_NE;
2976 case ISD::SETULT: return X86::COND_B;
2977 case ISD::SETUGT: return X86::COND_A;
2978 case ISD::SETULE: return X86::COND_BE;
2979 case ISD::SETUGE: return X86::COND_AE;
2980 // clang-format on
2981 }
2982}
2983
2984/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
2985/// condition code, returning the condition code and the LHS/RHS of the
2986/// comparison to make.
2988 bool isFP, SDValue &LHS, SDValue &RHS,
2989 SelectionDAG &DAG) {
2990 if (!isFP) {
2991 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2992 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2993 // X > -1 -> X == 0, jump !sign.
2994 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2995 return X86::COND_NS;
2996 }
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2998 // X < 0 -> X == 0, jump on sign.
2999 return X86::COND_S;
3000 }
3001 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
3002 // X >= 0 -> X == 0, jump on !sign.
3003 return X86::COND_NS;
3004 }
3005 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
3006 // X < 1 -> X <= 0
3007 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3008 return X86::COND_LE;
3009 }
3010 }
3011
3012 return TranslateIntegerX86CC(SetCCOpcode);
3013 }
3014
3015 // First determine if it is required or is profitable to flip the operands.
3016
3017 // If LHS is a foldable load, but RHS is not, flip the condition.
3018 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3019 !ISD::isNON_EXTLoad(RHS.getNode())) {
3020 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3021 std::swap(LHS, RHS);
3022 }
3023
3024 switch (SetCCOpcode) {
3025 default: break;
3026 case ISD::SETOLT:
3027 case ISD::SETOLE:
3028 case ISD::SETUGT:
3029 case ISD::SETUGE:
3030 std::swap(LHS, RHS);
3031 break;
3032 }
3033
3034 // On a floating point condition, the flags are set as follows:
3035 // ZF PF CF op
3036 // 0 | 0 | 0 | X > Y
3037 // 0 | 0 | 1 | X < Y
3038 // 1 | 0 | 0 | X == Y
3039 // 1 | 1 | 1 | unordered
3040 switch (SetCCOpcode) {
3041 // clang-format off
3042 default: llvm_unreachable("Condcode should be pre-legalized away");
3043 case ISD::SETUEQ:
3044 case ISD::SETEQ: return X86::COND_E;
3045 case ISD::SETOLT: // flipped
3046 case ISD::SETOGT:
3047 case ISD::SETGT: return X86::COND_A;
3048 case ISD::SETOLE: // flipped
3049 case ISD::SETOGE:
3050 case ISD::SETGE: return X86::COND_AE;
3051 case ISD::SETUGT: // flipped
3052 case ISD::SETULT:
3053 case ISD::SETLT: return X86::COND_B;
3054 case ISD::SETUGE: // flipped
3055 case ISD::SETULE:
3056 case ISD::SETLE: return X86::COND_BE;
3057 case ISD::SETONE:
3058 case ISD::SETNE: return X86::COND_NE;
3059 case ISD::SETUO: return X86::COND_P;
3060 case ISD::SETO: return X86::COND_NP;
3061 case ISD::SETOEQ:
3062 case ISD::SETUNE: return X86::COND_INVALID;
3063 // clang-format on
3064 }
3065}
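// Editorial note (not part of the original source): two worked examples of the
// mapping above. An ordered less-than (SETOLT) first has its operands swapped
// and then maps to X86::COND_A (CF==0 and ZF==0 on the swapped compare), while
// SETUO maps to X86::COND_P, since the parity flag is only set for an
// unordered result.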
3066
3067/// Is there a floating point cmov for the specific X86 condition code?
3068/// The current x86 ISA includes the following FP cmov instructions:
3069/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3070static bool hasFPCMov(unsigned X86CC) {
3071 switch (X86CC) {
3072 default:
3073 return false;
3074 case X86::COND_B:
3075 case X86::COND_BE:
3076 case X86::COND_E:
3077 case X86::COND_P:
3078 case X86::COND_A:
3079 case X86::COND_AE:
3080 case X86::COND_NE:
3081 case X86::COND_NP:
3082 return true;
3083 }
3084}
3085
3086static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3087 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3088 VT.is512BitVector();
3089}
3090
3092 const CallInst &I,
3093 MachineFunction &MF,
3094 unsigned Intrinsic) const {
3096 Info.offset = 0;
3097
3098 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3099 if (!IntrData) {
3100 switch (Intrinsic) {
3101 case Intrinsic::x86_aesenc128kl:
3102 case Intrinsic::x86_aesdec128kl:
3104 Info.ptrVal = I.getArgOperand(1);
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3106 Info.align = Align(1);
3108 return true;
3109 case Intrinsic::x86_aesenc256kl:
3110 case Intrinsic::x86_aesdec256kl:
3112 Info.ptrVal = I.getArgOperand(1);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3114 Info.align = Align(1);
3116 return true;
3117 case Intrinsic::x86_aesencwide128kl:
3118 case Intrinsic::x86_aesdecwide128kl:
3120 Info.ptrVal = I.getArgOperand(0);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3122 Info.align = Align(1);
3124 return true;
3125 case Intrinsic::x86_aesencwide256kl:
3126 case Intrinsic::x86_aesdecwide256kl:
3128 Info.ptrVal = I.getArgOperand(0);
3129 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3130 Info.align = Align(1);
3132 return true;
3133 case Intrinsic::x86_cmpccxadd32:
3134 case Intrinsic::x86_cmpccxadd64:
3135 case Intrinsic::x86_atomic_bts:
3136 case Intrinsic::x86_atomic_btc:
3137 case Intrinsic::x86_atomic_btr: {
3139 Info.ptrVal = I.getArgOperand(0);
3140 unsigned Size = I.getType()->getScalarSizeInBits();
3141 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3142 Info.align = Align(Size);
3145 return true;
3146 }
3147 case Intrinsic::x86_atomic_bts_rm:
3148 case Intrinsic::x86_atomic_btc_rm:
3149 case Intrinsic::x86_atomic_btr_rm: {
3151 Info.ptrVal = I.getArgOperand(0);
3152 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3153 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3154 Info.align = Align(Size);
3157 return true;
3158 }
3159 case Intrinsic::x86_aadd32:
3160 case Intrinsic::x86_aadd64:
3161 case Intrinsic::x86_aand32:
3162 case Intrinsic::x86_aand64:
3163 case Intrinsic::x86_aor32:
3164 case Intrinsic::x86_aor64:
3165 case Intrinsic::x86_axor32:
3166 case Intrinsic::x86_axor64:
3167 case Intrinsic::x86_atomic_add_cc:
3168 case Intrinsic::x86_atomic_sub_cc:
3169 case Intrinsic::x86_atomic_or_cc:
3170 case Intrinsic::x86_atomic_and_cc:
3171 case Intrinsic::x86_atomic_xor_cc: {
3173 Info.ptrVal = I.getArgOperand(0);
3174 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3175 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3176 Info.align = Align(Size);
3179 return true;
3180 }
3181 }
3182 return false;
3183 }
3184
3185 switch (IntrData->Type) {
3188 case TRUNCATE_TO_MEM_VI32: {
3190 Info.ptrVal = I.getArgOperand(0);
3191 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3193 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3194 ScalarVT = MVT::i8;
3195 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3196 ScalarVT = MVT::i16;
3197 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3198 ScalarVT = MVT::i32;
3199
3200 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3201 Info.align = Align(1);
3203 break;
3204 }
3205 case GATHER:
3206 case GATHER_AVX2: {
3208 Info.ptrVal = nullptr;
3209 MVT DataVT = MVT::getVT(I.getType());
3210 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3211 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3212 IndexVT.getVectorNumElements());
3213 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3214 Info.align = Align(1);
3216 break;
3217 }
3218 case SCATTER: {
3220 Info.ptrVal = nullptr;
3221 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3222 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3223 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3224 IndexVT.getVectorNumElements());
3225 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3226 Info.align = Align(1);
3228 break;
3229 }
3230 default:
3231 return false;
3232 }
3233
3234 return true;
3235}
3236
3237/// Returns true if the target can instruction select the
3238/// specified FP immediate natively. If false, the legalizer will
3239/// materialize the FP immediate as a load from a constant pool.
3241 bool ForCodeSize) const {
3242 for (const APFloat &FPImm : LegalFPImmediates)
3243 if (Imm.bitwiseIsEqual(FPImm))
3244 return true;
3245 return false;
3246}
3247
3249 ISD::LoadExtType ExtTy,
3250 EVT NewVT) const {
3251 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3252
3253 // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3254 // relocation must target a movq or addq instruction: don't let the load shrink.
3255 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3256 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3257 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3258 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3259
3260 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
3261 // those uses are extracted directly into a store, then the extract + store
3262 // can be store-folded. Therefore, it's probably not worth splitting the load.
3263 EVT VT = Load->getValueType(0);
3264 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3265 for (SDUse &Use : Load->uses()) {
3266 // Skip uses of the chain value. Result 0 of the node is the load value.
3267 if (Use.getResNo() != 0)
3268 continue;
3269
3270 SDNode *User = Use.getUser();
3271
3272 // If this use is not an extract + store, it's probably worth splitting.
3273 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3274 User->user_begin()->getOpcode() != ISD::STORE)
3275 return true;
3276 }
3277 // All non-chain uses are extract + store.
3278 return false;
3279 }
3280
3281 return true;
3282}
3283
3284/// Returns true if it is beneficial to convert a load of a constant
3285/// to just the constant itself.
3287 Type *Ty) const {
3288 assert(Ty->isIntegerTy());
3289
3290 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3291 if (BitSize == 0 || BitSize > 64)
3292 return false;
3293 return true;
3294}
3295
3297 // If we are using XMM registers in the ABI and the condition of the select is
3298 // a floating-point compare and we have blendv or conditional move, then it is
3299 // cheaper to select instead of doing a cross-register move and creating a
3300 // load that depends on the compare result.
3301 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3302 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3303}
3304
3306 // TODO: It might be a win to ease or lift this restriction, but the generic
3307 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3308 if (VT.isVector() && Subtarget.hasAVX512())
3309 return false;
3310
3311 return true;
3312}
3313
3315 SDValue C) const {
3316 // TODO: We handle scalars using custom code, but generic combining could make
3317 // that unnecessary.
3318 APInt MulC;
3319 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3320 return false;
3321
3322 // Find the type this will be legalized to. Otherwise we might prematurely
3323 // convert this to shl+add/sub and then still have to type legalize those ops.
3324 // Another choice would be to defer the decision for illegal types until
3325 // after type legalization. But constant splat vectors of i64 can't make it
3326 // through type legalization on 32-bit targets so we would need to special
3327 // case vXi64.
3328 while (getTypeAction(Context, VT) != TypeLegal)
3329 VT = getTypeToTransformTo(Context, VT);
3330
3331 // If vector multiply is legal, assume that's faster than shl + add/sub.
3332 // Multiply is a complex op with higher latency and lower throughput in
3333 // most implementations; however, sub-vXi32 vector multiplies are always fast,
3334 // vXi32 is fine as long as the target has no slow PMULLD implementation, and
3335 // anything larger (vXi64) is always going to be slow.
3336 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3337 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3338 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3339 return false;
3340
3341 // shl+add, shl+sub, shl+add+neg
3342 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3343 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3344}
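// Editorial note (not part of the original source): worked examples of the
// decomposition test above:
//   mul x, 9  -> (x << 3) + x      (MulC - 1 is a power of two)
//   mul x, 7  -> (x << 3) - x      (MulC + 1 is a power of two)
//   mul x, -7 -> x - (x << 3)      (1 - MulC is a power of two)
//   mul x, -9 -> -((x << 3) + x)   (-(MulC + 1) is a power of two)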
3345
3347 unsigned Index) const {
3349 return false;
3350
3351 // Mask vectors support all subregister combinations and operations that
3352 // extract half of the vector.
3353 if (ResVT.getVectorElementType() == MVT::i1)
3354 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3355 (Index == ResVT.getVectorNumElements()));
3356
3357 return (Index % ResVT.getVectorNumElements()) == 0;
3358}
3359
3361 unsigned Opc = VecOp.getOpcode();
3362
3363 // Assume target opcodes can't be scalarized.
3364 // TODO - do we have any exceptions?
3365 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3366 return false;
3367
3368 // If the vector op is not supported, try to convert to scalar.
3369 EVT VecVT = VecOp.getValueType();
3370 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3371 return true;
3372
3373 // If the vector op is supported, but the scalar op is not, the transform may
3374 // not be worthwhile.
3375 EVT ScalarVT = VecVT.getScalarType();
3376 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3377}
3378
3380 bool) const {
3381 // TODO: Allow vectors?
3382 if (VT.isVector())
3383 return false;
3384 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3385}
3386
3388 // Speculate cttz only if we can directly use TZCNT/CMOV, can promote to
3389 // i32/i64 or can rely on BSF passthrough value.
3390 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3391 Subtarget.hasBitScanPassThrough() ||
3392 (!Ty->isVectorTy() &&
3393 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3394}
3395
3397 // Speculate ctlz only if we can directly use LZCNT/CMOV, or can rely on BSR
3398 // passthrough value.
3399 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV() ||
3400 Subtarget.hasBitScanPassThrough();
3401}
3402
3404 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3405 // expensive than a straight movsd. On the other hand, it's important to
3406 // shrink long double fp constant since fldt is very slow.
3407 return !Subtarget.hasSSE2() || VT == MVT::f80;
3408}
3409
3411 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3412 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3413}
3414
3416 const SelectionDAG &DAG,
3417 const MachineMemOperand &MMO) const {
3418 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3419 BitcastVT.getVectorElementType() == MVT::i1)
3420 return false;
3421
3422 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3423 return false;
3424
3425 // If both types are legal vectors, it's always ok to convert them.
3426 if (LoadVT.isVector() && BitcastVT.isVector() &&
3427 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3428 return true;
3429
3430 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3431}
3432
3434 const MachineFunction &MF) const {
3435 // Do not merge to float value size (128 bits) if no implicit
3436 // float attribute is set.
3437 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3438
3439 if (NoFloat) {
3440 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3441 return (MemVT.getSizeInBits() <= MaxIntSize);
3442 }
3443 // Make sure we don't merge greater than our preferred vector
3444 // width.
3445 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3446 return false;
3447
3448 return true;
3449}
3450
3452 return Subtarget.hasFastLZCNT();
3453}
3454
3456 const Instruction &AndI) const {
3457 return true;
3458}
3459
3461 EVT VT = Y.getValueType();
3462
3463 if (VT.isVector())
3464 return false;
3465
3466 if (!Subtarget.hasBMI())
3467 return false;
3468
3469 // There are only 32-bit and 64-bit forms for 'andn'.
3470 if (VT != MVT::i32 && VT != MVT::i64)
3471 return false;
3472
3473 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3474}
3475
3477 EVT VT = Y.getValueType();
3478
3479 if (!VT.isVector())
3480 return hasAndNotCompare(Y);
3481
3482 // Vector.
3483
3484 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3485 return false;
3486
3487 if (VT == MVT::v4i32)
3488 return true;
3489
3490 return Subtarget.hasSSE2();
3491}
3492
3494 return X.getValueType().isScalarInteger(); // 'bt'
3495}
3496
3500 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3501 SelectionDAG &DAG) const {
3502 // Does baseline recommend not to perform the fold by default?
3504 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3505 return false;
3506 // For scalars this transform is always beneficial.
3507 if (X.getValueType().isScalarInteger())
3508 return true;
3509 // If all the shift amounts are identical, then the transform is beneficial even
3510 // with rudimentary SSE2 shifts.
3511 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3512 return true;
3513 // If we have AVX2 with its powerful shift operations, then it's also good.
3514 if (Subtarget.hasAVX2())
3515 return true;
3516 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3517 return NewShiftOpcode == ISD::SHL;
3518}
3519
3521 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3522 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3523 if (!VT.isInteger())
3524 return ShiftOpc;
3525
3526 bool PreferRotate = false;
3527 if (VT.isVector()) {
3528 // For vectors, if we have rotate instruction support, then it's definitely
3529 // best. Otherwise it's not clear what's best, so just don't make changes.
3530 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3531 VT.getScalarType() == MVT::i64);
3532 } else {
3533 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3534 // rotate unless we have a zext mask+shr.
3535 PreferRotate = Subtarget.hasBMI2();
3536 if (!PreferRotate) {
3537 unsigned MaskBits =
3538 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3539 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3540 }
3541 }
3542
3543 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3544 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3545
3546 if (PreferRotate && MayTransformRotate)
3547 return ISD::ROTL;
3548
3549 // For vectors we don't really get much benefit from swapping around constants.
3550 // Maybe in the future we could check whether the DAG already has the flipped
3551 // node.
3552 if (VT.isVector())
3553 return ShiftOpc;
3554
3555 // See if it is beneficial to swap the shift type.
3556 if (ShiftOpc == ISD::SHL) {
3557 // If the current setup has an imm64 mask, then the inverse will have
3558 // at least an imm32 mask (or be a zext i32 -> i64).
3559 if (VT == MVT::i64)
3560 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3561 : ShiftOpc;
3562
3563 // We can only benefit if the mask requires at least 7 bits. We
3564 // don't want to replace shl by 1, 2, or 3 as those can be implemented
3565 // with lea/add.
3566 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3567 }
3568
3569 if (VT == MVT::i64)
3570 // Keep an exactly 32-bit imm64 mask; this is a zext i32 -> i64, which is
3571 // extremely efficient.
3572 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3573
3574 // Keep small shifts as shl so we can generate add/lea.
3575 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3576 }
3577
3578 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3579 // (PreferRotate will be set in the latter case).
3580 if (PreferRotate || !MayTransformRotate || VT.isVector())
3581 return ShiftOpc;
3582
3583 // Non-vector type and we have a zext mask with SRL.
3584 return ISD::SRL;
3585}
3586
3589 const Value *Lhs,
3590 const Value *Rhs) const {
3591 using namespace llvm::PatternMatch;
3592 int BaseCost = BrMergingBaseCostThresh.getValue();
3593 // With CCMP, branches can be merged in a more efficient way.
3594 if (BaseCost >= 0 && Subtarget.hasCCMP())
3595 BaseCost += BrMergingCcmpBias;
3596 // a == b && a == c is a fast pattern on x86.
3597 if (BaseCost >= 0 && Opc == Instruction::And &&
3600 BaseCost += 1;
3601 return {BaseCost, BrMergingLikelyBias.getValue(),
3602 BrMergingUnlikelyBias.getValue()};
3603}
3604
3606 return N->getOpcode() != ISD::FP_EXTEND;
3607}
3608
3610 const SDNode *N, CombineLevel Level) const {
3611 assert(((N->getOpcode() == ISD::SHL &&
3612 N->getOperand(0).getOpcode() == ISD::SRL) ||
3613 (N->getOpcode() == ISD::SRL &&
3614 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3615 "Expected shift-shift mask");
3616 // TODO: Should we always create i64 masks? Or only folded immediates?
3617 EVT VT = N->getValueType(0);
3618 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3619 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3620 // Only fold if the shift values are equal - so it folds to AND.
3621 // TODO - we should fold if either is a non-uniform vector but we don't do
3622 // the fold for non-splats yet.
3623 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3624 }
3626}
3627
3629 EVT VT = Y.getValueType();
3630
3631 // For vectors, we don't have a preference, but we probably want a mask.
3632 if (VT.isVector())
3633 return false;
3634
3635 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3636 if (VT == MVT::i64 && !Subtarget.is64Bit())
3637 return false;
3638
3639 return true;
3640}
3641
3644 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3646 !Subtarget.isOSWindows())
3649 ExpansionFactor);
3650}
3651
3653 // Any legal vector type can be splatted more efficiently than
3654 // loading/spilling from memory.
3655 return isTypeLegal(VT);
3656}
3657
3659 MVT VT = MVT::getIntegerVT(NumBits);
3660 if (isTypeLegal(VT))
3661 return VT;
3662
3663 // PMOVMSKB can handle this.
3664 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3665 return MVT::v16i8;
3666
3667 // VPMOVMSKB can handle this.
3668 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3669 return MVT::v32i8;
3670
3671 // TODO: Allow 64-bit type for 32-bit target.
3672 // TODO: 512-bit types should be allowed, but make sure that those
3673 // cases are handled in combineVectorSizedSetCCEquality().
3674
3676}
3677
3678/// Val is the undef sentinel value or equal to the specified value.
3679static bool isUndefOrEqual(int Val, int CmpVal) {
3680 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3681}
3682
3683/// Return true if every element in Mask is the undef sentinel value or equal to
3684/// the specified value.
3685static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3686 return llvm::all_of(Mask, [CmpVal](int M) {
3687 return (M == SM_SentinelUndef) || (M == CmpVal);
3688 });
3689}
3690
3691/// Return true if every element in Mask, beginning from position Pos and ending
3692/// in Pos+Size is the undef sentinel value or equal to the specified value.
3693static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3694 unsigned Size) {
3695 return llvm::all_of(Mask.slice(Pos, Size),
3696 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3697}
3698
3699/// Val is either the undef or zero sentinel value.
3700static bool isUndefOrZero(int Val) {
3701 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3702}
3703
3704/// Return true if every element in Mask, beginning from position Pos and ending
3705/// in Pos+Size is the undef sentinel value.
3706static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3707 return llvm::all_of(Mask.slice(Pos, Size),
3708 [](int M) { return M == SM_SentinelUndef; });
3709}
3710
3711/// Return true if the mask creates a vector whose lower half is undefined.
3713 unsigned NumElts = Mask.size();
3714 return isUndefInRange(Mask, 0, NumElts / 2);
3715}
3716
3717/// Return true if the mask creates a vector whose upper half is undefined.
3719 unsigned NumElts = Mask.size();
3720 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3721}
3722
3723/// Return true if Val falls within the specified range [Low, Hi).
3724static bool isInRange(int Val, int Low, int Hi) {
3725 return (Val >= Low && Val < Hi);
3726}
3727
3728/// Return true if the value of any element in Mask falls within the specified
3729/// range [Low, Hi).
3730static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3731 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3732}
3733
3734/// Return true if the value of any element in Mask is the zero sentinel value.
3735static bool isAnyZero(ArrayRef<int> Mask) {
3736 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3737}
3738
3739/// Return true if Val is undef or if its value falls within the
3740/// specified range [Low, Hi).
3741static bool isUndefOrInRange(int Val, int Low, int Hi) {
3742 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3743}
3744
3745/// Return true if every element in Mask is undef or if its value
3746/// falls within the specified range [Low, Hi).
3747static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3748 return llvm::all_of(
3749 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3750}
3751
3752/// Return true if Val is undef, zero or if its value falls within the
3753/// specified range [Low, Hi).
3754static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3755 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3756}
3757
3758/// Return true if every element in Mask is undef, zero or if its value
3759/// falls within the specified range [Low, Hi).
3760static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3761 return llvm::all_of(
3762 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3763}
3764
3765/// Return true if every element in Mask is an in-place blend/select mask or is
3766/// undef.
3768 unsigned NumElts = Mask.size();
3769 for (auto [I, M] : enumerate(Mask))
3770 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3771 return false;
3772 return true;
3773}
3774
3775/// Return true if every element in Mask, beginning
3776/// from position Pos and ending in Pos + Size, falls within the specified
3777/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3778static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3779 unsigned Size, int Low, int Step = 1) {
3780 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3781 if (!isUndefOrEqual(Mask[i], Low))
3782 return false;
3783 return true;
3784}
3785
3786/// Return true if every element in Mask, beginning
3787/// from position Pos and ending in Pos+Size, falls within the specified
3788/// sequential range [Low, Low+Size), or is undef or is zero.
3790 unsigned Size, int Low,
3791 int Step = 1) {
3792 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3793 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3794 return false;
3795 return true;
3796}
3797
3798/// Return true if every element in Mask, beginning
3799/// from position Pos and ending in Pos+Size is undef or is zero.
3800static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3801 unsigned Size) {
3802 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3803}
3804
3805/// Return true if every element of a single input is referenced by the shuffle
3806/// mask. i.e. it just permutes them all.
3808 unsigned NumElts = Mask.size();
3809 APInt DemandedElts = APInt::getZero(NumElts);
3810 for (int M : Mask)
3811 if (isInRange(M, 0, NumElts))
3812 DemandedElts.setBit(M);
3813 return DemandedElts.isAllOnes();
3814}
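// Editorial note (not part of the original source): for a 4-element mask,
// {3,1,0,2} references every input element and passes the check above, while
// {0,0,1,2} fails because element 3 is never referenced.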
3815
3816/// Helper function to test whether a shuffle mask could be
3817/// simplified by widening the elements being shuffled.
3818///
3819/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3820/// leaves it in an unspecified state.
3821///
3822/// NOTE: This must handle normal vector shuffle masks and *target* vector
3823/// shuffle masks. The latter have the special property of a '-2' representing
3824/// a zero-ed lane of a vector.
3826 SmallVectorImpl<int> &WidenedMask) {
3827 WidenedMask.assign(Mask.size() / 2, 0);
3828 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3829 int M0 = Mask[i];
3830 int M1 = Mask[i + 1];
3831
3832 // If both elements are undef, it's trivial.
3833 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3834 WidenedMask[i / 2] = SM_SentinelUndef;
3835 continue;
3836 }
3837
3838 // Check for an undef mask and a mask value properly aligned to fit with
3839 // a pair of values. If we find such a case, use the non-undef mask's value.
3840 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3841 WidenedMask[i / 2] = M1 / 2;
3842 continue;
3843 }
3844 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3845 WidenedMask[i / 2] = M0 / 2;
3846 continue;
3847 }
3848
3849 // When zeroing, we need to spread the zeroing across both lanes to widen.
3850 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3851 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3853 WidenedMask[i / 2] = SM_SentinelZero;
3854 continue;
3855 }
3856 return false;
3857 }
3858
3859 // Finally check if the two mask values are adjacent and aligned with
3860 // a pair.
3861 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3862 WidenedMask[i / 2] = M0 / 2;
3863 continue;
3864 }
3865
3866 // Otherwise we can't safely widen the elements used in this shuffle.
3867 return false;
3868 }
3869 assert(WidenedMask.size() == Mask.size() / 2 &&
3870 "Incorrect size of mask after widening the elements!");
3871
3872 return true;
3873}
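// Editorial sketch (not part of the original source): the same widening rule
// expressed over plain int arrays, using -1 for an undef lane and -2 for a
// zeroed lane, so it can be exercised without a SelectionDAG. The name and
// signature below are hypothetical.
static bool widenMaskSketch(const int *Mask, int NumElts, int *Widened) {
  for (int I = 0; I + 1 < NumElts; I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 == -1 && M1 == -1) {                            // both lanes undef
      Widened[I / 2] = -1;
    } else if (M0 == -1 && M1 >= 0 && (M1 % 2) == 1) {     // undef + aligned odd
      Widened[I / 2] = M1 / 2;
    } else if (M1 == -1 && M0 >= 0 && (M0 % 2) == 0) {     // aligned even + undef
      Widened[I / 2] = M0 / 2;
    } else if (M0 == -2 || M1 == -2) {                     // zero must cover the pair
      if ((M0 == -2 || M0 == -1) && (M1 == -2 || M1 == -1))
        Widened[I / 2] = -2;
      else
        return false;
    } else if (M0 >= 0 && (M0 % 2) == 0 && M0 + 1 == M1) { // adjacent aligned pair
      Widened[I / 2] = M0 / 2;
    } else {
      return false;
    }
  }
  return true;
}
// For example {0,1,6,7} widens to {0,3}, {-1,-1,4,5} widens to {-1,2}, and
// {1,2,4,5} cannot be widened.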
3874
3876 const APInt &Zeroable,
3877 bool V2IsZero,
3878 SmallVectorImpl<int> &WidenedMask) {
3879 // Create an alternative mask with info about zeroable elements.
3880 // Here we do not set undef elements as zeroable.
3881 SmallVector<int, 64> ZeroableMask(Mask);
3882 if (V2IsZero) {
3883 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3884 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3885 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3886 ZeroableMask[i] = SM_SentinelZero;
3887 }
3888 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3889}
3890
3892 SmallVector<int, 32> WidenedMask;
3893 return canWidenShuffleElements(Mask, WidenedMask);
3894}
3895
3896// Attempt to narrow/widen shuffle mask until it matches the target number of
3897// elements.
3898static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3899 SmallVectorImpl<int> &ScaledMask) {
3900 unsigned NumSrcElts = Mask.size();
3901 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3902 "Illegal shuffle scale factor");
3903
3904 // Narrowing is guaranteed to work.
3905 if (NumDstElts >= NumSrcElts) {
3906 int Scale = NumDstElts / NumSrcElts;
3907 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3908 return true;
3909 }
3910
3911 // We have to repeat the widening until we reach the target size, but we can
3912 // split out the first widening as it sets up ScaledMask for us.
3913 if (canWidenShuffleElements(Mask, ScaledMask)) {
3914 while (ScaledMask.size() > NumDstElts) {
3915 SmallVector<int, 16> WidenedMask;
3916 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3917 return false;
3918 ScaledMask = std::move(WidenedMask);
3919 }
3920 return true;
3921 }
3922
3923 return false;
3924}
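// Editorial note (not part of the original source): scaling examples for the
// helper above. Narrowing the 2-element mask {0,1} to 4 destination elements
// gives {0,1,2,3}; widening the 4-element mask {0,1,6,7} to 2 destination
// elements gives {0,3}; the 4-element mask {1,2,4,5} cannot be scaled down to
// 2 elements.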
3925
3926static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3927 SmallVector<int, 32> ScaledMask;
3928 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3929}
3930
3931/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3933 return isNullConstant(Elt) || isNullFPConstant(Elt);
3934}
3935
3936// Build a vector of constants.
3937// Use an UNDEF node if MaskElt == -1.
3938// Split 64-bit constants in 32-bit mode.
3940 const SDLoc &dl, bool IsMask = false) {
3941
3943 bool Split = false;
3944
3945 MVT ConstVecVT = VT;
3946 unsigned NumElts = VT.getVectorNumElements();
3947 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3948 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3949 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3950 Split = true;
3951 }
3952
3953 MVT EltVT = ConstVecVT.getVectorElementType();
3954 for (unsigned i = 0; i < NumElts; ++i) {
3955 bool IsUndef = Values[i] < 0 && IsMask;
3956 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3957 DAG.getConstant(Values[i], dl, EltVT);
3958 Ops.push_back(OpNode);
3959 if (Split)
3960 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3961 DAG.getConstant(0, dl, EltVT));
3962 }
3963 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3964 if (Split)
3965 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3966 return ConstsNode;
3967}
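// Editorial note (not part of the original source): on a 32-bit target, where
// i64 is not legal, a v2i64 constant {3, 7} built by the helper above is
// emitted as the v4i32 constant {3, 0, 7, 0} and then bitcast back to v2i64
// (little-endian lane order).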
3968
3969static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3970 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3971 assert(Bits.size() == Undefs.getBitWidth() &&
3972 "Unequal constant and undef arrays");
3974 bool Split = false;
3975
3976 MVT ConstVecVT = VT;
3977 unsigned NumElts = VT.getVectorNumElements();
3978 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3979 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3980 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3981 Split = true;
3982 }
3983
3984 MVT EltVT = ConstVecVT.getVectorElementType();
3985 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3986 if (Undefs[i]) {
3987 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3988 continue;
3989 }
3990 const APInt &V = Bits[i];
3991 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3992 if (Split) {
3993 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3994 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3995 } else if (EltVT == MVT::f32) {
3997 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3998 } else if (EltVT == MVT::f64) {
4000 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4001 } else {
4002 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4003 }
4004 }
4005
4006 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4007 return DAG.getBitcast(VT, ConstsNode);
4008}
4009
4011 SelectionDAG &DAG, const SDLoc &dl) {
4012 APInt Undefs = APInt::getZero(Bits.size());
4013 return getConstVector(Bits, Undefs, VT, DAG, dl);
4014}
4015
4016/// Returns a vector of specified type with all zero elements.
4017static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4018 SelectionDAG &DAG, const SDLoc &dl) {
4019 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4020 VT.getVectorElementType() == MVT::i1) &&
4021 "Unexpected vector type");
4022
4023 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4024 // type. This ensures they get CSE'd. But if the integer type is not
4025 // available, use a floating-point +0.0 instead.
4026 SDValue Vec;
4027 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4028 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4029 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4030 } else if (VT.isFloatingPoint() &&
4032 Vec = DAG.getConstantFP(+0.0, dl, VT);
4033 } else if (VT.getVectorElementType() == MVT::i1) {
4034 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4035 "Unexpected vector type");
4036 Vec = DAG.getConstant(0, dl, VT);
4037 } else {
4038 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4039 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4040 }
4041 return DAG.getBitcast(VT, Vec);
4042}
4043
4044// Helper to determine whether the ops are all extracted subvectors that come from
4045// a single source. If we allow commuting, they don't have to be in order (Lo/Hi).
4046static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4047 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4048 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4049 LHS.getValueType() != RHS.getValueType() ||
4050 LHS.getOperand(0) != RHS.getOperand(0))
4051 return SDValue();
4052
4053 SDValue Src = LHS.getOperand(0);
4054 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4055 return SDValue();
4056
4057 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4058 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4059 RHS.getConstantOperandAPInt(1) == NumElts) ||
4060 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4061 LHS.getConstantOperandAPInt(1) == NumElts))
4062 return Src;
4063
4064 return SDValue();
4065}
4066
4067static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4068 const SDLoc &dl, unsigned vectorWidth) {
4069 EVT VT = Vec.getValueType();
4070 EVT ElVT = VT.getVectorElementType();
4071 unsigned Factor = VT.getSizeInBits() / vectorWidth;
4072 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4073 VT.getVectorNumElements() / Factor);
4074
4075 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4076 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4077 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4078
4079 // This is the index of the first element of the vectorWidth-bit chunk
4080 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4081 IdxVal &= ~(ElemsPerChunk - 1);
4082
4083 // If the input is a buildvector just emit a smaller one.
4084 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4085 return DAG.getBuildVector(ResultVT, dl,
4086 Vec->ops().slice(IdxVal, ElemsPerChunk));
4087
4088 // Check if we're extracting the upper undef of a widening pattern.
4089 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4090 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4091 isNullConstant(Vec.getOperand(2)))
4092 return DAG.getUNDEF(ResultVT);
4093
4094 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4095 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4096}
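// Editorial note (not part of the original source): because the index is
// rounded down to a whole chunk above, extracting 128 bits from a v8f32 with
// IdxVal = 6 clears the low bits (ElemsPerChunk = 4) and extracts elements
// 4..7, i.e. the upper half.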
4097
4098/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4099/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4100/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4101/// instructions or a simple subregister reference. Idx is an index in the
4102/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4103/// lowering EXTRACT_VECTOR_ELT operations easier.
4104static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4105 SelectionDAG &DAG, const SDLoc &dl) {
4107 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4108 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4109}
4110
4111/// Generate a DAG to grab 256-bits from a 512-bit vector.
4112static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4113 SelectionDAG &DAG, const SDLoc &dl) {
4114 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4115 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4116}
4117
4118static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4119 SelectionDAG &DAG, const SDLoc &dl,
4120 unsigned vectorWidth) {
4121 assert((vectorWidth == 128 || vectorWidth == 256) &&
4122 "Unsupported vector width");
4123 // Inserting UNDEF is Result
4124 if (Vec.isUndef())
4125 return Result;
4126 EVT VT = Vec.getValueType();
4127 EVT ElVT = VT.getVectorElementType();
4128 EVT ResultVT = Result.getValueType();
4129
4130 // Insert the relevant vectorWidth bits.
4131 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4132 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4133
4134 // This is the index of the first element of the vectorWidth-bit chunk
4135 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4136 IdxVal &= ~(ElemsPerChunk - 1);
4137
4138 SDValue VecIdx = DAG.getVectorIdxConstant(IdxVal, dl);
4139 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4140}
4141
4142/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4143/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4144/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4145/// simple superregister reference. Idx is an index in the 128 bits
4146/// we want. It need not be aligned to a 128-bit boundary. That makes
4147/// lowering INSERT_VECTOR_ELT operations easier.
4148static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4149 SelectionDAG &DAG, const SDLoc &dl) {
4150 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4151 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4152}
4153
4154/// Widen a vector to a larger size with the same scalar type, with the new
4155/// elements either zero or undef.
4156static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4157 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4158 const SDLoc &dl) {
4159 EVT VecVT = Vec.getValueType();
4161 VecVT.getScalarType() == VT.getScalarType() &&
4162 "Unsupported vector widening type");
4163 // If the upper 128-bits of a build vector are already undef/zero, then try to
4164 // widen from the lower 128-bits.
4165 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4166 unsigned NumSrcElts = VecVT.getVectorNumElements();
4167 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4168 if (all_of(Hi, [&](SDValue V) {
4169 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4170 }))
4171 Vec = extract128BitVector(Vec, 0, DAG, dl);
4172 }
4173 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4174 : DAG.getUNDEF(VT);
4175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4176 DAG.getVectorIdxConstant(0, dl));
4177}
4178
4179/// Widen a vector to a larger size with the same scalar type, with the new
4180/// elements either zero or undef.
4181static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4182 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4183 const SDLoc &dl, unsigned WideSizeInBits) {
4184 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4185 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4186 "Unsupported vector widening type");
4187 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4188 MVT SVT = Vec.getSimpleValueType().getScalarType();
4189 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4190 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4191}
4192
4193/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4194/// and bitcast with integer types.
4195static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4196 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4197 unsigned NumElts = VT.getVectorNumElements();
4198 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4199 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4200 return VT;
4201}
4202
4203/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4204/// bitcast with integer types.
4205static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4206 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4207 const SDLoc &dl) {
4208 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4209 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4210}
4211
4212// Helper function to collect subvector ops that are concatenated together,
4213// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4214// The subvectors in Ops are guaranteed to be the same type.
4216 SelectionDAG &DAG) {
4217 assert(Ops.empty() && "Expected an empty ops vector");
4218
4219 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4220 Ops.append(N->op_begin(), N->op_end());
4221 return true;
4222 }
4223
4224 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4225 SDValue Src = N->getOperand(0);
4226 SDValue Sub = N->getOperand(1);
4227 const APInt &Idx = N->getConstantOperandAPInt(2);
4228 EVT VT = Src.getValueType();
4229 EVT SubVT = Sub.getValueType();
4230
4231 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4232 // insert_subvector(undef, x, lo)
4233 if (Idx == 0 && Src.isUndef()) {
4234 Ops.push_back(Sub);
4235 Ops.push_back(DAG.getUNDEF(SubVT));
4236 return true;
4237 }
4238 if (Idx == (VT.getVectorNumElements() / 2)) {
4239 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4240 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4241 Src.getOperand(1).getValueType() == SubVT &&
4242 isNullConstant(Src.getOperand(2))) {
4243 // Attempt to recurse into inner (matching) concats.
4244 SDValue Lo = Src.getOperand(1);
4245 SDValue Hi = Sub;
4246 SmallVector<SDValue, 2> LoOps, HiOps;
4247 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4248 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4249 LoOps.size() == HiOps.size()) {
4250 Ops.append(LoOps);
4251 Ops.append(HiOps);
4252 return true;
4253 }
4254 Ops.push_back(Lo);
4255 Ops.push_back(Hi);
4256 return true;
4257 }
4258 // insert_subvector(x, extract_subvector(x, lo), hi)
4259 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4260 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4261 Ops.append(2, Sub);
4262 return true;
4263 }
4264 // insert_subvector(undef, x, hi)
4265 if (Src.isUndef()) {
4266 Ops.push_back(DAG.getUNDEF(SubVT));
4267 Ops.push_back(Sub);
4268 return true;
4269 }
4270 }
4271 }
4272 }
4273
4274 return false;
4275}
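// Editorial note (not part of the original source): for v8i32 with v4i32
// subvectors, insert_subvector(insert_subvector(undef, X, 0), Y, 4) collects
// as {X, Y}, and insert_subvector(undef, X, 0) on its own collects as
// {X, undef}.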
4276
4277// Helper to check if \p V can be split into subvectors with the upper subvectors
4278// all undef, in which case return the lower half.
4280 SelectionDAG &DAG) {
4281 SmallVector<SDValue> SubOps;
4282 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4283 return SDValue();
4284
4285 unsigned NumSubOps = SubOps.size();
4286 unsigned HalfNumSubOps = NumSubOps / 2;
4287 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4288
4289 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4290 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4291 return SDValue();
4292
4293 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4294 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4295 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4296}
4297
4298// Helper to check if we can access all the constituent subvectors without any
4299// extract ops.
4300static bool isFreeToSplitVector(SDNode *N, SelectionDAG &DAG) {
4301 SmallVector<SDValue> Ops;
4302 return collectConcatOps(N, Ops, DAG);
4303}
4304
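// Split a vector into two equal halves, e.g. v8i32 -> (lo v4i32, hi v4i32).
// Splat values (with no undefs) return the lo half for both results, since
// extracting the low subvector is free.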
4305static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4306 const SDLoc &dl) {
4307 EVT VT = Op.getValueType();
4308 unsigned NumElems = VT.getVectorNumElements();
4309 unsigned SizeInBits = VT.getSizeInBits();
4310 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4311 "Can't split odd sized vector");
4312
4313 // If this is a splat value (with no-undefs) then use the lower subvector,
4314 // which should be a free extraction.
4315 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4316 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4317 return std::make_pair(Lo, Lo);
4318
4319 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4320 return std::make_pair(Lo, Hi);
4321}
4322
4323/// Break an operation into 2 half sized ops and then concatenate the results.
4324static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4325 unsigned NumOps = Op.getNumOperands();
4326 EVT VT = Op.getValueType();
4327
4328 // Extract the LHS Lo/Hi vectors
4329 SmallVector<SDValue> LoOps(NumOps, SDValue());
4330 SmallVector<SDValue> HiOps(NumOps, SDValue());
4331 for (unsigned I = 0; I != NumOps; ++I) {
4332 SDValue SrcOp = Op.getOperand(I);
4333 if (!SrcOp.getValueType().isVector()) {
4334 LoOps[I] = HiOps[I] = SrcOp;
4335 continue;
4336 }
4337 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4338 }
4339
4340 EVT LoVT, HiVT;
4341 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4342 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4343 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4344 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4345}
4346
4347/// Break an unary integer operation into 2 half sized ops and then
4348/// concatenate the result back.
4349static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4350 const SDLoc &dl) {
4351 // Make sure we only try to split 256/512-bit types to avoid creating
4352 // narrow vectors.
4353 [[maybe_unused]] EVT VT = Op.getValueType();
4354 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4355 Op.getOperand(0).getValueType().is512BitVector()) &&
4356 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4357 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4358 VT.getVectorNumElements() &&
4359 "Unexpected VTs!");
4360 return splitVectorOp(Op, DAG, dl);
4361}
4362
4363/// Break a binary integer operation into 2 half sized ops and then
4364/// concatenate the result back.
4365static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4366 const SDLoc &dl) {
4367 // Assert that all the types match.
4368 [[maybe_unused]] EVT VT = Op.getValueType();
4369 assert(Op.getOperand(0).getValueType() == VT &&
4370 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4371 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4372 return splitVectorOp(Op, DAG, dl);
4373}
4374
4375// Helper for splitting operands of an operation to legal target size and
4376// applying a function on each part.
4377// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4378// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4379// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4380// The argument Builder is a function that will be applied on each split part:
4381// SDValue Builder(SelectionDAG &G, const SDLoc &DL, ArrayRef<SDValue> Ops)
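// Typical usage (illustrative sketch only; the builder lambda and operand
// names below are arbitrary):
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {A, B}, AddBuilder);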
4382template <typename F>
4383SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4384 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4385 F Builder, bool CheckBWI = true) {
4386 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4387 unsigned NumSubs = 1;
4388 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4389 (!CheckBWI && Subtarget.useAVX512Regs())) {
4390 if (VT.getSizeInBits() > 512) {
4391 NumSubs = VT.getSizeInBits() / 512;
4392 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4393 }
4394 } else if (Subtarget.hasAVX2()) {
4395 if (VT.getSizeInBits() > 256) {
4396 NumSubs = VT.getSizeInBits() / 256;
4397 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4398 }
4399 } else {
4400 if (VT.getSizeInBits() > 128) {
4401 NumSubs = VT.getSizeInBits() / 128;
4402 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4403 }
4404 }
4405
4406 if (NumSubs == 1)
4407 return Builder(DAG, DL, Ops);
4408
4409 SmallVector<SDValue, 4> Subs;
4410 for (unsigned i = 0; i != NumSubs; ++i) {
4411 SmallVector<SDValue, 2> SubOps;
4412 for (SDValue Op : Ops) {
4413 EVT OpVT = Op.getValueType();
4414 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4415 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4416 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4417 }
4418 Subs.push_back(Builder(DAG, DL, SubOps));
4419 }
4420 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4421}
4422
4423// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4424// targets.
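// For example, on AVX512F without VLX a v8i32 operation is widened to v16i32,
// executed as a 512-bit node, and the low v8i32 subvector is extracted back.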
4425static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4426 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4427 const X86Subtarget &Subtarget) {
4428 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4429 MVT SVT = VT.getScalarType();
4430
4431 // If we have a 32/64 splatted constant, splat it to DstTy to
4432 // encourage a foldable broadcast'd operand.
4433 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4434 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4435 // AVX512 broadcasts 32/64-bit operands.
4436 // TODO: Support float once getAVX512Node is used by fp-ops.
4437 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4439 return SDValue();
4440 // If we're not widening, don't bother if we're not bitcasting.
4441 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4442 return SDValue();
4443 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4444 APInt SplatValue, SplatUndef;
4445 unsigned SplatBitSize;
4446 bool HasAnyUndefs;
4447 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4448 HasAnyUndefs, OpEltSizeInBits) &&
4449 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4450 return DAG.getConstant(SplatValue, DL, DstVT);
4451 }
4452 return SDValue();
4453 };
4454
4455 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4456
4457 MVT DstVT = VT;
4458 if (Widen)
4459 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4460
4461 // Canonicalize src operands.
4462 SmallVector<SDValue> SrcOps(Ops);
4463 for (SDValue &Op : SrcOps) {
4464 MVT OpVT = Op.getSimpleValueType();
4465 // Just pass through scalar operands.
4466 if (!OpVT.isVector())
4467 continue;
4468 assert(OpVT == VT && "Vector type mismatch");
4469
4470 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4471 Op = BroadcastOp;
4472 continue;
4473 }
4474
4475 // Just widen the subvector by inserting into an undef wide vector.
4476 if (Widen)
4477 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4478 }
4479
4480 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4481
4482 // Perform the 512-bit op then extract the bottom subvector.
4483 if (Widen)
4484 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4485 return Res;
4486}
4487
4488/// Insert i1-subvector to i1-vector.
4489static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4490 const X86Subtarget &Subtarget) {
4491
4492 SDLoc dl(Op);
4493 SDValue Vec = Op.getOperand(0);
4494 SDValue SubVec = Op.getOperand(1);
4495 SDValue Idx = Op.getOperand(2);
4496 unsigned IdxVal = Op.getConstantOperandVal(2);
4497
4498 // Inserting undef is a nop. We can just return the original vector.
4499 if (SubVec.isUndef())
4500 return Vec;
4501
4502 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4503 return Op;
4504
4505 MVT OpVT = Op.getSimpleValueType();
4506 unsigned NumElems = OpVT.getVectorNumElements();
4507 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
4508
4509 // Extend to natively supported kshift.
4510 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4511
4512 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4513 // if necessary.
4514 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4515 // May need to promote to a legal type.
4516 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4517 DAG.getConstant(0, dl, WideOpVT),
4518 SubVec, Idx);
4519 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4520 }
4521
4522 MVT SubVecVT = SubVec.getSimpleValueType();
4523 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4524 assert(IdxVal + SubVecNumElems <= NumElems &&
4525 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4526 "Unexpected index value in INSERT_SUBVECTOR");
4527
4528 SDValue Undef = DAG.getUNDEF(WideOpVT);
4529
4530 if (IdxVal == 0) {
4531 // Zero lower bits of the Vec
4532 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4533 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4534 ZeroIdx);
4535 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4536 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4537 // Merge them together, SubVec should be zero extended.
4538 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4539 DAG.getConstant(0, dl, WideOpVT),
4540 SubVec, ZeroIdx);
4541 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4542 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4543 }
4544
4545 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4546 Undef, SubVec, ZeroIdx);
4547
4548 if (Vec.isUndef()) {
4549 assert(IdxVal != 0 && "Unexpected index");
4550 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4551 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4552 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4553 }
4554
4555 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4556 assert(IdxVal != 0 && "Unexpected index");
4557 // If upper elements of Vec are known undef, then just shift into place.
4558 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4559 [](SDValue V) { return V.isUndef(); })) {
4560 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4561 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4562 } else {
4563 NumElems = WideOpVT.getVectorNumElements();
4564 unsigned ShiftLeft = NumElems - SubVecNumElems;
4565 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4566 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4567 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4568 if (ShiftRight != 0)
4569 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4570 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4571 }
4572 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4573 }
4574
4575 // Simple case when we put subvector in the upper part
4576 if (IdxVal + SubVecNumElems == NumElems) {
4577 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4578 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4579 if (SubVecNumElems * 2 == NumElems) {
4580 // Special case, use legal zero extending insert_subvector. This allows
4581 // isel to optimize when bits are known zero.
4582 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4583 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4584 DAG.getConstant(0, dl, WideOpVT),
4585 Vec, ZeroIdx);
4586 } else {
4587 // Otherwise use explicit shifts to zero the bits.
4588 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4589 Undef, Vec, ZeroIdx);
4590 NumElems = WideOpVT.getVectorNumElements();
4591 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4592 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4593 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4594 }
4595 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4596 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4597 }
4598
4599 // Inserting into the middle is more complicated.
4600
4601 NumElems = WideOpVT.getVectorNumElements();
4602
4603 // Widen the vector if needed.
4604 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4605
4606 unsigned ShiftLeft = NumElems - SubVecNumElems;
4607 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4608
4609 // Do an optimization for the most frequently used types.
4610 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4611 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4612 Mask0.flipAllBits();
4613 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4614 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4615 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4616 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4617 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4618 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4619 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4620 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4621
4622 // Reduce to original width if needed.
4623 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4624 }
4625
4626 // Clear the upper bits of the subvector and move it to its insert position.
4627 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4628 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4629 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4630 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4631
4632 // Isolate the bits below the insertion point.
4633 unsigned LowShift = NumElems - IdxVal;
4634 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4635 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4636 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4637 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4638
4639 // Isolate the bits after the last inserted bit.
4640 unsigned HighShift = IdxVal + SubVecNumElems;
4641 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4642 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4643 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4644 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4645
4646 // Now OR all 3 pieces together.
4647 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4648 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4649
4650 // Reduce to original width if needed.
4651 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4652}
4653
4654static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4655 const SDLoc &dl) {
4656 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4657 EVT SubVT = V1.getValueType();
4658 EVT SubSVT = SubVT.getScalarType();
4659 unsigned SubNumElts = SubVT.getVectorNumElements();
4660 unsigned SubVectorWidth = SubVT.getSizeInBits();
4661 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4662 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4663 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4664}
4665
4666/// Returns a vector of specified type with all bits set.
4667/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4668/// Then bitcast to their original type, ensuring they get CSE'd.
4669static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4670 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4671 "Expected a 128/256/512-bit vector type");
4672 unsigned NumElts = VT.getSizeInBits() / 32;
4673 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4674 return DAG.getBitcast(VT, Vec);
4675}
4676
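// Helper to create an any/sign/zero extension node, switching to the
// *_EXTEND_VECTOR_INREG form (and trimming the input to its low half/quarter)
// when the source has more elements than the result.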
4677static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4678 SDValue In, SelectionDAG &DAG) {
4679 EVT InVT = In.getValueType();
4680 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4681 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4682 ISD::ZERO_EXTEND == Opcode) &&
4683 "Unknown extension opcode");
4684
4685 // For 256-bit vectors, we only need the lower (128-bit) input half.
4686 // For 512-bit vectors, we only need the lower input half or quarter.
4687 if (InVT.getSizeInBits() > 128) {
4688 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4689 "Expected VTs to be the same size!");
4690 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4691 In = extractSubVector(In, 0, DAG, DL,
4692 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4693 InVT = In.getValueType();
4694 }
4695
4696 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4697 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4698
4699 return DAG.getNode(Opcode, DL, VT, In);
4700}
4701
4702// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4703static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4704 SDValue Mask, SelectionDAG &DAG) {
4705 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4706 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4707 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4708}
4709
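/// Create a shuffle mask for an unpacklo/unpackhi operation, respecting the
/// 128-bit lane structure. For example, a v8i16 binary unpack produces:
///   Lo --> <0, 8, 1, 9, 2, 10, 3, 11>
///   Hi --> <4, 12, 5, 13, 6, 14, 7, 15>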
4710static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4711 bool Lo, bool Unary) {
4712 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4713 "Illegal vector type to unpack");
4714 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4715 int NumElts = VT.getVectorNumElements();
4716 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4717 for (int i = 0; i < NumElts; ++i) {
4718 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4719 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4720 Pos += (Unary ? 0 : NumElts * (i % 2));
4721 Pos += (Lo ? 0 : NumEltsInLane / 2);
4722 Mask.push_back(Pos);
4723 }
4724}
4725
4726/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4727/// imposed by AVX and specific to the unary pattern. Example:
4728/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4729/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4730static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4731 bool Lo) {
4732 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4733 int NumElts = VT.getVectorNumElements();
4734 for (int i = 0; i < NumElts; ++i) {
4735 int Pos = i / 2;
4736 Pos += (Lo ? 0 : NumElts / 2);
4737 Mask.push_back(Pos);
4738 }
4739}
4740
4741// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4742static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4743 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4744 if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4745 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4746 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4747 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4748 int M = Mask[I];
4749 if (M < 0)
4750 continue;
4751 SDValue V = (M < NumElts) ? V1 : V2;
4752 if (V.isUndef())
4753 continue;
4754 Ops[I] = V.getOperand(M % NumElts);
4755 }
4756 return DAG.getBuildVector(VT, dl, Ops);
4757 }
4758
4759 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4760}
4761
4762/// Returns a vector_shuffle node for an unpackl operation.
4763static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4764 SDValue V1, SDValue V2) {
4765 SmallVector<int, 8> Mask;
4766 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4767 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4768}
4769
4770/// Returns a vector_shuffle node for an unpackh operation.
4771static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4772 SDValue V1, SDValue V2) {
4773 SmallVector<int, 8> Mask;
4774 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4775 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4776}
4777
4778/// Returns a node that packs the LHS + RHS nodes together at half width.
4779/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4780/// TODO: Add subvector splitting if/when we have a need for it.
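/// For example, packing v8i16 LHS/RHS into v16i8 with PackHiHalf=false keeps
/// the low byte of each i16 element; PackHiHalf=true keeps the high byte.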
4781static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4782 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4783 bool PackHiHalf = false) {
4784 MVT OpVT = LHS.getSimpleValueType();
4785 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4786 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4787 assert(OpVT == RHS.getSimpleValueType() &&
4788 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4789 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4790 "Unexpected PACK operand types");
4791 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4792 "Unexpected PACK result type");
4793
4794 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4795 if (EltSizeInBits == 32) {
4796 SmallVector<int> PackMask;
4797 int Offset = PackHiHalf ? 1 : 0;
4798 int NumElts = VT.getVectorNumElements();
4799 for (int I = 0; I != NumElts; I += 4) {
4800 PackMask.push_back(I + Offset);
4801 PackMask.push_back(I + Offset + 2);
4802 PackMask.push_back(I + Offset + NumElts);
4803 PackMask.push_back(I + Offset + NumElts + 2);
4804 }
4805 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4806 DAG.getBitcast(VT, RHS), PackMask);
4807 }
4808
4809 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4810 if (!PackHiHalf) {
4811 if (UsePackUS &&
4812 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4813 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4814 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4815
4816 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4817 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4818 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4819 }
4820
4821 // Fallback to sign/zero extending the requested half and pack.
4822 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4823 if (UsePackUS) {
4824 if (PackHiHalf) {
4825 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4826 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4827 } else {
4828 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4829 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4830 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4831 }
4832 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4833 }
4834
4835 if (!PackHiHalf) {
4836 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4837 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4838 }
4839 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4840 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4841 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4842}
4843
4844/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4845/// This produces a shuffle where the low element of V2 is swizzled into the
4846/// zero/undef vector, landing at element Idx.
4847/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4848static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4849 bool IsZero,
4850 const X86Subtarget &Subtarget,
4851 SelectionDAG &DAG) {
4852 MVT VT = V2.getSimpleValueType();
4853 SDValue V1 = IsZero
4854 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4855 int NumElems = VT.getVectorNumElements();
4856 SmallVector<int, 16> MaskVec(NumElems);
4857 for (int i = 0; i != NumElems; ++i)
4858 // If this is the insertion idx, put the low elt of V2 here.
4859 MaskVec[i] = (i == Idx) ? NumElems : i;
4860 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4861}
4862
4863static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4864 if (Ptr.getOpcode() == X86ISD::Wrapper ||
4865 Ptr.getOpcode() == X86ISD::WrapperRIP)
4866 Ptr = Ptr.getOperand(0);
4867 return dyn_cast<ConstantPoolSDNode>(Ptr);
4868}
4869
4870// TODO: Add support for non-zero offsets.
4871static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4872 ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4873 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4874 return nullptr;
4875 return CNode->getConstVal();
4876}
4877
4878static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4879 if (!Load || !ISD::isNormalLoad(Load))
4880 return nullptr;
4881 return getTargetConstantFromBasePtr(Load->getBasePtr());
4882}
4883
4884static const Constant *getTargetConstantFromNode(SDValue Op) {
4885 Op = peekThroughBitcasts(Op);
4886 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4887}
4888
4889const Constant *
4890X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4891 assert(LD && "Unexpected null LoadSDNode");
4892 return getTargetConstantFromNode(LD);
4893}
4894
4895// Extract raw constant bits from constant pools.
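// On success, UndefElts holds a per-element undef mask and EltBits the raw
// constant bits, re-split into EltSizeInBits-wide elements regardless of the
// source element width.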
4896static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4897 APInt &UndefElts,
4898 SmallVectorImpl<APInt> &EltBits,
4899 bool AllowWholeUndefs = true,
4900 bool AllowPartialUndefs = false) {
4901 assert(EltBits.empty() && "Expected an empty EltBits vector");
4902
4903 Op = peekThroughBitcasts(Op);
4904
4905 EVT VT = Op.getValueType();
4906 unsigned SizeInBits = VT.getSizeInBits();
4907 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4908 unsigned NumElts = SizeInBits / EltSizeInBits;
4909
4910 // Bitcast a source array of element bits to the target size.
4911 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4912 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4913 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4914 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4915 "Constant bit sizes don't match");
4916
4917 // Don't split if we don't allow undef bits.
4918 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4919 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4920 return false;
4921
4922 // If we're already the right size, don't bother bitcasting.
4923 if (NumSrcElts == NumElts) {
4924 UndefElts = UndefSrcElts;
4925 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4926 return true;
4927 }
4928
4929 // Extract all the undef/constant element data and pack into single bitsets.
4930 APInt UndefBits(SizeInBits, 0);
4931 APInt MaskBits(SizeInBits, 0);
4932
4933 for (unsigned i = 0; i != NumSrcElts; ++i) {
4934 unsigned BitOffset = i * SrcEltSizeInBits;
4935 if (UndefSrcElts[i])
4936 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4937 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4938 }
4939
4940 // Split the undef/constant single bitset data into the target elements.
4941 UndefElts = APInt(NumElts, 0);
4942 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4943
4944 for (unsigned i = 0; i != NumElts; ++i) {
4945 unsigned BitOffset = i * EltSizeInBits;
4946 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4947
4948 // Only treat an element as UNDEF if all bits are UNDEF.
4949 if (UndefEltBits.isAllOnes()) {
4950 if (!AllowWholeUndefs)
4951 return false;
4952 UndefElts.setBit(i);
4953 continue;
4954 }
4955
4956 // If only some bits are UNDEF then treat them as zero (or bail if not
4957 // supported).
4958 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4959 return false;
4960
4961 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4962 }
4963 return true;
4964 };
4965
4966 // Collect constant bits and insert into mask/undef bit masks.
4967 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4968 unsigned UndefBitIndex) {
4969 if (!Cst)
4970 return false;
4971 if (isa<UndefValue>(Cst)) {
4972 Undefs.setBit(UndefBitIndex);
4973 return true;
4974 }
4975 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4976 Mask = CInt->getValue();
4977 return true;
4978 }
4979 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4980 Mask = CFP->getValueAPF().bitcastToAPInt();
4981 return true;
4982 }
4983 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4984 Type *Ty = CDS->getType();
4985 Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4986 Type *EltTy = CDS->getElementType();
4987 bool IsInteger = EltTy->isIntegerTy();
4988 bool IsFP =
4989 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4990 if (!IsInteger && !IsFP)
4991 return false;
4992 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4993 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4994 if (IsInteger)
4995 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4996 else
4997 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4998 I * EltBits);
4999 return true;
5000 }
5001 return false;
5002 };
5003
5004 // Handle UNDEFs.
5005 if (Op.isUndef()) {
5006 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
5007 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5008 return CastBitData(UndefSrcElts, SrcEltBits);
5009 }
5010
5011 // Extract scalar constant bits.
5012 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5013 APInt UndefSrcElts = APInt::getZero(1);
5014 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5015 return CastBitData(UndefSrcElts, SrcEltBits);
5016 }
5017 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5018 APInt UndefSrcElts = APInt::getZero(1);
5019 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5020 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5021 return CastBitData(UndefSrcElts, SrcEltBits);
5022 }
5023
5024 // Extract constant bits from build vector.
5025 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5026 BitVector Undefs;
5027 SmallVector<APInt> SrcEltBits;
5028 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5029 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5030 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5031 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5032 if (Undefs[I])
5033 UndefSrcElts.setBit(I);
5034 return CastBitData(UndefSrcElts, SrcEltBits);
5035 }
5036 }
5037
5038 // Extract constant bits from constant pool vector.
5039 if (auto *Cst = getTargetConstantFromNode(Op)) {
5040 Type *CstTy = Cst->getType();
5041 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5042 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5043 return false;
5044
5045 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5046 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5047 if ((SizeInBits % SrcEltSizeInBits) != 0)
5048 return false;
5049
5050 APInt UndefSrcElts(NumSrcElts, 0);
5051 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5052 for (unsigned i = 0; i != NumSrcElts; ++i)
5053 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5054 UndefSrcElts, i))
5055 return false;
5056
5057 return CastBitData(UndefSrcElts, SrcEltBits);
5058 }
5059
5060 // Extract constant bits from a broadcasted constant pool scalar.
5061 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5062 EltSizeInBits <= VT.getScalarSizeInBits()) {
5063 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5064 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5065 return false;
5066
5067 SDValue Ptr = MemIntr->getBasePtr();
5068 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5069 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5070 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5071
5072 APInt UndefSrcElts(NumSrcElts, 0);
5073 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5074 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5075 if (UndefSrcElts[0])
5076 UndefSrcElts.setBits(0, NumSrcElts);
5077 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5078 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5079 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5080 return CastBitData(UndefSrcElts, SrcEltBits);
5081 }
5082 }
5083 }
5084
5085 // Extract constant bits from a subvector broadcast.
5086 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5087 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5088 SDValue Ptr = MemIntr->getBasePtr();
5089 // The source constant may be larger than the subvector broadcast, so
5090 // ensure we extract the correct subvector constants.
5091 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5092 Type *CstTy = Cst->getType();
5093 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5094 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5095 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5096 (SizeInBits % SubVecSizeInBits) != 0)
5097 return false;
5098 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5099 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5100 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5101 APInt UndefSubElts(NumSubElts, 0);
5102 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5103 APInt(CstEltSizeInBits, 0));
5104 for (unsigned i = 0; i != NumSubElts; ++i) {
5105 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5106 UndefSubElts, i))
5107 return false;
5108 for (unsigned j = 1; j != NumSubVecs; ++j)
5109 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5110 }
5111 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5112 UndefSubElts);
5113 return CastBitData(UndefSubElts, SubEltBits);
5114 }
5115 }
5116
5117 // Extract a rematerialized scalar constant insertion.
5118 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5119 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5120 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5121 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5122 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5123
5124 APInt UndefSrcElts(NumSrcElts, 0);
5125 SmallVector<APInt, 64> SrcEltBits;
5126 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5127 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5128 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5129 return CastBitData(UndefSrcElts, SrcEltBits);
5130 }
5131
5132 // Insert constant bits from base and sub vector sources.
5133 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5134 // If bitcasting to larger elements we might lose track of undefs, so don't
5135 // allow any to be safe.
5136 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5137 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5138
5139 APInt UndefSrcElts, UndefSubElts;
5140 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5141 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5142 UndefSubElts, EltSubBits,
5143 AllowWholeUndefs && AllowUndefs,
5144 AllowPartialUndefs && AllowUndefs) &&
5145 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5146 UndefSrcElts, EltSrcBits,
5147 AllowWholeUndefs && AllowUndefs,
5148 AllowPartialUndefs && AllowUndefs)) {
5149 unsigned BaseIdx = Op.getConstantOperandVal(2);
5150 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5151 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5152 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5153 return CastBitData(UndefSrcElts, EltSrcBits);
5154 }
5155 }
5156
5157 // Extract constant bits from a subvector's source.
5158 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5159 // TODO - support extract_subvector through bitcasts.
5160 if (EltSizeInBits != VT.getScalarSizeInBits())
5161 return false;
5162
5163 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5164 UndefElts, EltBits, AllowWholeUndefs,
5165 AllowPartialUndefs)) {
5166 EVT SrcVT = Op.getOperand(0).getValueType();
5167 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5168 unsigned NumSubElts = VT.getVectorNumElements();
5169 unsigned BaseIdx = Op.getConstantOperandVal(1);
5170 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5171 if ((BaseIdx + NumSubElts) != NumSrcElts)
5172 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5173 if (BaseIdx != 0)
5174 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5175 return true;
5176 }
5177 }
5178
5179 // Extract constant bits from shuffle node sources.
5180 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5181 // TODO - support shuffle through bitcasts.
5182 if (EltSizeInBits != VT.getScalarSizeInBits())
5183 return false;
5184
5185 ArrayRef<int> Mask = SVN->getMask();
5186 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5187 llvm::any_of(Mask, [](int M) { return M < 0; }))
5188 return false;
5189
5190 APInt UndefElts0, UndefElts1;
5191 SmallVector<APInt, 32> EltBits0, EltBits1;
5192 if (isAnyInRange(Mask, 0, NumElts) &&
5193 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5194 UndefElts0, EltBits0, AllowWholeUndefs,
5195 AllowPartialUndefs))
5196 return false;
5197 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5198 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5199 UndefElts1, EltBits1, AllowWholeUndefs,
5200 AllowPartialUndefs))
5201 return false;
5202
5203 UndefElts = APInt::getZero(NumElts);
5204 for (int i = 0; i != (int)NumElts; ++i) {
5205 int M = Mask[i];
5206 if (M < 0) {
5207 UndefElts.setBit(i);
5208 EltBits.push_back(APInt::getZero(EltSizeInBits));
5209 } else if (M < (int)NumElts) {
5210 if (UndefElts0[M])
5211 UndefElts.setBit(i);
5212 EltBits.push_back(EltBits0[M]);
5213 } else {
5214 if (UndefElts1[M - NumElts])
5215 UndefElts.setBit(i);
5216 EltBits.push_back(EltBits1[M - NumElts]);
5217 }
5218 }
5219 return true;
5220 }
5221
5222 return false;
5223}
5224
5225namespace llvm {
5226namespace X86 {
5227bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5228 APInt UndefElts;
5229 SmallVector<APInt, 16> EltBits;
5230 if (getTargetConstantBitsFromNode(
5231 Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5232 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5233 int SplatIndex = -1;
5234 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5235 if (UndefElts[i])
5236 continue;
5237 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5238 SplatIndex = -1;
5239 break;
5240 }
5241 SplatIndex = i;
5242 }
5243 if (0 <= SplatIndex) {
5244 SplatVal = EltBits[SplatIndex];
5245 return true;
5246 }
5247 }
5248
5249 return false;
5250}
5251} // namespace X86
5252} // namespace llvm
5253
5254static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5255 unsigned MaskEltSizeInBits,
5256 SmallVectorImpl<uint64_t> &RawMask,
5257 APInt &UndefElts) {
5258 // Extract the raw target constant bits.
5259 SmallVector<APInt, 64> EltBits;
5260 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5261 EltBits, /* AllowWholeUndefs */ true,
5262 /* AllowPartialUndefs */ false))
5263 return false;
5264
5265 // Insert the extracted elements into the mask.
5266 for (const APInt &Elt : EltBits)
5267 RawMask.push_back(Elt.getZExtValue());
5268
5269 return true;
5270}
5271
5272static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5273 bool AllowUndefs) {
5274 APInt UndefElts;
5275 SmallVector<APInt, 64> EltBits;
5276 if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5277 /*AllowWholeUndefs*/ AllowUndefs,
5278 /*AllowPartialUndefs*/ false))
5279 return false;
5280
5281 bool IsPow2OrUndef = true;
5282 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5283 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5284 return IsPow2OrUndef;
5285}
5286
5287// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5288static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5289 // TODO: don't always ignore oneuse constraints.
5290 V = peekThroughBitcasts(V);
5291 EVT VT = V.getValueType();
5292
5293 // Match not(xor X, -1) -> X.
5294 if (V.getOpcode() == ISD::XOR &&
5295 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5296 isAllOnesConstant(V.getOperand(1))))
5297 return V.getOperand(0);
5298
5299 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5300 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5301 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5302 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5303 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5305 V.getOperand(1));
5306 }
5307 }
5308
5309 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5310 if (V.getOpcode() == X86ISD::PCMPGT &&
5311 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5312 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5313 V.getOperand(0).hasOneUse()) {
5314 APInt UndefElts;
5315 SmallVector<APInt> EltBits;
5316 if (getTargetConstantBitsFromNode(V.getOperand(0),
5317 V.getScalarValueSizeInBits(), UndefElts,
5318 EltBits) &&
5319 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5320 // Don't fold min_signed_value -> (min_signed_value - 1)
5321 bool MinSigned = false;
5322 for (APInt &Elt : EltBits) {
5323 MinSigned |= Elt.isMinSignedValue();
5324 Elt -= 1;
5325 }
5326 if (!MinSigned) {
5327 SDLoc DL(V);
5328 MVT VT = V.getSimpleValueType();
5329 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5330 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5331 }
5332 }
5333 }
5334
5335 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5336 SmallVector<SDValue, 2> CatOps;
5337 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5338 for (SDValue &CatOp : CatOps) {
5339 SDValue NotCat = IsNOT(CatOp, DAG);
5340 if (!NotCat)
5341 return SDValue();
5342 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5343 }
5344 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5345 }
5346
5347 // Match not(or(not(X),not(Y))) -> and(X, Y).
5348 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5349 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5350 // TODO: Handle cases with single NOT operand -> ANDNP
5351 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5352 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5353 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5354 DAG.getBitcast(VT, Op1));
5355 }
5356
5357 return SDValue();
5358}
5359
5360/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5361/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5362/// Note: This ignores saturation, so inputs must be checked first.
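/// For example, a single-stage binary pack of a v16i8 result produces the mask
/// <0, 2, 4, ..., 14, 16, 18, ..., 30>, i.e. the even-numbered elements of
/// each operand, lane by lane.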
5363static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5364 bool Unary, unsigned NumStages = 1) {
5365 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5366 unsigned NumElts = VT.getVectorNumElements();
5367 unsigned NumLanes = VT.getSizeInBits() / 128;
5368 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5369 unsigned Offset = Unary ? 0 : NumElts;
5370 unsigned Repetitions = 1u << (NumStages - 1);
5371 unsigned Increment = 1u << NumStages;
5372 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5373
5374 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5375 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5376 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5377 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5378 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5379 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5380 }
5381 }
5382}
5383
5384// Split the demanded elts of a PACKSS/PACKUS node between its operands.
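// For example, for a v16i8 pack, demanded result element 9 maps to element 1
// of the RHS operand, while demanded element 3 maps to element 3 of the LHS.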
5385static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5386 APInt &DemandedLHS, APInt &DemandedRHS) {
5387 int NumLanes = VT.getSizeInBits() / 128;
5388 int NumElts = DemandedElts.getBitWidth();
5389 int NumInnerElts = NumElts / 2;
5390 int NumEltsPerLane = NumElts / NumLanes;
5391 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5392
5393 DemandedLHS = APInt::getZero(NumInnerElts);
5394 DemandedRHS = APInt::getZero(NumInnerElts);
5395
5396 // Map DemandedElts to the packed operands.
5397 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5398 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5399 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5400 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5401 if (DemandedElts[OuterIdx])
5402 DemandedLHS.setBit(InnerIdx);
5403 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5404 DemandedRHS.setBit(InnerIdx);
5405 }
5406 }
5407}
5408
5409// Split the demanded elts of a HADD/HSUB node between its operands.
5410static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5411 APInt &DemandedLHS, APInt &DemandedRHS) {
5412 getHorizDemandedEltsForFirstOperand(VT.getSizeInBits(), DemandedElts,
5413 DemandedLHS, DemandedRHS);
5414 DemandedLHS |= DemandedLHS << 1;
5415 DemandedRHS |= DemandedRHS << 1;
5416}
5417
5418/// Calculates the shuffle mask corresponding to the target-specific opcode.
5419/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5420/// operands in \p Ops, and returns true.
5421/// Sets \p IsUnary to true if only one source is used. Note that this will set
5422/// IsUnary for shuffles which use a single input multiple times, and in those
5423/// cases it will adjust the mask to only have indices within that single input.
5424/// It is an error to call this with non-empty Mask/Ops vectors.
5425static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5426 SmallVectorImpl<SDValue> &Ops,
5427 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5428 if (!isTargetShuffle(N.getOpcode()))
5429 return false;
5430
5431 MVT VT = N.getSimpleValueType();
5432 unsigned NumElems = VT.getVectorNumElements();
5433 unsigned MaskEltSize = VT.getScalarSizeInBits();
5434 SmallVector<uint64_t, 32> RawMask;
5435 APInt RawUndefs;
5436 uint64_t ImmN;
5437
5438 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5439 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5440
5441 IsUnary = false;
5442 bool IsFakeUnary = false;
5443 switch (N.getOpcode()) {
5444 case X86ISD::BLENDI:
5445 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5446 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5447 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5448 DecodeBLENDMask(NumElems, ImmN, Mask);
5449 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5450 break;
5451 case X86ISD::SHUFP:
5452 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5453 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5454 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5455 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5456 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5457 break;
5458 case X86ISD::INSERTPS:
5459 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5460 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5461 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5462 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5463 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5464 break;
5465 case X86ISD::EXTRQI:
5466 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5467 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5468 isa<ConstantSDNode>(N.getOperand(2))) {
5469 int BitLen = N.getConstantOperandVal(1);
5470 int BitIdx = N.getConstantOperandVal(2);
5471 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5472 IsUnary = true;
5473 }
5474 break;
5475 case X86ISD::INSERTQI:
5476 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5477 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5478 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5479 isa<ConstantSDNode>(N.getOperand(3))) {
5480 int BitLen = N.getConstantOperandVal(2);
5481 int BitIdx = N.getConstantOperandVal(3);
5482 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5483 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5484 }
5485 break;
5486 case X86ISD::UNPCKH:
5487 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5488 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5489 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5490 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5491 break;
5492 case X86ISD::UNPCKL:
5493 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5494 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5495 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5496 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5497 break;
5498 case X86ISD::MOVHLPS:
5499 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5500 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5501 DecodeMOVHLPSMask(NumElems, Mask);
5502 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5503 break;
5504 case X86ISD::MOVLHPS:
5505 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5506 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5507 DecodeMOVLHPSMask(NumElems, Mask);
5508 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5509 break;
5510 case X86ISD::VALIGN:
5511 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5512 "Only 32-bit and 64-bit elements are supported!");
5513 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5514 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5515 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5516 DecodeVALIGNMask(NumElems, ImmN, Mask);
5517 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5518 Ops.push_back(N.getOperand(1));
5519 Ops.push_back(N.getOperand(0));
5520 break;
5521 case X86ISD::PALIGNR:
5522 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5523 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5524 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5525 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5526 DecodePALIGNRMask(NumElems, ImmN, Mask);
5527 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5528 Ops.push_back(N.getOperand(1));
5529 Ops.push_back(N.getOperand(0));
5530 break;
5531 case X86ISD::VSHLDQ:
5532 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5533 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5534 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5535 DecodePSLLDQMask(NumElems, ImmN, Mask);
5536 IsUnary = true;
5537 break;
5538 case X86ISD::VSRLDQ:
5539 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5540 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5541 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5542 DecodePSRLDQMask(NumElems, ImmN, Mask);
5543 IsUnary = true;
5544 break;
5545 case X86ISD::PSHUFD:
5546 case X86ISD::VPERMILPI:
5547 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5549 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5550 IsUnary = true;
5551 break;
5552 case X86ISD::PSHUFHW:
5553 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5554 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5555 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5556 IsUnary = true;
5557 break;
5558 case X86ISD::PSHUFLW:
5559 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5560 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5561 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5562 IsUnary = true;
5563 break;
5564 case X86ISD::VZEXT_MOVL:
5565 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5566 DecodeZeroMoveLowMask(NumElems, Mask);
5567 IsUnary = true;
5568 break;
5569 case X86ISD::VBROADCAST:
5570 // We only decode broadcasts of same-sized vectors, peeking through to
5571 // extracted subvectors is likely to cause hasOneUse issues with
5572 // SimplifyDemandedBits etc.
5573 if (N.getOperand(0).getValueType() == VT) {
5574 DecodeVectorBroadcast(NumElems, Mask);
5575 IsUnary = true;
5576 break;
5577 }
5578 return false;
5579 case X86ISD::VPERMILPV: {
5580 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5581 IsUnary = true;
5582 SDValue MaskNode = N.getOperand(1);
5583 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5584 RawUndefs)) {
5585 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5586 break;
5587 }
5588 return false;
5589 }
5590 case X86ISD::PSHUFB: {
5591 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5592 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5593 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5594 IsUnary = true;
5595 SDValue MaskNode = N.getOperand(1);
5596 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5597 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5598 break;
5599 }
5600 return false;
5601 }
5602 case X86ISD::VPERMI:
5603 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5604 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5605 DecodeVPERMMask(NumElems, ImmN, Mask);
5606 IsUnary = true;
5607 break;
5608 case X86ISD::MOVSS:
5609 case X86ISD::MOVSD:
5610 case X86ISD::MOVSH:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5614 break;
5615 case X86ISD::VPERM2X128:
5616 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5617 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5618 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5619 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5620 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5621 break;
5622 case X86ISD::SHUF128:
5623 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5624 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5625 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5626 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5627 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5628 break;
5629 case X86ISD::MOVSLDUP:
5630 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5631 DecodeMOVSLDUPMask(NumElems, Mask);
5632 IsUnary = true;
5633 break;
5634 case X86ISD::MOVSHDUP:
5635 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5636 DecodeMOVSHDUPMask(NumElems, Mask);
5637 IsUnary = true;
5638 break;
5639 case X86ISD::MOVDDUP:
5640 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5641 DecodeMOVDDUPMask(NumElems, Mask);
5642 IsUnary = true;
5643 break;
5644 case X86ISD::VPERMIL2: {
5645 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5646 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5647 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5648 SDValue MaskNode = N.getOperand(2);
5649 SDValue CtrlNode = N.getOperand(3);
5650 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5651 unsigned CtrlImm = CtrlOp->getZExtValue();
5652 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5653 RawUndefs)) {
5654 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5655 Mask);
5656 break;
5657 }
5658 }
5659 return false;
5660 }
5661 case X86ISD::VPPERM: {
5662 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5663 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5664 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5665 SDValue MaskNode = N.getOperand(2);
5666 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5667 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5668 break;
5669 }
5670 return false;
5671 }
5672 case X86ISD::VPERMV: {
5673 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5674 IsUnary = true;
5675 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5676 Ops.push_back(N.getOperand(1));
5677 SDValue MaskNode = N.getOperand(0);
5678 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5679 RawUndefs)) {
5680 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5681 break;
5682 }
5683 return false;
5684 }
5685 case X86ISD::VPERMV3: {
5686 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5687 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5688 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5689 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5690 Ops.push_back(N.getOperand(0));
5691 Ops.push_back(N.getOperand(2));
5692 SDValue MaskNode = N.getOperand(1);
5693 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5694 RawUndefs)) {
5695 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5696 break;
5697 }
5698 return false;
5699 }
5700 default:
5701 llvm_unreachable("unknown target shuffle node");
5702 }
5703
5704 // Empty mask indicates the decode failed.
5705 if (Mask.empty())
5706 return false;
5707
5708 // Check if we're getting a shuffle mask with zero'd elements.
5709 if (!AllowSentinelZero && isAnyZero(Mask))
5710 return false;
5711
5712 // If we have a fake unary shuffle, the shuffle mask is spread across two
5713 // inputs that are actually the same node. Re-map the mask to always point
5714 // into the first input.
5715 if (IsFakeUnary)
5716 for (int &M : Mask)
5717 if (M >= (int)Mask.size())
5718 M -= Mask.size();
5719
5720 // If we didn't already add operands in the opcode-specific code, default to
5721 // adding 1 or 2 operands starting at 0.
5722 if (Ops.empty()) {
5723 Ops.push_back(N.getOperand(0));
5724 if (!IsUnary || IsFakeUnary)
5725 Ops.push_back(N.getOperand(1));
5726 }
5727
5728 return true;
5729}
5730
5731// Wrapper for getTargetShuffleMask that discards the IsUnary result.
5732static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5733 SmallVectorImpl<SDValue> &Ops,
5734 SmallVectorImpl<int> &Mask) {
5735 bool IsUnary;
5736 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5737}
5738
5739/// Compute whether each element of a shuffle is zeroable.
5740///
5741/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5742/// Either it is an undef element in the shuffle mask, the element of the input
5743/// referenced is undef, or the element of the input referenced is known to be
5744/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5745/// as many lanes with this technique as possible to simplify the remaining
5746/// shuffle.
5747 static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5748 SDValue V1, SDValue V2,
5749 APInt &KnownUndef, APInt &KnownZero) {
5750 int Size = Mask.size();
5751 KnownUndef = KnownZero = APInt::getZero(Size);
5752
5753 V1 = peekThroughBitcasts(V1);
5754 V2 = peekThroughBitcasts(V2);
5755
5756 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5757 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5758
5759 int VectorSizeInBits = V1.getValueSizeInBits();
5760 int ScalarSizeInBits = VectorSizeInBits / Size;
5761 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5762
5763 for (int i = 0; i < Size; ++i) {
5764 int M = Mask[i];
5765 // Handle the easy cases.
5766 if (M < 0) {
5767 KnownUndef.setBit(i);
5768 continue;
5769 }
5770 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5771 KnownZero.setBit(i);
5772 continue;
5773 }
5774
5775 // Determine shuffle input and normalize the mask.
5776 SDValue V = M < Size ? V1 : V2;
5777 M %= Size;
5778
5779 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5780 if (V.getOpcode() != ISD::BUILD_VECTOR)
5781 continue;
5782
5783 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5784 // the (larger) source element must be UNDEF/ZERO.
5785 if ((Size % V.getNumOperands()) == 0) {
5786 int Scale = Size / V->getNumOperands();
5787 SDValue Op = V.getOperand(M / Scale);
5788 if (Op.isUndef())
5789 KnownUndef.setBit(i);
5790 if (X86::isZeroNode(Op))
5791 KnownZero.setBit(i);
5792 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5793 APInt Val = Cst->getAPIntValue();
5794 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5795 if (Val == 0)
5796 KnownZero.setBit(i);
5797 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5798 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5799 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5800 if (Val == 0)
5801 KnownZero.setBit(i);
5802 }
5803 continue;
5804 }
5805
5806 // If the BUILD_VECTOR has more elements, then all the (smaller) source
5807 // elements must be UNDEF or ZERO.
5808 if ((V.getNumOperands() % Size) == 0) {
5809 int Scale = V->getNumOperands() / Size;
5810 bool AllUndef = true;
5811 bool AllZero = true;
5812 for (int j = 0; j < Scale; ++j) {
5813 SDValue Op = V.getOperand((M * Scale) + j);
5814 AllUndef &= Op.isUndef();
5815 AllZero &= X86::isZeroNode(Op);
5816 }
5817 if (AllUndef)
5818 KnownUndef.setBit(i);
5819 if (AllZero)
5820 KnownZero.setBit(i);
5821 continue;
5822 }
5823 }
5824}
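// Illustrative example (a sketch, not from the original source): shuffling
// V1 = build_vector(x, 0, undef, y) against an all-zeros V2 with the mask
// <1,2,4,7> marks element 0 as known-zero (V1[1] is a zero constant),
// element 1 as known-undef (V1[2] is undef), and elements 2 and 3 as
// known-zero because they read from the zero vector V2.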
5825
5826/// Decode a target shuffle mask and inputs and see if any values are
5827/// known to be undef or zero from their inputs.
5828/// Returns true if the target shuffle mask was decoded.
5829/// FIXME: Merge this with computeZeroableShuffleElements?
5830 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5831 SmallVectorImpl<SDValue> &Ops,
5832 APInt &KnownUndef, APInt &KnownZero) {
5833 bool IsUnary;
5834 if (!isTargetShuffle(N.getOpcode()))
5835 return false;
5836
5837 MVT VT = N.getSimpleValueType();
5838 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5839 return false;
5840
5841 int Size = Mask.size();
5842 SDValue V1 = Ops[0];
5843 SDValue V2 = IsUnary ? V1 : Ops[1];
5844 KnownUndef = KnownZero = APInt::getZero(Size);
5845
5846 V1 = peekThroughBitcasts(V1);
5847 V2 = peekThroughBitcasts(V2);
5848
5849 assert((VT.getSizeInBits() % Size) == 0 &&
5850 "Illegal split of shuffle value type");
5851 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5852
5853 // Extract known constant input data.
5854 APInt UndefSrcElts[2];
5855 SmallVector<APInt, 32> SrcEltBits[2];
5856 bool IsSrcConstant[2] = {
5857 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5858 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5859 /*AllowPartialUndefs*/ false),
5860 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5861 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5862 /*AllowPartialUndefs*/ false)};
5863
5864 for (int i = 0; i < Size; ++i) {
5865 int M = Mask[i];
5866
5867 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5868 if (M < 0) {
5869 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5870 if (SM_SentinelUndef == M)
5871 KnownUndef.setBit(i);
5872 if (SM_SentinelZero == M)
5873 KnownZero.setBit(i);
5874 continue;
5875 }
5876
5877 // Determine shuffle input and normalize the mask.
5878 unsigned SrcIdx = M / Size;
5879 SDValue V = M < Size ? V1 : V2;
5880 M %= Size;
5881
5882 // We are referencing an UNDEF input.
5883 if (V.isUndef()) {
5884 KnownUndef.setBit(i);
5885 continue;
5886 }
5887
5888 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5889 // TODO: We currently only set UNDEF for integer types - floats use the same
5890 // registers as vectors and many of the scalar folded loads rely on the
5891 // SCALAR_TO_VECTOR pattern.
5892 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5893 (Size % V.getValueType().getVectorNumElements()) == 0) {
5894 int Scale = Size / V.getValueType().getVectorNumElements();
5895 int Idx = M / Scale;
5896 if (Idx != 0 && !VT.isFloatingPoint())
5897 KnownUndef.setBit(i);
5898 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5899 KnownZero.setBit(i);
5900 continue;
5901 }
5902
5903 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5904 // base vectors.
5905 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5906 SDValue Vec = V.getOperand(0);
5907 int NumVecElts = Vec.getValueType().getVectorNumElements();
5908 if (Vec.isUndef() && Size == NumVecElts) {
5909 int Idx = V.getConstantOperandVal(2);
5910 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5911 if (M < Idx || (Idx + NumSubElts) <= M)
5912 KnownUndef.setBit(i);
5913 }
5914 continue;
5915 }
5916
5917 // Attempt to extract from the source's constant bits.
5918 if (IsSrcConstant[SrcIdx]) {
5919 if (UndefSrcElts[SrcIdx][M])
5920 KnownUndef.setBit(i);
5921 else if (SrcEltBits[SrcIdx][M] == 0)
5922 KnownZero.setBit(i);
5923 }
5924 }
5925
5926 assert(VT.getVectorNumElements() == (unsigned)Size &&
5927 "Different mask size from vector size!");
5928 return true;
5929}
5930
5931// Replace target shuffle mask elements with known undef/zero sentinels.
5932 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5933 const APInt &KnownUndef,
5934 const APInt &KnownZero,
5935 bool ResolveKnownZeros = true) {
5936 unsigned NumElts = Mask.size();
5937 assert(KnownUndef.getBitWidth() == NumElts &&
5938 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5939
5940 for (unsigned i = 0; i != NumElts; ++i) {
5941 if (KnownUndef[i])
5942 Mask[i] = SM_SentinelUndef;
5943 else if (ResolveKnownZeros && KnownZero[i])
5944 Mask[i] = SM_SentinelZero;
5945 }
5946}
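// Illustrative example (a sketch, not from the original source): with
// Mask = <0,1,2,3>, KnownUndef = 0b0001 and KnownZero = 0b0100, the mask
// becomes <SM_SentinelUndef, 1, SM_SentinelZero, 3>.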
5947
5948// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5949 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5950 APInt &KnownUndef,
5951 APInt &KnownZero) {
5952 unsigned NumElts = Mask.size();
5953 KnownUndef = KnownZero = APInt::getZero(NumElts);
5954
5955 for (unsigned i = 0; i != NumElts; ++i) {
5956 int M = Mask[i];
5957 if (SM_SentinelUndef == M)
5958 KnownUndef.setBit(i);
5959 if (SM_SentinelZero == M)
5960 KnownZero.setBit(i);
5961 }
5962}
5963
5964// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5965 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5966 SDValue Cond, bool IsBLENDV = false) {
5967 EVT CondVT = Cond.getValueType();
5968 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5969 unsigned NumElts = CondVT.getVectorNumElements();
5970
5971 APInt UndefElts;
5972 SmallVector<APInt, 32> EltBits;
5973 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5974 /*AllowWholeUndefs*/ true,
5975 /*AllowPartialUndefs*/ false))
5976 return false;
5977
5978 Mask.resize(NumElts, SM_SentinelUndef);
5979
5980 for (int i = 0; i != (int)NumElts; ++i) {
5981 Mask[i] = i;
5982 // Arbitrarily choose from the 2nd operand if the select condition element
5983 // is undef.
5984 // TODO: Can we do better by matching patterns such as even/odd?
5985 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5986 (IsBLENDV && EltBits[i].isNonNegative()))
5987 Mask[i] += NumElts;
5988 }
5989
5990 return true;
5991}
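// Illustrative example (a sketch, not from the original source): a v4i32
// VSELECT with the constant condition <-1, 0, -1, 0> yields the blend mask
// <0, 5, 2, 7>: true lanes pick from the first value operand, false (or
// undef) lanes pick from the second.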
5992
5993// Forward declaration (for getFauxShuffleMask recursive check).
5994static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5995 SmallVectorImpl<SDValue> &Inputs,
5996 SmallVectorImpl<int> &Mask,
5997 const SelectionDAG &DAG, unsigned Depth,
5998 bool ResolveKnownElts);
5999
6000// Attempt to decode ops that could be represented as a shuffle mask.
6001 // The decoded shuffle mask may contain a different number of elements than
6002 // the destination value type.
6003// TODO: Merge into getTargetShuffleInputs()
6004static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
6005 SmallVectorImpl<int> &Mask,
6006 SmallVectorImpl<SDValue> &Ops,
6007 const SelectionDAG &DAG, unsigned Depth,
6008 bool ResolveKnownElts) {
6009 Mask.clear();
6010 Ops.clear();
6011
6012 MVT VT = N.getSimpleValueType();
6013 unsigned NumElts = VT.getVectorNumElements();
6014 unsigned NumSizeInBits = VT.getSizeInBits();
6015 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6016 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6017 return false;
6018 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6019 unsigned NumSizeInBytes = NumSizeInBits / 8;
6020 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6021
6022 unsigned Opcode = N.getOpcode();
6023 switch (Opcode) {
6024 case ISD::VECTOR_SHUFFLE: {
6025 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6026 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6027 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6028 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6029 Ops.push_back(N.getOperand(0));
6030 Ops.push_back(N.getOperand(1));
6031 return true;
6032 }
6033 return false;
6034 }
6035 case ISD::AND:
6036 case X86ISD::ANDNP: {
6037 // Attempt to decode as a per-byte mask.
6038 APInt UndefElts;
6039 SmallVector<APInt, 32> EltBits;
6040 SDValue N0 = N.getOperand(0);
6041 SDValue N1 = N.getOperand(1);
6042 bool IsAndN = (X86ISD::ANDNP == Opcode);
6043 uint64_t ZeroMask = IsAndN ? 255 : 0;
6044 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6045 /*AllowWholeUndefs*/ false,
6046 /*AllowPartialUndefs*/ false))
6047 return false;
6048 // We can't assume an undef src element gives an undef dst - the other src
6049 // might be zero.
6050 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6051 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6052 const APInt &ByteBits = EltBits[i];
6053 if (ByteBits != 0 && ByteBits != 255)
6054 return false;
6055 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6056 }
6057 Ops.push_back(IsAndN ? N1 : N0);
6058 return true;
6059 }
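// Illustrative example (a sketch, not from the original source): an AND with
// the per-byte constant <0xFF, 0x00, 0xFF, 0x00, ...> decodes to the byte
// shuffle <0, SM_SentinelZero, 2, SM_SentinelZero, ...>; kept bytes become
// identity indices and masked-off bytes become zero sentinels.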
6060 case ISD::OR: {
6061 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6062 // is a valid shuffle index.
6063 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6064 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6065 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6066 return false;
6067
6068 SmallVector<int, 64> SrcMask0, SrcMask1;
6069 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6070 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6071 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6072 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6073 Depth + 1, true) ||
6074 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6075 Depth + 1, true))
6076 return false;
6077
6078 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6079 SmallVector<int, 64> Mask0, Mask1;
6080 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6081 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6082 for (int i = 0; i != (int)MaskSize; ++i) {
6083 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6084 // loops converting between OR and BLEND shuffles due to
6085 // canWidenShuffleElements merging away undef elements, meaning we
6086 // fail to recognise the OR as the undef element isn't known zero.
6087 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6088 Mask.push_back(SM_SentinelZero);
6089 else if (Mask1[i] == SM_SentinelZero)
6090 Mask.push_back(i);
6091 else if (Mask0[i] == SM_SentinelZero)
6092 Mask.push_back(i + MaskSize);
6093 else
6094 return false;
6095 }
6096 Ops.push_back(N0);
6097 Ops.push_back(N1);
6098 return true;
6099 }
6100 case ISD::INSERT_SUBVECTOR: {
6101 SDValue Src = N.getOperand(0);
6102 SDValue Sub = N.getOperand(1);
6103 EVT SubVT = Sub.getValueType();
6104 unsigned NumSubElts = SubVT.getVectorNumElements();
6105 if (!N->isOnlyUserOf(Sub.getNode()))
6106 return false;
6107 SDValue SubBC = peekThroughBitcasts(Sub);
6108 uint64_t InsertIdx = N.getConstantOperandVal(2);
6109 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6110 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6111 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6112 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
6113 SDValue SubBCSrc = SubBC.getOperand(0);
6114 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
6115 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6116 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6117 "Subvector valuetype mismatch");
6118 InsertIdx *= (MaxElts / NumElts);
6119 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6120 NumSubElts *= (MaxElts / NumElts);
6121 bool SrcIsUndef = Src.isUndef();
6122 for (int i = 0; i != (int)MaxElts; ++i)
6123 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6124 for (int i = 0; i != (int)NumSubElts; ++i)
6125 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6126 if (!SrcIsUndef)
6127 Ops.push_back(Src);
6128 Ops.push_back(SubBCSrc);
6129 return true;
6130 }
6131 // Handle CONCAT(SUB0, SUB1).
6132 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6133 // cross lane shuffles.
6134 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6135 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6136 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6137 Src.getOperand(0).isUndef() &&
6138 Src.getOperand(1).getValueType() == SubVT &&
6139 Src.getConstantOperandVal(2) == 0) {
6140 for (int i = 0; i != (int)NumSubElts; ++i)
6141 Mask.push_back(i);
6142 for (int i = 0; i != (int)NumSubElts; ++i)
6143 Mask.push_back(i + NumElts);
6144 Ops.push_back(Src.getOperand(1));
6145 Ops.push_back(Sub);
6146 return true;
6147 }
6148 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6149 SmallVector<int, 64> SubMask;
6150 SmallVector<SDValue, 2> SubInputs;
6151 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6152 EVT SubSrcVT = SubSrc.getValueType();
6153 if (!SubSrcVT.isVector())
6154 return false;
6155
6156 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6157 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6158 Depth + 1, ResolveKnownElts))
6159 return false;
6160
6161 // Subvector shuffle inputs must not be larger than the subvector.
6162 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6163 return SubVT.getFixedSizeInBits() <
6164 SubInput.getValueSizeInBits().getFixedValue();
6165 }))
6166 return false;
6167
6168 if (SubMask.size() != NumSubElts) {
6169 assert(((SubMask.size() % NumSubElts) == 0 ||
6170 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
6171 if ((NumSubElts % SubMask.size()) == 0) {
6172 int Scale = NumSubElts / SubMask.size();
6173 SmallVector<int,64> ScaledSubMask;
6174 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6175 SubMask = ScaledSubMask;
6176 } else {
6177 int Scale = SubMask.size() / NumSubElts;
6178 NumSubElts = SubMask.size();
6179 NumElts *= Scale;
6180 InsertIdx *= Scale;
6181 }
6182 }
6183 Ops.push_back(Src);
6184 Ops.append(SubInputs.begin(), SubInputs.end());
6185 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6186 Mask.append(NumElts, SM_SentinelZero);
6187 else
6188 for (int i = 0; i != (int)NumElts; ++i)
6189 Mask.push_back(i);
6190 for (int i = 0; i != (int)NumSubElts; ++i) {
6191 int M = SubMask[i];
6192 if (0 <= M) {
6193 int InputIdx = M / NumSubElts;
6194 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6195 }
6196 Mask[i + InsertIdx] = M;
6197 }
6198 return true;
6199 }
6200 case X86ISD::PINSRB:
6201 case X86ISD::PINSRW:
6202 case ISD::SCALAR_TO_VECTOR:
6203 case ISD::INSERT_VECTOR_ELT: {
6204 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6205 // vector, for matching src/dst vector types.
6206 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6207
6208 unsigned DstIdx = 0;
6209 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6210 // Check we have an in-range constant insertion index.
6211 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6212 N.getConstantOperandAPInt(2).uge(NumElts))
6213 return false;
6214 DstIdx = N.getConstantOperandVal(2);
6215
6216 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6217 if (X86::isZeroNode(Scl)) {
6218 Ops.push_back(N.getOperand(0));
6219 for (unsigned i = 0; i != NumElts; ++i)
6220 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6221 return true;
6222 }
6223 }
6224
6225 // Peek through trunc/aext/zext/bitcast.
6226 // TODO: aext shouldn't require SM_SentinelZero padding.
6227 // TODO: handle shift of scalars.
6228 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6229 while (Scl.getOpcode() == ISD::TRUNCATE ||
6230 Scl.getOpcode() == ISD::ANY_EXTEND ||
6231 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6232 (Scl.getOpcode() == ISD::BITCAST &&
6235 Scl = Scl.getOperand(0);
6236 MinBitsPerElt =
6237 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6238 }
6239 if ((MinBitsPerElt % 8) != 0)
6240 return false;
6241
6242 // Attempt to find the source vector the scalar was extracted from.
6243 SDValue SrcExtract;
6244 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6245 Scl.getOpcode() == X86ISD::PEXTRW ||
6246 Scl.getOpcode() == X86ISD::PEXTRB) &&
6247 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6248 SrcExtract = Scl;
6249 }
6250 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6251 return false;
6252
6253 SDValue SrcVec = SrcExtract.getOperand(0);
6254 EVT SrcVT = SrcVec.getValueType();
6255 if (!SrcVT.getScalarType().isByteSized())
6256 return false;
6257 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6258 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6259 unsigned DstByte = DstIdx * NumBytesPerElt;
6260 MinBitsPerElt =
6261 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6262
6263 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6264 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6265 Ops.push_back(SrcVec);
6266 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6267 } else {
6268 Ops.push_back(SrcVec);
6269 Ops.push_back(N.getOperand(0));
6270 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6271 Mask.push_back(NumSizeInBytes + i);
6272 }
6273
6274 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6275 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6276 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6277 Mask[DstByte + i] = SrcByte + i;
6278 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6279 Mask[DstByte + i] = SM_SentinelZero;
6280 return true;
6281 }
6282 case X86ISD::PACKSS:
6283 case X86ISD::PACKUS: {
6284 SDValue N0 = N.getOperand(0);
6285 SDValue N1 = N.getOperand(1);
6286 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6287 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6288 "Unexpected input value type");
6289
6290 APInt EltsLHS, EltsRHS;
6291 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6292
6293 // If we know input saturation won't happen (or we don't care for particular
6294 // lanes), we can treat this as a truncation shuffle.
6295 bool Offset0 = false, Offset1 = false;
6296 if (Opcode == X86ISD::PACKSS) {
6297 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6298 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6299 (!(N1.isUndef() || EltsRHS.isZero()) &&
6300 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6301 return false;
6302 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6303 // PACKSS then it was likely being used for sign-extension for a
6304 // truncation, so just peek through and adjust the mask accordingly.
6305 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6306 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6307 Offset0 = true;
6308 N0 = N0.getOperand(0);
6309 }
6310 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6311 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6312 Offset1 = true;
6313 N1 = N1.getOperand(0);
6314 }
6315 } else {
6316 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6317 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6318 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6319 (!(N1.isUndef() || EltsRHS.isZero()) &&
6320 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6321 return false;
6322 }
6323
6324 bool IsUnary = (N0 == N1);
6325
6326 Ops.push_back(N0);
6327 if (!IsUnary)
6328 Ops.push_back(N1);
6329
6330 createPackShuffleMask(VT, Mask, IsUnary);
6331
6332 if (Offset0 || Offset1) {
6333 for (int &M : Mask)
6334 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6335 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6336 ++M;
6337 }
6338 return true;
6339 }
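// Illustrative example (a sketch, not from the original source): a PACKUS of
// two v8i16 inputs whose upper bytes are known zero behaves as the truncating
// v16i8 shuffle <0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30> of the two
// bitcast inputs.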
6340 case ISD::VSELECT:
6341 case X86ISD::BLENDV: {
6342 SDValue Cond = N.getOperand(0);
6343 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6344 Ops.push_back(N.getOperand(1));
6345 Ops.push_back(N.getOperand(2));
6346 return true;
6347 }
6348 return false;
6349 }
6350 case X86ISD::VTRUNC: {
6351 SDValue Src = N.getOperand(0);
6352 EVT SrcVT = Src.getValueType();
6353 // Truncated source must be a simple vector.
6354 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6355 (SrcVT.getScalarSizeInBits() % 8) != 0)
6356 return false;
6357 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6358 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6359 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6360 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6361 for (unsigned i = 0; i != NumSrcElts; ++i)
6362 Mask.push_back(i * Scale);
6363 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6364 Ops.push_back(Src);
6365 return true;
6366 }
6367 case ISD::SHL:
6368 case ISD::SRL: {
6369 // We can only decode 'whole byte' bit shifts as shuffles.
6370 std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6371 if (!Amt || (*Amt % 8) != 0)
6372 return false;
6373
6374 uint64_t ByteShift = *Amt / 8;
6375 Ops.push_back(N.getOperand(0));
6376
6377 // Clear mask to all zeros and insert the shifted byte indices.
6378 Mask.append(NumSizeInBytes, SM_SentinelZero);
6379
6380 if (ISD::SHL == Opcode) {
6381 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6382 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6383 Mask[i + j] = i + j - ByteShift;
6384 } else {
6385 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6386 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6387 Mask[i + j - ByteShift] = i + j;
6388 }
6389 return true;
6390 }
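// Illustrative example (a sketch, not from the original source): a v2i64 SHL
// by 8 bits decodes to the per-element byte shuffle
// <Z,0,1,2,3,4,5,6, Z,8,9,10,11,12,13,14> where Z is SM_SentinelZero, i.e.
// each 64-bit lane is shifted up by one byte with a zero byte shifted in.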
6391 case X86ISD::VSHLI:
6392 case X86ISD::VSRLI: {
6393 uint64_t ShiftVal = N.getConstantOperandVal(1);
6394 // Out of range bit shifts are guaranteed to be zero.
6395 if (NumBitsPerElt <= ShiftVal) {
6396 Mask.append(NumElts, SM_SentinelZero);
6397 return true;
6398 }
6399
6400 // We can only decode 'whole byte' bit shifts as shuffles.
6401 if ((ShiftVal % 8) != 0)
6402 break;
6403
6404 uint64_t ByteShift = ShiftVal / 8;
6405 Ops.push_back(N.getOperand(0));
6406
6407 // Clear mask to all zeros and insert the shifted byte indices.
6408 Mask.append(NumSizeInBytes, SM_SentinelZero);
6409
6410 if (X86ISD::VSHLI == Opcode) {
6411 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6412 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6413 Mask[i + j] = i + j - ByteShift;
6414 } else {
6415 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6416 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6417 Mask[i + j - ByteShift] = i + j;
6418 }
6419 return true;
6420 }
6421 case X86ISD::VROTLI:
6422 case X86ISD::VROTRI: {
6423 // We can only decode 'whole byte' bit rotates as shuffles.
6424 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6425 if ((RotateVal % 8) != 0)
6426 return false;
6427 Ops.push_back(N.getOperand(0));
6428 int Offset = RotateVal / 8;
6429 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6430 for (int i = 0; i != (int)NumElts; ++i) {
6431 int BaseIdx = i * NumBytesPerElt;
6432 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6433 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6434 }
6435 }
6436 return true;
6437 }
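// Illustrative example (a sketch, not from the original source): a v4i32
// VROTLI by 8 bits decodes to the byte shuffle
// <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>, rotating each 32-bit lane left
// by one byte.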
6438 case X86ISD::VBROADCAST: {
6439 SDValue Src = N.getOperand(0);
6440 if (!Src.getSimpleValueType().isVector()) {
6441 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6442 !isNullConstant(Src.getOperand(1)) ||
6443 Src.getOperand(0).getValueType().getScalarType() !=
6444 VT.getScalarType())
6445 return false;
6446 Src = Src.getOperand(0);
6447 }
6448 Ops.push_back(Src);
6449 Mask.append(NumElts, 0);
6450 return true;
6451 }
6452 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6453 SDValue Src = N.getOperand(0);
6454 EVT SrcVT = Src.getValueType();
6455 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6456
6457 // Extended source must be a simple vector.
6458 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6459 (NumBitsPerSrcElt % 8) != 0)
6460 return false;
6461
6462 // We can only handle all-signbits extensions.
6463 APInt DemandedSrcElts =
6464 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6465 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6466 return false;
6467
6468 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6469 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6470 for (unsigned I = 0; I != NumElts; ++I)
6471 Mask.append(Scale, I);
6472 Ops.push_back(Src);
6473 return true;
6474 }
6475 case ISD::ZERO_EXTEND:
6476 case ISD::ANY_EXTEND:
6477 case ISD::ZERO_EXTEND_VECTOR_INREG:
6478 case ISD::ANY_EXTEND_VECTOR_INREG: {
6479 SDValue Src = N.getOperand(0);
6480 EVT SrcVT = Src.getValueType();
6481
6482 // Extended source must be a simple vector.
6483 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6484 (SrcVT.getScalarSizeInBits() % 8) != 0)
6485 return false;
6486
6487 bool IsAnyExtend =
6488 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6489 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6490 IsAnyExtend, Mask);
6491 Ops.push_back(Src);
6492 return true;
6493 }
6494 }
6495
6496 return false;
6497}
6498
6499/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6500 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6501 SmallVectorImpl<int> &Mask) {
6502 int MaskWidth = Mask.size();
6503 SmallVector<SDValue, 16> UsedInputs;
6504 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6505 int lo = UsedInputs.size() * MaskWidth;
6506 int hi = lo + MaskWidth;
6507
6508 // Strip UNDEF input usage.
6509 if (Inputs[i].isUndef())
6510 for (int &M : Mask)
6511 if ((lo <= M) && (M < hi))
6512 M = SM_SentinelUndef;
6513
6514 // Check for unused inputs.
6515 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6516 for (int &M : Mask)
6517 if (lo <= M)
6518 M -= MaskWidth;
6519 continue;
6520 }
6521
6522 // Check for repeated inputs.
6523 bool IsRepeat = false;
6524 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6525 if (UsedInputs[j] != Inputs[i])
6526 continue;
6527 for (int &M : Mask)
6528 if (lo <= M)
6529 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6530 IsRepeat = true;
6531 break;
6532 }
6533 if (IsRepeat)
6534 continue;
6535
6536 UsedInputs.push_back(Inputs[i]);
6537 }
6538 Inputs = UsedInputs;
6539}
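// Illustrative example (a sketch, not from the original source): with
// Inputs = {A, B, A} and a 4-wide mask, references into the duplicate third
// input (indices 8..11) are remapped onto the first copy of A (indices 0..3)
// and the list shrinks to Inputs = {A, B}.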
6540
6541/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6542/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6543/// Returns true if the target shuffle mask was decoded.
6544static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6545 SmallVectorImpl<SDValue> &Inputs,
6546 SmallVectorImpl<int> &Mask,
6547 APInt &KnownUndef, APInt &KnownZero,
6548 const SelectionDAG &DAG, unsigned Depth,
6549 bool ResolveKnownElts) {
6550 if (Depth >= SelectionDAG::MaxRecursionDepth)
6551 return false; // Limit search depth.
6552
6553 EVT VT = Op.getValueType();
6554 if (!VT.isSimple() || !VT.isVector())
6555 return false;
6556
6557 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6558 if (ResolveKnownElts)
6559 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6560 return true;
6561 }
6562 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6563 ResolveKnownElts)) {
6564 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6565 return true;
6566 }
6567 return false;
6568}
6569
6570static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6571 SmallVectorImpl<SDValue> &Inputs,
6572 SmallVectorImpl<int> &Mask,
6573 const SelectionDAG &DAG, unsigned Depth,
6574 bool ResolveKnownElts) {
6575 APInt KnownUndef, KnownZero;
6576 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6577 KnownZero, DAG, Depth, ResolveKnownElts);
6578}
6579
6580 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6581 SmallVectorImpl<int> &Mask,
6582 const SelectionDAG &DAG, unsigned Depth = 0,
6583 bool ResolveKnownElts = true) {
6584 EVT VT = Op.getValueType();
6585 if (!VT.isSimple() || !VT.isVector())
6586 return false;
6587
6588 unsigned NumElts = Op.getValueType().getVectorNumElements();
6589 APInt DemandedElts = APInt::getAllOnes(NumElts);
6590 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6591 ResolveKnownElts);
6592}
6593
6594// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6595static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6596 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6597 SelectionDAG &DAG) {
6598 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6599 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6600 "Unknown broadcast load type");
6601
6602 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6603 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6604 return SDValue();
6605
6606 SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6607 TypeSize::getFixed(Offset), DL);
6608 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6609 SDValue Ops[] = {Mem->getChain(), Ptr};
6610 SDValue BcstLd = DAG.getMemIntrinsicNode(
6611 Opcode, DL, Tys, Ops, MemVT,
6612 DAG.getMachineFunction().getMachineMemOperand(
6613 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6614 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6615 return BcstLd;
6616}
6617
6618/// Returns the scalar element that will make up the i'th
6619/// element of the result of the vector shuffle.
6620static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6621 SelectionDAG &DAG, unsigned Depth) {
6622 if (Depth >= SelectionDAG::MaxRecursionDepth)
6623 return SDValue(); // Limit search depth.
6624
6625 EVT VT = Op.getValueType();
6626 unsigned Opcode = Op.getOpcode();
6627 unsigned NumElems = VT.getVectorNumElements();
6628
6629 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6630 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6631 int Elt = SV->getMaskElt(Index);
6632
6633 if (Elt < 0)
6634 return DAG.getUNDEF(VT.getVectorElementType());
6635
6636 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6637 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6638 }
6639
6640 // Recurse into target specific vector shuffles to find scalars.
6641 if (isTargetShuffle(Opcode)) {
6642 MVT ShufVT = VT.getSimpleVT();
6643 MVT ShufSVT = ShufVT.getVectorElementType();
6644 int NumElems = (int)ShufVT.getVectorNumElements();
6645 SmallVector<int, 16> ShuffleMask;
6646 SmallVector<SDValue, 16> ShuffleOps;
6647 if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6648 return SDValue();
6649
6650 int Elt = ShuffleMask[Index];
6651 if (Elt == SM_SentinelZero)
6652 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6653 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6654 if (Elt == SM_SentinelUndef)
6655 return DAG.getUNDEF(ShufSVT);
6656
6657 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6658 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6659 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6660 }
6661
6662 // Recurse into insert_subvector base/sub vector to find scalars.
6663 if (Opcode == ISD::INSERT_SUBVECTOR) {
6664 SDValue Vec = Op.getOperand(0);
6665 SDValue Sub = Op.getOperand(1);
6666 uint64_t SubIdx = Op.getConstantOperandVal(2);
6667 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6668
6669 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6670 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6671 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6672 }
6673
6674 // Recurse into concat_vectors sub vector to find scalars.
6675 if (Opcode == ISD::CONCAT_VECTORS) {
6676 EVT SubVT = Op.getOperand(0).getValueType();
6677 unsigned NumSubElts = SubVT.getVectorNumElements();
6678 uint64_t SubIdx = Index / NumSubElts;
6679 uint64_t SubElt = Index % NumSubElts;
6680 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6681 }
6682
6683 // Recurse into extract_subvector src vector to find scalars.
6684 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6685 SDValue Src = Op.getOperand(0);
6686 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6687 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6688 }
6689
6690 // We only peek through bitcasts of the same vector width.
6691 if (Opcode == ISD::BITCAST) {
6692 SDValue Src = Op.getOperand(0);
6693 EVT SrcVT = Src.getValueType();
6694 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6695 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6696 return SDValue();
6697 }
6698
6699 // Actual nodes that may contain scalar elements
6700
6701 // For insert_vector_elt - either return the index matching scalar or recurse
6702 // into the base vector.
6703 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6704 isa<ConstantSDNode>(Op.getOperand(2))) {
6705 if (Op.getConstantOperandAPInt(2) == Index)
6706 return Op.getOperand(1);
6707 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6708 }
6709
6710 if (Opcode == ISD::SCALAR_TO_VECTOR)
6711 return (Index == 0) ? Op.getOperand(0)
6712 : DAG.getUNDEF(VT.getVectorElementType());
6713
6714 if (Opcode == ISD::BUILD_VECTOR)
6715 return Op.getOperand(Index);
6716
6717 return SDValue();
6718}
6719
6720// Use PINSRB/PINSRW/PINSRD to create a build vector.
6721 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6722 const APInt &NonZeroMask,
6723 unsigned NumNonZero, unsigned NumZero,
6724 SelectionDAG &DAG,
6725 const X86Subtarget &Subtarget) {
6726 MVT VT = Op.getSimpleValueType();
6727 unsigned NumElts = VT.getVectorNumElements();
6728 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6729 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6730 "Illegal vector insertion");
6731
6732 SDValue V;
6733 bool First = true;
6734
6735 for (unsigned i = 0; i < NumElts; ++i) {
6736 bool IsNonZero = NonZeroMask[i];
6737 if (!IsNonZero)
6738 continue;
6739
6740 // If the build vector contains zeros, or our first insertion is not the
6741 // first index, then insert into a zero vector to break any register
6742 // dependency; otherwise use SCALAR_TO_VECTOR.
6743 if (First) {
6744 First = false;
6745 if (NumZero || 0 != i)
6746 V = getZeroVector(VT, Subtarget, DAG, DL);
6747 else {
6748 assert(0 == i && "Expected insertion into zero-index");
6749 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6750 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6751 V = DAG.getBitcast(VT, V);
6752 continue;
6753 }
6754 }
6755 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6756 DAG.getVectorIdxConstant(i, DL));
6757 }
6758
6759 return V;
6760}
6761
6762/// Custom lower build_vector of v16i8.
6763 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6764 const APInt &NonZeroMask,
6765 unsigned NumNonZero, unsigned NumZero,
6766 SelectionDAG &DAG,
6767 const X86Subtarget &Subtarget) {
6768 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6769 return SDValue();
6770
6771 // SSE4.1 - use PINSRB to insert each byte directly.
6772 if (Subtarget.hasSSE41())
6773 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6774 DAG, Subtarget);
6775
6776 SDValue V;
6777
6778 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6779 // If both the lowest 16-bits are non-zero, then convert to MOVD.
6780 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6781 !NonZeroMask.extractBits(2, 2).isZero()) {
6782 for (unsigned I = 0; I != 4; ++I) {
6783 if (!NonZeroMask[I])
6784 continue;
6785 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6786 if (I != 0)
6787 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6788 DAG.getConstant(I * 8, DL, MVT::i8));
6789 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6790 }
6791 assert(V && "Failed to fold v16i8 vector to zero");
6792 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6793 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6794 V = DAG.getBitcast(MVT::v8i16, V);
6795 }
6796 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6797 bool ThisIsNonZero = NonZeroMask[i];
6798 bool NextIsNonZero = NonZeroMask[i + 1];
6799 if (!ThisIsNonZero && !NextIsNonZero)
6800 continue;
6801
6802 SDValue Elt;
6803 if (ThisIsNonZero) {
6804 if (NumZero || NextIsNonZero)
6805 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6806 else
6807 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6808 }
6809
6810 if (NextIsNonZero) {
6811 SDValue NextElt = Op.getOperand(i + 1);
6812 if (i == 0 && NumZero)
6813 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6814 else
6815 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6816 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6817 DAG.getConstant(8, DL, MVT::i8));
6818 if (ThisIsNonZero)
6819 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6820 else
6821 Elt = NextElt;
6822 }
6823
6824 // If our first insertion is not the first index or zeros are needed, then
6825 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6826 // elements undefined).
6827 if (!V) {
6828 if (i != 0 || NumZero)
6829 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6830 else {
6831 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6832 V = DAG.getBitcast(MVT::v8i16, V);
6833 continue;
6834 }
6835 }
6836 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6837 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6838 DAG.getVectorIdxConstant(i / 2, DL));
6839 }
6840
6841 return DAG.getBitcast(MVT::v16i8, V);
6842}
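// Illustrative note (a sketch, not from the original source): on pre-SSE4.1
// targets the bytes above are combined pairwise, roughly
// (zext(Elt(2*i+1)) << 8) | zext(Elt(2*i)), and inserted as 16-bit words via
// PINSRW; the first two byte pairs may instead be merged into one 32-bit
// MOVD when each pair contains a non-zero element.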
6843
6844/// Custom lower build_vector of v8i16.
6845 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6846 const APInt &NonZeroMask,
6847 unsigned NumNonZero, unsigned NumZero,
6848 SelectionDAG &DAG,
6849 const X86Subtarget &Subtarget) {
6850 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6851 return SDValue();
6852
6853 // Use PINSRW to insert each 16-bit element directly.
6854 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6855 Subtarget);
6856}
6857
6858/// Custom lower build_vector of v4i32 or v4f32.
6859 static SDValue LowerBuildVectorv4x32(SDValue Op, const SDLoc &DL,
6860 SelectionDAG &DAG,
6861 const X86Subtarget &Subtarget) {
6862 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6863 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6864 // Because we're creating a less complicated build vector here, we may enable
6865 // further folding of the MOVDDUP via shuffle transforms.
6866 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6867 Op.getOperand(0) == Op.getOperand(2) &&
6868 Op.getOperand(1) == Op.getOperand(3) &&
6869 Op.getOperand(0) != Op.getOperand(1)) {
6870 MVT VT = Op.getSimpleValueType();
6871 MVT EltVT = VT.getVectorElementType();
6872 // Create a new build vector with the first 2 elements followed by undef
6873 // padding, bitcast to v2f64, duplicate, and bitcast back.
6874 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6875 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6876 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6877 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6878 return DAG.getBitcast(VT, Dup);
6879 }
6880
6881 // Find all zeroable elements.
6882 std::bitset<4> Zeroable, Undefs;
6883 for (int i = 0; i < 4; ++i) {
6884 SDValue Elt = Op.getOperand(i);
6885 Undefs[i] = Elt.isUndef();
6886 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6887 }
6888 assert(Zeroable.size() - Zeroable.count() > 1 &&
6889 "We expect at least two non-zero elements!");
6890
6891 // We only know how to deal with build_vector nodes where elements are either
6892 // zeroable or extract_vector_elt with constant index.
6893 SDValue FirstNonZero;
6894 unsigned FirstNonZeroIdx;
6895 for (unsigned i = 0; i < 4; ++i) {
6896 if (Zeroable[i])
6897 continue;
6898 SDValue Elt = Op.getOperand(i);
6899 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6900 !isa<ConstantSDNode>(Elt.getOperand(1)))
6901 return SDValue();
6902 // Make sure that this node is extracting from a 128-bit vector.
6903 MVT VT = Elt.getOperand(0).getSimpleValueType();
6904 if (!VT.is128BitVector())
6905 return SDValue();
6906 if (!FirstNonZero.getNode()) {
6907 FirstNonZero = Elt;
6908 FirstNonZeroIdx = i;
6909 }
6910 }
6911
6912 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6913 SDValue V1 = FirstNonZero.getOperand(0);
6914 MVT VT = V1.getSimpleValueType();
6915
6916 // See if this build_vector can be lowered as a blend with zero.
6917 SDValue Elt;
6918 unsigned EltMaskIdx, EltIdx;
6919 int Mask[4];
6920 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6921 if (Zeroable[EltIdx]) {
6922 // The zero vector will be on the right hand side.
6923 Mask[EltIdx] = EltIdx+4;
6924 continue;
6925 }
6926
6927 Elt = Op->getOperand(EltIdx);
6928 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6929 EltMaskIdx = Elt.getConstantOperandVal(1);
6930 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6931 break;
6932 Mask[EltIdx] = EltIdx;
6933 }
6934
6935 if (EltIdx == 4) {
6936 // Let the shuffle legalizer deal with blend operations.
6937 SDValue VZeroOrUndef = (Zeroable == Undefs)
6938 ? DAG.getUNDEF(VT)
6939 : getZeroVector(VT, Subtarget, DAG, DL);
6940 if (V1.getSimpleValueType() != VT)
6941 V1 = DAG.getBitcast(VT, V1);
6942 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6943 }
6944
6945 // See if we can lower this build_vector to a INSERTPS.
6946 if (!Subtarget.hasSSE41())
6947 return SDValue();
6948
6949 SDValue V2 = Elt.getOperand(0);
6950 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6951 V1 = SDValue();
6952
6953 bool CanFold = true;
6954 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6955 if (Zeroable[i])
6956 continue;
6957
6958 SDValue Current = Op->getOperand(i);
6959 SDValue SrcVector = Current->getOperand(0);
6960 if (!V1.getNode())
6961 V1 = SrcVector;
6962 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6963 }
6964
6965 if (!CanFold)
6966 return SDValue();
6967
6968 assert(V1.getNode() && "Expected at least two non-zero elements!");
6969 if (V1.getSimpleValueType() != MVT::v4f32)
6970 V1 = DAG.getBitcast(MVT::v4f32, V1);
6971 if (V2.getSimpleValueType() != MVT::v4f32)
6972 V2 = DAG.getBitcast(MVT::v4f32, V2);
6973
6974 // Ok, we can emit an INSERTPS instruction.
6975 unsigned ZMask = Zeroable.to_ulong();
6976
6977 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6978 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6979 SDValue Result =
6980 DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6981 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
6982 return DAG.getBitcast(VT, Result);
6983}
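// Illustrative note (a sketch, not from the original source): the INSERTPS
// immediate built above follows the instruction's encoding, with the source
// element index in bits [7:6], the destination index in bits [5:4] and the
// zero mask in bits [3:0].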
6984
6985/// Return a vector logical shift node.
6986static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6987 SelectionDAG &DAG, const TargetLowering &TLI,
6988 const SDLoc &dl) {
6989 assert(VT.is128BitVector() && "Unknown type for VShift");
6990 MVT ShVT = MVT::v16i8;
6991 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6992 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6993 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6994 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6995 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6996}
6997
6998 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6999 SelectionDAG &DAG) {
7000
7001 // Check if the scalar load can be widened into a vector load. And if
7002 // the address is "base + cst" see if the cst can be "absorbed" into
7003 // the shuffle mask.
7004 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
7005 SDValue Ptr = LD->getBasePtr();
7006 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
7007 return SDValue();
7008 EVT PVT = LD->getValueType(0);
7009 if (PVT != MVT::i32 && PVT != MVT::f32)
7010 return SDValue();
7011
7012 int FI = -1;
7013 int64_t Offset = 0;
7014 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7015 FI = FINode->getIndex();
7016 Offset = 0;
7017 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7018 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7019 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7020 Offset = Ptr.getConstantOperandVal(1);
7021 Ptr = Ptr.getOperand(0);
7022 } else {
7023 return SDValue();
7024 }
7025
7026 // FIXME: 256-bit vector instructions don't require strict alignment;
7027 // improve this code to support them better.
7028 Align RequiredAlign(VT.getSizeInBits() / 8);
7029 SDValue Chain = LD->getChain();
7030 // Make sure the stack object alignment is at least 16 or 32.
7031 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7032 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7033 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7034 if (MFI.isFixedObjectIndex(FI)) {
7035 // Can't change the alignment. FIXME: It's possible to compute
7036 // the exact stack offset and reference FI + adjusted offset instead,
7037 // if someone *really* cares about this; that's the way to implement it.
7038 return SDValue();
7039 } else {
7040 MFI.setObjectAlignment(FI, RequiredAlign);
7041 }
7042 }
7043
7044 // (Offset % 16 or 32) must be a multiple of 4. The address is then
7045 // Ptr + (Offset & ~15).
7046 if (Offset < 0)
7047 return SDValue();
7048 if ((Offset % RequiredAlign.value()) & 3)
7049 return SDValue();
7050 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7051 if (StartOffset) {
7052 SDLoc DL(Ptr);
7053 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7054 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7055 }
7056
7057 int EltNo = (Offset - StartOffset) >> 2;
7058 unsigned NumElems = VT.getVectorNumElements();
7059
7060 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7061 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7062 LD->getPointerInfo().getWithOffset(StartOffset));
7063
7064 SmallVector<int, 8> Mask(NumElems, EltNo);
7065
7066 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7067 }
7068
7069 return SDValue();
7070}
7071
7072 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7073static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7074 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7075 auto *BaseLd = cast<LoadSDNode>(Elt);
7076 if (!BaseLd->isSimple())
7077 return false;
7078 Ld = BaseLd;
7079 ByteOffset = 0;
7080 return true;
7081 }
7082
7083 switch (Elt.getOpcode()) {
7084 case ISD::BITCAST:
7085 case ISD::TRUNCATE:
7086 case ISD::SCALAR_TO_VECTOR:
7087 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7088 case ISD::SRL:
7089 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7090 uint64_t Amt = AmtC->getZExtValue();
7091 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7092 ByteOffset += Amt / 8;
7093 return true;
7094 }
7095 }
7096 break;
7097 case ISD::EXTRACT_VECTOR_ELT:
7098 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7099 SDValue Src = Elt.getOperand(0);
7100 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7101 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7102 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7103 findEltLoadSrc(Src, Ld, ByteOffset)) {
7104 uint64_t Idx = IdxC->getZExtValue();
7105 ByteOffset += Idx * (SrcSizeInBits / 8);
7106 return true;
7107 }
7108 }
7109 break;
7110 }
7111
7112 return false;
7113}
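// Illustrative example (a sketch, not from the original source): for
// (trunc (srl (load i32 %p), 16)) the search peeks through the truncate,
// folds the 16-bit shift into a 2 byte offset, and reports the i32 load with
// ByteOffset == 2.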
7114
7115/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7116/// elements can be replaced by a single large load which has the same value as
7117/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7118///
7119/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7120 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7121 const SDLoc &DL, SelectionDAG &DAG,
7122 const X86Subtarget &Subtarget,
7123 bool IsAfterLegalize) {
7124 if ((VT.getScalarSizeInBits() % 8) != 0)
7125 return SDValue();
7126
7127 unsigned NumElems = Elts.size();
7128
7129 int LastLoadedElt = -1;
7130 APInt LoadMask = APInt::getZero(NumElems);
7131 APInt ZeroMask = APInt::getZero(NumElems);
7132 APInt UndefMask = APInt::getZero(NumElems);
7133
7134 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7135 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7136
7137 // For each element in the initializer, see if we've found a load, zero or an
7138 // undef.
7139 for (unsigned i = 0; i < NumElems; ++i) {
7140 SDValue Elt = peekThroughBitcasts(Elts[i]);
7141 if (!Elt.getNode())
7142 return SDValue();
7143 if (Elt.isUndef()) {
7144 UndefMask.setBit(i);
7145 continue;
7146 }
7147 if (X86::isZeroNode(Elt)) {
7148 ZeroMask.setBit(i);
7149 continue;
7150 }
7151
7152 // Each loaded element must be the correct fractional portion of the
7153 // requested vector load.
7154 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7155 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7156 return SDValue();
7157
7158 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7159 return SDValue();
7160 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7161 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7162 return SDValue();
7163
7164 LoadMask.setBit(i);
7165 LastLoadedElt = i;
7166 }
7167 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7168 NumElems &&
7169 "Incomplete element masks");
7170
7171 // Handle Special Cases - all undef or undef/zero.
7172 if (UndefMask.popcount() == NumElems)
7173 return DAG.getUNDEF(VT);
7174 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7175 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7176 : DAG.getConstantFP(0.0, DL, VT);
7177
7178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7179 int FirstLoadedElt = LoadMask.countr_zero();
7180 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7181 EVT EltBaseVT = EltBase.getValueType();
7182 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7183 "Register/Memory size mismatch");
7184 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7185 assert(LDBase && "Did not find base load for merging consecutive loads");
7186 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7187 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7188 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7189 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7190 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7191
7192 // TODO: Support offsetting the base load.
7193 if (ByteOffsets[FirstLoadedElt] != 0)
7194 return SDValue();
7195
7196 // Check to see if the element's load is consecutive to the base load
7197 // or offset from a previous (already checked) load.
7198 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7199 LoadSDNode *Ld = Loads[EltIdx];
7200 int64_t ByteOffset = ByteOffsets[EltIdx];
7201 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7202 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7203 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7204 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7205 }
7206 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7207 EltIdx - FirstLoadedElt);
7208 };
7209
7210 // Consecutive loads can contain UNDEFs but not ZERO elements.
7211 // Consecutive loads with UNDEF and ZERO elements require an
7212 // additional shuffle stage to clear the ZERO elements.
7213 bool IsConsecutiveLoad = true;
7214 bool IsConsecutiveLoadWithZeros = true;
7215 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7216 if (LoadMask[i]) {
7217 if (!CheckConsecutiveLoad(LDBase, i)) {
7218 IsConsecutiveLoad = false;
7219 IsConsecutiveLoadWithZeros = false;
7220 break;
7221 }
7222 } else if (ZeroMask[i]) {
7223 IsConsecutiveLoad = false;
7224 }
7225 }
7226
7227 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7228 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7229 assert(LDBase->isSimple() &&
7230 "Cannot merge volatile or atomic loads.");
7231 SDValue NewLd =
7232 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7233 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7234 MMOFlags);
7235 for (auto *LD : Loads)
7236 if (LD)
7237 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7238 return NewLd;
7239 };
7240
7241 // Check if the base load is entirely dereferenceable.
7242 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7243 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7244
7245 // LOAD - all consecutive load/undefs (must start/end with a load or be
7246 // entirely dereferenceable). If we have found an entire vector of loads and
7247 // undefs, then return a large load of the entire vector width starting at the
7248 // base pointer. If the vector contains zeros, then attempt to shuffle those
7249 // elements.
7250 if (FirstLoadedElt == 0 &&
7251 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7252 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7253 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7254 return SDValue();
7255
7256 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7257 // will lower to regular temporal loads and use the cache.
7258 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7259 VT.is256BitVector() && !Subtarget.hasInt256())
7260 return SDValue();
7261
7262 if (NumElems == 1)
7263 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7264
7265 if (!ZeroMask)
7266 return CreateLoad(VT, LDBase);
7267
7268 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7269 // vector and a zero vector to clear out the zero elements.
7270 if (!IsAfterLegalize && VT.isVector()) {
7271 unsigned NumMaskElts = VT.getVectorNumElements();
7272 if ((NumMaskElts % NumElems) == 0) {
7273 unsigned Scale = NumMaskElts / NumElems;
7274 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7275 for (unsigned i = 0; i < NumElems; ++i) {
7276 if (UndefMask[i])
7277 continue;
7278 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7279 for (unsigned j = 0; j != Scale; ++j)
7280 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7281 }
7282 SDValue V = CreateLoad(VT, LDBase);
7283 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7284 : DAG.getConstantFP(0.0, DL, VT);
7285 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7286 }
7287 }
7288 }
7289
7290 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7291 if (VT.is256BitVector() || VT.is512BitVector()) {
7292 unsigned HalfNumElems = NumElems / 2;
7293 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7294 EVT HalfVT =
7295 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7296 SDValue HalfLD =
7297 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7298 DAG, Subtarget, IsAfterLegalize);
7299 if (HalfLD)
7300 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7301 HalfLD, DAG.getVectorIdxConstant(0, DL));
7302 }
7303 }
7304
7305 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7306 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7307 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7308 LoadSizeInBits == 64) &&
7309 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7310 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7311 : MVT::getIntegerVT(LoadSizeInBits);
7312 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7313 // Allow v4f32 on SSE1 only targets.
7314 // FIXME: Add more isel patterns so we can just use VT directly.
7315 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7316 VecVT = MVT::v4f32;
7317 if (TLI.isTypeLegal(VecVT)) {
7318 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7319 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7320 SDValue ResNode = DAG.getMemIntrinsicNode(
7321 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7323 for (auto *LD : Loads)
7324 if (LD)
7325 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7326 return DAG.getBitcast(VT, ResNode);
7327 }
7328 }
7329
7330 // BROADCAST - match the smallest possible repetition pattern, load that
7331 // scalar/subvector element and then broadcast to the entire vector.
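// For example, a v8i32 build vector whose elements repeat
// <(load [p+0]), (load [p+4])> four times is matched as a single 64-bit
// scalar load that is then broadcast to all four 64-bit lanes.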
7332 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7333 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7334 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7335 unsigned RepeatSize = SubElems * BaseSizeInBits;
7336 unsigned ScalarSize = std::min(RepeatSize, 64u);
7337 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7338 continue;
7339
7340 // Don't attempt a 1:N subvector broadcast - it should be caught by
7341 // combineConcatVectorOps, else it will cause infinite loops.
7342 if (RepeatSize > ScalarSize && SubElems == 1)
7343 continue;
7344
7345 bool Match = true;
7346 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7347 for (unsigned i = 0; i != NumElems && Match; ++i) {
7348 if (!LoadMask[i])
7349 continue;
7350 SDValue Elt = peekThroughBitcasts(Elts[i]);
7351 if (RepeatedLoads[i % SubElems].isUndef())
7352 RepeatedLoads[i % SubElems] = Elt;
7353 else
7354 Match &= (RepeatedLoads[i % SubElems] == Elt);
7355 }
7356
7357 // We must have loads at both ends of the repetition.
7358 Match &= !RepeatedLoads.front().isUndef();
7359 Match &= !RepeatedLoads.back().isUndef();
7360 if (!Match)
7361 continue;
7362
7363 EVT RepeatVT =
7364 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7365 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7366 : EVT::getFloatingPointVT(ScalarSize);
7367 if (RepeatSize > ScalarSize)
7368 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7369 RepeatSize / ScalarSize);
7370 EVT BroadcastVT =
7371 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7372 VT.getSizeInBits() / ScalarSize);
7373 if (TLI.isTypeLegal(BroadcastVT)) {
7374 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7375 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7376 SDValue Broadcast = RepeatLoad;
7377 if (RepeatSize > ScalarSize) {
7378 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7379 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7380 } else {
7381 if (!Subtarget.hasAVX2() &&
7382 !X86::mayFoldLoadIntoBroadcastFromMem(
7383 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7384 Subtarget,
7385 /*AssumeSingleUse=*/true))
7386 return SDValue();
7387 Broadcast =
7388 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7389 }
7390 return DAG.getBitcast(VT, Broadcast);
7391 }
7392 }
7393 }
7394 }
7395
7396 return SDValue();
7397}
7398
7399 // Combine vector ops (shuffles etc.) that are equal to build_vector load1,
7400 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7401 // are consecutive, non-overlapping, and in the right order.
7402 static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7403 SelectionDAG &DAG,
7404 const X86Subtarget &Subtarget,
7405 bool IsAfterLegalize) {
7406 SmallVector<SDValue, 64> Elts;
7407 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7408 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7409 Elts.push_back(Elt);
7410 continue;
7411 }
7412 return SDValue();
7413 }
7414 assert(Elts.size() == VT.getVectorNumElements());
7415 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7416 IsAfterLegalize);
7417}
7418
7419 static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7420 const APInt &Undefs, LLVMContext &C) {
7421 unsigned ScalarSize = VT.getScalarSizeInBits();
7422 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7423
7424 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7425 if (VT.isFloatingPoint()) {
7426 if (ScalarSize == 16)
7427 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7428 if (ScalarSize == 32)
7429 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7430 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7431 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7432 }
7433 return Constant::getIntegerValue(Ty, Val);
7434 };
7435
7436 SmallVector<Constant *, 32> ConstantVec;
7437 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7438 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7439 : getConstantScalar(Bits[I]));
7440
7441 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7442}
7443
7444static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7445 unsigned SplatBitSize, LLVMContext &C) {
7446 unsigned ScalarSize = VT.getScalarSizeInBits();
7447
7448 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7449 if (VT.isFloatingPoint()) {
7450 if (ScalarSize == 16)
7451 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7452 if (ScalarSize == 32)
7453 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7454 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7455 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7456 }
7457 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7458 };
7459
7460 if (ScalarSize == SplatBitSize)
7461 return getConstantScalar(SplatValue);
7462
7463 unsigned NumElm = SplatBitSize / ScalarSize;
7464 SmallVector<Constant *, 32> ConstantVec;
7465 for (unsigned I = 0; I != NumElm; ++I) {
7466 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7467 ConstantVec.push_back(getConstantScalar(Val));
7468 }
7469 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7470}
7471
7472 static bool isFoldableUseOfShuffle(SDNode *N) {
7473 for (auto *U : N->users()) {
7474 unsigned Opc = U->getOpcode();
7475 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7476 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7477 return false;
7478 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7479 return false;
7480 if (isTargetShuffle(Opc))
7481 return true;
7482 if (Opc == ISD::BITCAST) // Ignore bitcasts
7483 return isFoldableUseOfShuffle(U);
7484 if (N->hasOneUse()) {
7485 // TODO: there may be some general way to know if an SDNode can
7486 // be folded. We now only know whether an MI is foldable.
7487 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7488 return false;
7489 return true;
7490 }
7491 }
7492 return false;
7493}
7494
7495/// Attempt to use the vbroadcast instruction to generate a splat value
7496/// from a splat BUILD_VECTOR which uses:
7497/// a. A single scalar load, or a constant.
7498/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7499///
7500/// The VBROADCAST node is returned when a pattern is found,
7501/// or SDValue() otherwise.
7502 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7503 const SDLoc &dl,
7504 const X86Subtarget &Subtarget,
7505 SelectionDAG &DAG) {
7506 // VBROADCAST requires AVX.
7507 // TODO: Splats could be generated for non-AVX CPUs using SSE
7508 // instructions, but there's less potential gain for only 128-bit vectors.
7509 if (!Subtarget.hasAVX())
7510 return SDValue();
7511
7512 MVT VT = BVOp->getSimpleValueType(0);
7513 unsigned NumElts = VT.getVectorNumElements();
7514 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7515 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7516 "Unsupported vector type for broadcast.");
7517
7518 // See if the build vector is a repeating sequence of scalars (inc. splat).
7519 SDValue Ld;
7520 BitVector UndefElements;
7521 SmallVector<SDValue, 16> Sequence;
7522 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7523 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7524 if (Sequence.size() == 1)
7525 Ld = Sequence[0];
7526 }
7527
7528 // Attempt to use VBROADCASTM
7529 // From this pattern:
7530 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7531 // b. t1 = (build_vector t0 t0)
7532 //
7533 // Create (VBROADCASTM v2i1 X)
7534 if (!Sequence.empty() && Subtarget.hasCDI()) {
7535 // If not a splat, are the upper sequence values zeroable?
7536 unsigned SeqLen = Sequence.size();
7537 bool UpperZeroOrUndef =
7538 SeqLen == 1 ||
7539 llvm::all_of(ArrayRef(Sequence).drop_front(),
7540 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7541 SDValue Op0 = Sequence[0];
7542 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7543 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7544 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7545 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7546 ? Op0.getOperand(0)
7547 : Op0.getOperand(0).getOperand(0);
7548 MVT MaskVT = BOperand.getSimpleValueType();
7549 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7550 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7551 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7552 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7553 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7554 unsigned Scale = 512 / VT.getSizeInBits();
7555 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7556 }
7557 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7558 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7559 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7560 return DAG.getBitcast(VT, Bcst);
7561 }
7562 }
7563 }
7564
7565 unsigned NumUndefElts = UndefElements.count();
7566 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7567 APInt SplatValue, Undef;
7568 unsigned SplatBitSize;
7569 bool HasUndef;
7570 // Check if this is a repeated constant pattern suitable for broadcasting.
7571 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7572 SplatBitSize > VT.getScalarSizeInBits() &&
7573 SplatBitSize < VT.getSizeInBits()) {
7574 // Avoid replacing with broadcast when it's a use of a shuffle
7575 // instruction to preserve the present custom lowering of shuffles.
7576 if (isFoldableUseOfShuffle(BVOp))
7577 return SDValue();
7578 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
7579 LLVMContext *Ctx = DAG.getContext();
7580 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7581 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7582 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7583 // Load the constant scalar/subvector and broadcast it.
7584 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7585 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7586 SDValue CP = DAG.getConstantPool(C, PVT);
7587 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7588
7589 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7590 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7591 SDValue Ops[] = {DAG.getEntryNode(), CP};
7592 MachinePointerInfo MPI =
7593 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7594 SDValue Brdcst =
7595 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7596 MPI, Alignment, MachineMemOperand::MOLoad);
7597 return DAG.getBitcast(VT, Brdcst);
7598 }
7599 if (SplatBitSize > 64) {
7600 // Load the vector of constants and broadcast it.
7601 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7602 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7603 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7604 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7605 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7606 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7607 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7608 MachinePointerInfo MPI =
7609 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7610 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7611 Ops, VVT, MPI, Alignment,
7612 MachineMemOperand::MOLoad);
7613 }
7614 }
7615
7616 // If we are moving a scalar into a vector (Ld must be set and all elements
7617 // but 1 are undef) and that operation is not obviously supported by
7618 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7619 // That's better than general shuffling and may eliminate a load to GPR and
7620 // move from scalar to vector register.
7621 if (!Ld || NumElts - NumUndefElts != 1)
7622 return SDValue();
7623 unsigned ScalarSize = Ld.getValueSizeInBits();
7624 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7625 return SDValue();
7626 }
7627
7628 bool ConstSplatVal =
7629 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7630 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7631
7632 // TODO: Handle broadcasts of non-constant sequences.
7633
7634 // Make sure that all of the users of a non-constant load are from the
7635 // BUILD_VECTOR node.
7636 // FIXME: Is the use count needed for non-constant, non-load case?
7637 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7638 return SDValue();
7639
7640 unsigned ScalarSize = Ld.getValueSizeInBits();
7641 bool IsGE256 = (VT.getSizeInBits() >= 256);
7642
7643 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7644 // instruction to save 8 or more bytes of constant pool data.
7645 // TODO: If multiple splats are generated to load the same constant,
7646 // it may be detrimental to overall size. There needs to be a way to detect
7647 // that condition to know if this is truly a size win.
7648 bool OptForSize = DAG.shouldOptForSize();
7649
7650 // Handle broadcasting a single constant scalar from the constant pool
7651 // into a vector.
7652 // On Sandybridge (no AVX2), it is still better to load a constant vector
7653 // from the constant pool and not to broadcast it from a scalar.
7654 // But override that restriction when optimizing for size.
7655 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7656 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7657 EVT CVT = Ld.getValueType();
7658 assert(!CVT.isVector() && "Must not broadcast a vector type");
7659
7660 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7661 // For size optimization, also splat v2f64 and v2i64, and for size opt
7662 // with AVX2, also splat i8 and i16.
7663 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7664 if (ScalarSize == 32 ||
7665 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7666 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7667 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7668 const Constant *C = nullptr;
7669 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7670 C = CI->getConstantIntValue();
7671 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7672 C = CF->getConstantFPValue();
7673
7674 assert(C && "Invalid constant type");
7675
7676 SDValue CP =
7677 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7678 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7679
7680 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7681 SDValue Ops[] = {DAG.getEntryNode(), CP};
7682 MachinePointerInfo MPI =
7683 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7684 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7685 MPI, Alignment, MachineMemOperand::MOLoad);
7686 }
7687 }
7688
7689 // Handle AVX2 in-register broadcasts.
7690 if (!IsLoad && Subtarget.hasInt256() &&
7691 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7692 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7693
7694 // The scalar source must be a normal load.
7695 if (!IsLoad)
7696 return SDValue();
7697
7698 // Make sure the non-chain result is only used by this build vector.
7699 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7700 return SDValue();
7701
7702 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7703 (Subtarget.hasVLX() && ScalarSize == 64)) {
7704 auto *LN = cast<LoadSDNode>(Ld);
7705 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7706 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7707 SDValue BCast =
7708 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7709 LN->getMemoryVT(), LN->getMemOperand());
7710 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7711 return BCast;
7712 }
7713
7714 // The integer check is needed for the 64-bit element into 128-bit vector case,
7715 // so it doesn't match double, since there is no vbroadcastsd xmm instruction.
7716 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7717 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7718 auto *LN = cast<LoadSDNode>(Ld);
7719 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7720 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7721 SDValue BCast =
7722 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7723 LN->getMemoryVT(), LN->getMemOperand());
7724 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7725 return BCast;
7726 }
7727
7728 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7729 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7730
7731 // Unsupported broadcast.
7732 return SDValue();
7733}
7734
7735/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7736/// underlying vector and index.
7737///
7738/// Modifies \p ExtractedFromVec to the real vector and returns the real
7739/// index.
7740static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7741 SDValue ExtIdx) {
7742 int Idx = ExtIdx->getAsZExtVal();
7743 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7744 return Idx;
7745
7746 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7747 // lowered this:
7748 // (extract_vector_elt (v8f32 %1), Constant<6>)
7749 // to:
7750 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7751 // (extract_subvector (v8f32 %0), Constant<4>),
7752 // undef)
7753 // Constant<0>)
7754 // In this case the vector is the extract_subvector expression and the index
7755 // is 2, as specified by the shuffle.
7756 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7757 SDValue ShuffleVec = SVOp->getOperand(0);
7758 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7759 assert(ShuffleVecVT.getVectorElementType() ==
7760 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7761
7762 int ShuffleIdx = SVOp->getMaskElt(Idx);
7763 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7764 ExtractedFromVec = ShuffleVec;
7765 return ShuffleIdx;
7766 }
7767 return Idx;
7768}
7769
7770 static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7771 SelectionDAG &DAG) {
7772 MVT VT = Op.getSimpleValueType();
7773
7774 // Skip if insert_vec_elt is not supported.
7775 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7776 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7777 return SDValue();
7778
7779 unsigned NumElems = Op.getNumOperands();
7780 SDValue VecIn1;
7781 SDValue VecIn2;
7782 SmallVector<unsigned, 4> InsertIndices;
7783 SmallVector<int, 8> Mask(NumElems, -1);
7784
7785 for (unsigned i = 0; i != NumElems; ++i) {
7786 unsigned Opc = Op.getOperand(i).getOpcode();
7787
7788 if (Opc == ISD::UNDEF)
7789 continue;
7790
7791 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7792 // Quit if more than 1 element needs inserting.
7793 if (InsertIndices.size() > 1)
7794 return SDValue();
7795
7796 InsertIndices.push_back(i);
7797 continue;
7798 }
7799
7800 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7801 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7802
7803 // Quit if non-constant index.
7804 if (!isa<ConstantSDNode>(ExtIdx))
7805 return SDValue();
7806 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7807
7808 // Quit if extracted from vector of different type.
7809 if (ExtractedFromVec.getValueType() != VT)
7810 return SDValue();
7811
7812 if (!VecIn1.getNode())
7813 VecIn1 = ExtractedFromVec;
7814 else if (VecIn1 != ExtractedFromVec) {
7815 if (!VecIn2.getNode())
7816 VecIn2 = ExtractedFromVec;
7817 else if (VecIn2 != ExtractedFromVec)
7818 // Quit if there are more than 2 vectors to shuffle.
7819 return SDValue();
7820 }
7821
7822 if (ExtractedFromVec == VecIn1)
7823 Mask[i] = Idx;
7824 else if (ExtractedFromVec == VecIn2)
7825 Mask[i] = Idx + NumElems;
7826 }
7827
7828 if (!VecIn1.getNode())
7829 return SDValue();
7830
7831 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7832 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7833
7834 for (unsigned Idx : InsertIndices)
7835 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7836 DAG.getVectorIdxConstant(Idx, DL));
7837
7838 return NV;
7839}
7840
7841// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7842 static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7843 const X86Subtarget &Subtarget) {
7844 MVT VT = Op.getSimpleValueType();
7845 MVT IVT =
7846 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7847 SmallVector<SDValue, 16> NewOps;
7848 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7849 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7850 Op.getOperand(I)));
7851 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7852 return DAG.getBitcast(VT, Res);
7853}
7854
7855// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7856 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7857 SelectionDAG &DAG,
7858 const X86Subtarget &Subtarget) {
7859
7860 MVT VT = Op.getSimpleValueType();
7861 assert((VT.getVectorElementType() == MVT::i1) &&
7862 "Unexpected type in LowerBUILD_VECTORvXi1!");
7863 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7864 ISD::isBuildVectorAllOnes(Op.getNode()))
7865 return Op;
7866
7867 uint64_t Immediate = 0;
7868 SmallVector<unsigned, 16> NonConstIdx;
7869 bool IsSplat = true;
7870 bool HasConstElts = false;
7871 int SplatIdx = -1;
7872 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7873 SDValue In = Op.getOperand(idx);
7874 if (In.isUndef())
7875 continue;
7876 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7877 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7878 HasConstElts = true;
7879 } else {
7880 NonConstIdx.push_back(idx);
7881 }
7882 if (SplatIdx < 0)
7883 SplatIdx = idx;
7884 else if (In != Op.getOperand(SplatIdx))
7885 IsSplat = false;
7886 }
7887
7888 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
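// For example, a v16i1 splat of %b becomes
// (v16i1 (bitcast (select %b, i16 -1, i16 0))), so the whole mask is
// materialized with a single cmov.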
7889 if (IsSplat) {
7890 // The build_vector allows the scalar element to be larger than the vector
7891 // element type. We need to mask it to use as a condition unless we know
7892 // the upper bits are zero.
7893 // FIXME: Use computeKnownBits instead of checking specific opcode?
7894 SDValue Cond = Op.getOperand(SplatIdx);
7895 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7896 if (Cond.getOpcode() != ISD::SETCC)
7897 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7898 DAG.getConstant(1, dl, MVT::i8));
7899
7900 // Perform the select in the scalar domain so we can use cmov.
7901 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7902 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7903 DAG.getAllOnesConstant(dl, MVT::i32),
7904 DAG.getConstant(0, dl, MVT::i32));
7905 Select = DAG.getBitcast(MVT::v32i1, Select);
7906 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7907 } else {
7908 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7909 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7910 DAG.getAllOnesConstant(dl, ImmVT),
7911 DAG.getConstant(0, dl, ImmVT));
7912 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7913 Select = DAG.getBitcast(VecVT, Select);
7914 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7915 DAG.getVectorIdxConstant(0, dl));
7916 }
7917 }
7918
7919 // Insert the non-constant elements one by one.
7920 SDValue DstVec;
7921 if (HasConstElts) {
7922 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7923 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7924 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7925 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7926 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7927 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7928 } else {
7929 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7930 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7931 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7932 DstVec = DAG.getBitcast(VecVT, Imm);
7933 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7934 DAG.getVectorIdxConstant(0, dl));
7935 }
7936 } else
7937 DstVec = DAG.getUNDEF(VT);
7938
7939 for (unsigned InsertIdx : NonConstIdx) {
7940 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7941 Op.getOperand(InsertIdx),
7942 DAG.getVectorIdxConstant(InsertIdx, dl));
7943 }
7944 return DstVec;
7945}
7946
7947LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7948 switch (Opcode) {
7949 case X86ISD::PACKSS:
7950 case X86ISD::PACKUS:
7951 case X86ISD::FHADD:
7952 case X86ISD::FHSUB:
7953 case X86ISD::HADD:
7954 case X86ISD::HSUB:
7955 return true;
7956 }
7957 return false;
7958}
7959
7960/// This is a helper function of LowerToHorizontalOp().
7961 /// This function checks whether the input build_vector \p N implements a
7962/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
7963/// may not match the layout of an x86 256-bit horizontal instruction.
7964/// In other words, if this returns true, then some extraction/insertion will
7965/// be required to produce a valid horizontal instruction.
7966///
7967/// Parameter \p Opcode defines the kind of horizontal operation to match.
7968/// For example, if \p Opcode is equal to ISD::ADD, then this function
7969/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7970/// is equal to ISD::SUB, then this function checks if this is a horizontal
7971/// arithmetic sub.
7972///
7973/// This function only analyzes elements of \p N whose indices are
7974/// in range [BaseIdx, LastIdx).
7975///
7976/// TODO: This function was originally used to match both real and fake partial
7977/// horizontal operations, but the index-matching logic is incorrect for that.
7978/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7979/// code because it is only used for partial h-op matching now?
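/// For example, with \p Opcode == ISD::ADD and the range [0, 4), the elements
///   (add (extractelt A, 0), (extractelt A, 1)),
///   (add (extractelt A, 2), (extractelt A, 3)),
///   (add (extractelt B, 0), (extractelt B, 1)),
///   (add (extractelt B, 2), (extractelt B, 3))
/// are matched with \p V0 = A and \p V1 = B.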
7980static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7981 const SDLoc &DL, SelectionDAG &DAG,
7982 unsigned BaseIdx, unsigned LastIdx,
7983 SDValue &V0, SDValue &V1) {
7984 EVT VT = N->getValueType(0);
7985 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7986 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7987 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7988 "Invalid Vector in input!");
7989
7990 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7991 bool CanFold = true;
7992 unsigned ExpectedVExtractIdx = BaseIdx;
7993 unsigned NumElts = LastIdx - BaseIdx;
7994 V0 = DAG.getUNDEF(VT);
7995 V1 = DAG.getUNDEF(VT);
7996
7997 // Check if N implements a horizontal binop.
7998 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7999 SDValue Op = N->getOperand(i + BaseIdx);
8000
8001 // Skip UNDEFs.
8002 if (Op->isUndef()) {
8003 // Update the expected vector extract index.
8004 if (i * 2 == NumElts)
8005 ExpectedVExtractIdx = BaseIdx;
8006 ExpectedVExtractIdx += 2;
8007 continue;
8008 }
8009
8010 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
8011
8012 if (!CanFold)
8013 break;
8014
8015 SDValue Op0 = Op.getOperand(0);
8016 SDValue Op1 = Op.getOperand(1);
8017
8018 // Try to match the following pattern:
8019 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8020 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8021 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8022 Op0.getOperand(0) == Op1.getOperand(0) &&
8023 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8024 isa<ConstantSDNode>(Op1.getOperand(1)));
8025 if (!CanFold)
8026 break;
8027
8028 unsigned I0 = Op0.getConstantOperandVal(1);
8029 unsigned I1 = Op1.getConstantOperandVal(1);
8030
8031 if (i * 2 < NumElts) {
8032 if (V0.isUndef()) {
8033 V0 = Op0.getOperand(0);
8034 if (V0.getValueType() != VT)
8035 return false;
8036 }
8037 } else {
8038 if (V1.isUndef()) {
8039 V1 = Op0.getOperand(0);
8040 if (V1.getValueType() != VT)
8041 return false;
8042 }
8043 if (i * 2 == NumElts)
8044 ExpectedVExtractIdx = BaseIdx;
8045 }
8046
8047 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8048 if (I0 == ExpectedVExtractIdx)
8049 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8050 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8051 // Try to match the following dag sequence:
8052 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8053 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8054 } else
8055 CanFold = false;
8056
8057 ExpectedVExtractIdx += 2;
8058 }
8059
8060 return CanFold;
8061}
8062
8063/// Emit a sequence of two 128-bit horizontal add/sub followed by
8064/// a concat_vector.
8065///
8066/// This is a helper function of LowerToHorizontalOp().
8067/// This function expects two 256-bit vectors called V0 and V1.
8068/// At first, each vector is split into two separate 128-bit vectors.
8069/// Then, the resulting 128-bit vectors are used to implement two
8070/// horizontal binary operations.
8071///
8072/// The kind of horizontal binary operation is defined by \p X86Opcode.
8073///
8074 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
8075 /// the two new horizontal binops.
8076/// When Mode is set, the first horizontal binop dag node would take as input
8077/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8078/// horizontal binop dag node would take as input the lower 128-bit of V1
8079/// and the upper 128-bit of V1.
8080/// Example:
8081/// HADD V0_LO, V0_HI
8082/// HADD V1_LO, V1_HI
8083///
8084/// Otherwise, the first horizontal binop dag node takes as input the lower
8085/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8086/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8087/// Example:
8088/// HADD V0_LO, V1_LO
8089/// HADD V0_HI, V1_HI
8090///
8091/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8092/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8093/// the upper 128-bits of the result.
8094static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8095 const SDLoc &DL, SelectionDAG &DAG,
8096 unsigned X86Opcode, bool Mode,
8097 bool isUndefLO, bool isUndefHI) {
8098 MVT VT = V0.getSimpleValueType();
8099 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8100 "Invalid nodes in input!");
8101
8102 unsigned NumElts = VT.getVectorNumElements();
8103 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8104 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8105 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8106 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8107 MVT NewVT = V0_LO.getSimpleValueType();
8108
8109 SDValue LO = DAG.getUNDEF(NewVT);
8110 SDValue HI = DAG.getUNDEF(NewVT);
8111
8112 if (Mode) {
8113 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8114 if (!isUndefLO && !V0->isUndef())
8115 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8116 if (!isUndefHI && !V1->isUndef())
8117 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8118 } else {
8119 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8120 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8121 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8122
8123 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8124 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8125 }
8126
8127 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8128}
8129
8130/// Returns true iff \p BV builds a vector with the result equivalent to
8131/// the result of ADDSUB/SUBADD operation.
8132/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8133/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8134/// \p Opnd0 and \p Opnd1.
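/// For example, the v4f32 build_vector
///   <(fsub (extractelt A, 0), (extractelt B, 0)),
///    (fadd (extractelt A, 1), (extractelt B, 1)),
///    (fsub (extractelt A, 2), (extractelt B, 2)),
///    (fadd (extractelt A, 3), (extractelt B, 3))>
/// is recognized as ADDSUB(A, B) with \p IsSubAdd set to false.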
8135 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8136 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8137 SDValue &Opnd0, SDValue &Opnd1,
8138 unsigned &NumExtracts,
8139 bool &IsSubAdd) {
8140
8141 MVT VT = BV->getSimpleValueType(0);
8142 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8143 return false;
8144
8145 unsigned NumElts = VT.getVectorNumElements();
8146 SDValue InVec0 = DAG.getUNDEF(VT);
8147 SDValue InVec1 = DAG.getUNDEF(VT);
8148
8149 NumExtracts = 0;
8150
8151 // Odd-numbered elements in the input build vector are obtained from
8152 // adding/subtracting two integer/float elements.
8153 // Even-numbered elements in the input build vector are obtained from
8154 // subtracting/adding two integer/float elements.
8155 unsigned Opc[2] = {0, 0};
8156 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8157 SDValue Op = BV->getOperand(i);
8158
8159 // Skip 'undef' values.
8160 unsigned Opcode = Op.getOpcode();
8161 if (Opcode == ISD::UNDEF)
8162 continue;
8163
8164 // Early exit if we found an unexpected opcode.
8165 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8166 return false;
8167
8168 SDValue Op0 = Op.getOperand(0);
8169 SDValue Op1 = Op.getOperand(1);
8170
8171 // Try to match the following pattern:
8172 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8173 // Early exit if we cannot match that sequence.
8174 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8175 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8176 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8177 Op0.getOperand(1) != Op1.getOperand(1))
8178 return false;
8179
8180 unsigned I0 = Op0.getConstantOperandVal(1);
8181 if (I0 != i)
8182 return false;
8183
8184 // We found a valid add/sub node; make sure it's the same opcode as previous
8185 // elements for this parity.
8186 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8187 return false;
8188 Opc[i % 2] = Opcode;
8189
8190 // Update InVec0 and InVec1.
8191 if (InVec0.isUndef()) {
8192 InVec0 = Op0.getOperand(0);
8193 if (InVec0.getSimpleValueType() != VT)
8194 return false;
8195 }
8196 if (InVec1.isUndef()) {
8197 InVec1 = Op1.getOperand(0);
8198 if (InVec1.getSimpleValueType() != VT)
8199 return false;
8200 }
8201
8202 // Make sure that the input operands to each add/sub node always
8203 // come from the same pair of vectors.
8204 if (InVec0 != Op0.getOperand(0)) {
8205 if (Opcode == ISD::FSUB)
8206 return false;
8207
8208 // FADD is commutable. Try to commute the operands
8209 // and then test again.
8210 std::swap(Op0, Op1);
8211 if (InVec0 != Op0.getOperand(0))
8212 return false;
8213 }
8214
8215 if (InVec1 != Op1.getOperand(0))
8216 return false;
8217
8218 // Increment the number of extractions done.
8219 ++NumExtracts;
8220 }
8221
8222 // Ensure we have found an opcode for both parities and that they are
8223 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8224 // inputs are undef.
8225 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8226 InVec0.isUndef() || InVec1.isUndef())
8227 return false;
8228
8229 IsSubAdd = Opc[0] == ISD::FADD;
8230
8231 Opnd0 = InVec0;
8232 Opnd1 = InVec1;
8233 return true;
8234}
8235
8236 /// Returns true if it is possible to fold MUL and an idiom that has already been
8237/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8238/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8239/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8240///
8241/// Prior to calling this function it should be known that there is some
8242/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8243/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8244/// before replacement of such SDNode with ADDSUB operation. Thus the number
8245/// of \p Opnd0 uses is expected to be equal to 2.
8246/// For example, this function may be called for the following IR:
8247/// %AB = fmul fast <2 x double> %A, %B
8248/// %Sub = fsub fast <2 x double> %AB, %C
8249/// %Add = fadd fast <2 x double> %AB, %C
8250/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8251/// <2 x i32> <i32 0, i32 3>
8252/// There is a def for %Addsub here, which potentially can be replaced by
8253/// X86ISD::ADDSUB operation:
8254/// %Addsub = X86ISD::ADDSUB %AB, %C
8255/// and such ADDSUB can further be replaced with FMADDSUB:
8256/// %Addsub = FMADDSUB %A, %B, %C.
8257///
8258/// The main reason why this method is called before the replacement of the
8259/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8260/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8261/// FMADDSUB is.
8262static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8263 SelectionDAG &DAG,
8264 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8265 unsigned ExpectedUses) {
8266 if (Opnd0.getOpcode() != ISD::FMUL ||
8267 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8268 return false;
8269
8270 // FIXME: These checks must match the similar ones in
8271 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8272 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8273 // or MUL + ADDSUB to FMADDSUB.
8274 const TargetOptions &Options = DAG.getTarget().Options;
8275 bool AllowFusion =
8276 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8277 if (!AllowFusion)
8278 return false;
8279
8280 Opnd2 = Opnd1;
8281 Opnd1 = Opnd0.getOperand(1);
8282 Opnd0 = Opnd0.getOperand(0);
8283
8284 return true;
8285}
8286
8287 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
8288 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
8289 /// X86ISD::FMSUBADD node.
8290 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8291 const SDLoc &DL,
8292 const X86Subtarget &Subtarget,
8293 SelectionDAG &DAG) {
8294 SDValue Opnd0, Opnd1;
8295 unsigned NumExtracts;
8296 bool IsSubAdd;
8297 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8298 IsSubAdd))
8299 return SDValue();
8300
8301 MVT VT = BV->getSimpleValueType(0);
8302
8303 // Try to generate X86ISD::FMADDSUB node here.
8304 SDValue Opnd2;
8305 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8306 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8307 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8308 }
8309
8310 // We only support ADDSUB.
8311 if (IsSubAdd)
8312 return SDValue();
8313
8314 // There are no known X86 targets with 512-bit ADDSUB instructions!
8315 // Convert to blend(fsub,fadd).
8316 if (VT.is512BitVector()) {
8317 SmallVector<int> Mask;
8318 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8319 Mask.push_back(I);
8320 Mask.push_back(I + E + 1);
8321 }
8322 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8323 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8324 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8325 }
8326
8327 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8328}
8329
8330 static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8331 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8332 // Initialize outputs to known values.
8333 MVT VT = BV->getSimpleValueType(0);
8334 HOpcode = ISD::DELETED_NODE;
8335 V0 = DAG.getUNDEF(VT);
8336 V1 = DAG.getUNDEF(VT);
8337
8338 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8339 // half of the result is calculated independently from the 128-bit halves of
8340 // the inputs, so that makes the index-checking logic below more complicated.
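// e.g. for a v8i32 HADD of A and B the build_vector operands must be
//   <A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7>
// so the choice between V0 and V1 depends on which 64-bit half of each
// 128-bit result chunk is being produced.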
8341 unsigned NumElts = VT.getVectorNumElements();
8342 unsigned GenericOpcode = ISD::DELETED_NODE;
8343 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8344 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8345 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8346 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8347 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8348 // Ignore undef elements.
8349 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8350 if (Op.isUndef())
8351 continue;
8352
8353 // If there's an opcode mismatch, we're done.
8354 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8355 return false;
8356
8357 // Initialize horizontal opcode.
8358 if (HOpcode == ISD::DELETED_NODE) {
8359 GenericOpcode = Op.getOpcode();
8360 switch (GenericOpcode) {
8361 // clang-format off
8362 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8363 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8364 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8365 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8366 default: return false;
8367 // clang-format on
8368 }
8369 }
8370
8371 SDValue Op0 = Op.getOperand(0);
8372 SDValue Op1 = Op.getOperand(1);
8373 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8374 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8375 Op0.getOperand(0) != Op1.getOperand(0) ||
8376 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8377 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8378 return false;
8379
8380 // The source vector is chosen based on which 64-bit half of the
8381 // destination vector is being calculated.
8382 if (j < NumEltsIn64Bits) {
8383 if (V0.isUndef())
8384 V0 = Op0.getOperand(0);
8385 } else {
8386 if (V1.isUndef())
8387 V1 = Op0.getOperand(0);
8388 }
8389
8390 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8391 if (SourceVec != Op0.getOperand(0))
8392 return false;
8393
8394 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8395 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8396 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8397 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8398 (j % NumEltsIn64Bits) * 2;
8399 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8400 continue;
8401
8402 // If this is not a commutative op, this does not match.
8403 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8404 return false;
8405
8406 // Addition is commutative, so try swapping the extract indexes.
8407 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8408 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8409 continue;
8410
8411 // Extract indexes do not match horizontal requirement.
8412 return false;
8413 }
8414 }
8415 // We matched. Opcode and operands are returned by reference as arguments.
8416 return true;
8417}
8418
8419 static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8420 const SDLoc &DL, SelectionDAG &DAG,
8421 unsigned HOpcode, SDValue V0, SDValue V1) {
8422 // If either input vector is not the same size as the build vector,
8423 // extract/insert the low bits to the correct size.
8424 // This is free (examples: zmm --> xmm, xmm --> ymm).
8425 MVT VT = BV->getSimpleValueType(0);
8426 unsigned Width = VT.getSizeInBits();
8427 if (V0.getValueSizeInBits() > Width)
8428 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8429 else if (V0.getValueSizeInBits() < Width)
8430 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8431
8432 if (V1.getValueSizeInBits() > Width)
8433 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8434 else if (V1.getValueSizeInBits() < Width)
8435 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8436
8437 unsigned NumElts = VT.getVectorNumElements();
8438 APInt DemandedElts = APInt::getAllOnes(NumElts);
8439 for (unsigned i = 0; i != NumElts; ++i)
8440 if (BV->getOperand(i).isUndef())
8441 DemandedElts.clearBit(i);
8442
8443 // If we don't need the upper xmm, then perform as a xmm hop.
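// e.g. if only the low 4 elements of a v8i32 result are demanded, emit a
// v4i32 hop on the low 128-bit halves and widen it back with undef upper
// elements.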
8444 unsigned HalfNumElts = NumElts / 2;
8445 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8446 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8447 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8448 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8449 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8450 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8451 }
8452
8453 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8454}
8455
8456/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8457 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8458 const X86Subtarget &Subtarget,
8459 SelectionDAG &DAG) {
8460 // We need at least 2 non-undef elements to make this worthwhile by default.
8461 unsigned NumNonUndefs =
8462 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8463 if (NumNonUndefs < 2)
8464 return SDValue();
8465
8466 // There are 4 sets of horizontal math operations distinguished by type:
8467 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8468 // subtarget feature. Try to match those "native" patterns first.
8469 MVT VT = BV->getSimpleValueType(0);
8470 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8471 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8472 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8473 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8474 unsigned HOpcode;
8475 SDValue V0, V1;
8476 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8477 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8478 }
8479
8480 // Try harder to match 256-bit ops by using extract/concat.
8481 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8482 return SDValue();
8483
8484 // Count the number of UNDEF operands in the input build_vector.
8485 unsigned NumElts = VT.getVectorNumElements();
8486 unsigned Half = NumElts / 2;
8487 unsigned NumUndefsLO = 0;
8488 unsigned NumUndefsHI = 0;
8489 for (unsigned i = 0, e = Half; i != e; ++i)
8490 if (BV->getOperand(i)->isUndef())
8491 NumUndefsLO++;
8492
8493 for (unsigned i = Half, e = NumElts; i != e; ++i)
8494 if (BV->getOperand(i)->isUndef())
8495 NumUndefsHI++;
8496
8497 SDValue InVec0, InVec1;
8498 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8499 SDValue InVec2, InVec3;
8500 unsigned X86Opcode;
8501 bool CanFold = true;
8502
8503 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8504 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8505 InVec3) &&
8506 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8507 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8508 X86Opcode = X86ISD::HADD;
8509 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8510 InVec1) &&
8511 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8512 InVec3) &&
8513 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8514 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8515 X86Opcode = X86ISD::HSUB;
8516 else
8517 CanFold = false;
8518
8519 if (CanFold) {
8520 // Do not try to expand this build_vector into a pair of horizontal
8521 // add/sub if we can emit a pair of scalar add/sub.
8522 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8523 return SDValue();
8524
8525 // Convert this build_vector into a pair of horizontal binops followed by
8526 // a concat vector. We must adjust the outputs from the partial horizontal
8527 // matching calls above to account for undefined vector halves.
8528 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8529 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8530 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8531 bool isUndefLO = NumUndefsLO == Half;
8532 bool isUndefHI = NumUndefsHI == Half;
8533 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8534 isUndefHI);
8535 }
8536 }
8537
8538 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8539 VT == MVT::v16i16) {
8540 unsigned X86Opcode;
8541 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8542 InVec1))
8543 X86Opcode = X86ISD::HADD;
8544 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8545 InVec1))
8546 X86Opcode = X86ISD::HSUB;
8547 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8548 InVec1))
8549 X86Opcode = X86ISD::FHADD;
8550 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8551 InVec1))
8552 X86Opcode = X86ISD::FHSUB;
8553 else
8554 return SDValue();
8555
8556 // Don't try to expand this build_vector into a pair of horizontal add/sub
8557 // if we can simply emit a pair of scalar add/sub.
8558 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8559 return SDValue();
8560
8561 // Convert this build_vector into two horizontal add/sub followed by
8562 // a concat vector.
8563 bool isUndefLO = NumUndefsLO == Half;
8564 bool isUndefHI = NumUndefsHI == Half;
8565 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8566 isUndefLO, isUndefHI);
8567 }
8568
8569 return SDValue();
8570}
8571
8572static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8573 SelectionDAG &DAG);
8574
8575/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8576/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
8577/// just apply the bit to the vectors.
8578 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
8579 /// from this, but enough scalar bit operations are created by the later
8580 /// legalization + scalarization stages to need basic support.
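/// For example, the v4i32 build_vector
///   <(shl x0, 3), (shl x1, 3), (shl x2, 3), (shl x3, 3)>
/// becomes (shl (build_vector x0, x1, x2, x3), (build_vector 3, 3, 3, 3)),
/// which is then lowered directly as a vector shift.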
8581 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8582 const X86Subtarget &Subtarget,
8583 SelectionDAG &DAG) {
8584 MVT VT = Op->getSimpleValueType(0);
8585 unsigned NumElems = VT.getVectorNumElements();
8586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8587
8588 // Check that all elements have the same opcode.
8589 // TODO: Should we allow UNDEFS and if so how many?
8590 unsigned Opcode = Op->getOperand(0).getOpcode();
8591 for (unsigned i = 1; i < NumElems; ++i)
8592 if (Opcode != Op->getOperand(i).getOpcode())
8593 return SDValue();
8594
8595 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8596 bool IsShift = false;
8597 switch (Opcode) {
8598 default:
8599 return SDValue();
8600 case ISD::SHL:
8601 case ISD::SRL:
8602 case ISD::SRA:
8603 IsShift = true;
8604 break;
8605 case ISD::AND:
8606 case ISD::XOR:
8607 case ISD::OR:
8608 // Don't do this if the buildvector is a splat - we'd replace one
8609 // constant with an entire vector.
8610 if (Op->getSplatValue())
8611 return SDValue();
8612 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8613 return SDValue();
8614 break;
8615 }
8616
8617 SmallVector<SDValue, 4> LHSElts, RHSElts;
8618 for (SDValue Elt : Op->ops()) {
8619 SDValue LHS = Elt.getOperand(0);
8620 SDValue RHS = Elt.getOperand(1);
8621
8622 // We expect the canonicalized RHS operand to be the constant.
8623 if (!isa<ConstantSDNode>(RHS))
8624 return SDValue();
8625
8626 // Extend shift amounts.
8627 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8628 if (!IsShift)
8629 return SDValue();
8630 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8631 }
8632
8633 LHSElts.push_back(LHS);
8634 RHSElts.push_back(RHS);
8635 }
8636
8637 // Limit to shifts by uniform immediates.
8638 // TODO: Only accept vXi8/vXi64 special cases?
8639 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8640 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8641 return SDValue();
8642
8643 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8644 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8645 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8646
8647 if (!IsShift)
8648 return Res;
8649
8650 // Immediately lower the shift to ensure the constant build vector doesn't
8651 // get converted to a constant pool before the shift is lowered.
8652 return LowerShift(Res, Subtarget, DAG);
8653}
8654
8655/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8656/// functionality to do this, so it's all zeros, all ones, or some derivation
8657/// that is cheap to calculate.
8658 static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8659 SelectionDAG &DAG,
8660 const X86Subtarget &Subtarget) {
8661 MVT VT = Op.getSimpleValueType();
8662
8663 // Vectors containing all zeros can be matched by pxor and xorps.
8664 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8665 return Op;
8666
8667 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8668 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8669 // vpcmpeqd on 256-bit vectors.
8670 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8671 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8672 return Op;
8673
8674 return getOnesVector(VT, DAG, DL);
8675 }
8676
8677 return SDValue();
8678}
8679
8680/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8681/// from a vector of source values and a vector of extraction indices.
8682/// The vectors might be manipulated to match the type of the permute op.
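/// For example, a v16i8 permute on an SSSE3 target becomes a single
/// (X86ISD::PSHUFB SrcVec, IndicesVec), using the index vector directly as
/// the byte shuffle mask.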
8683static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8684 const SDLoc &DL, SelectionDAG &DAG,
8685 const X86Subtarget &Subtarget) {
8686 MVT ShuffleVT = VT;
8687 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8688 unsigned NumElts = VT.getVectorNumElements();
8689 unsigned SizeInBits = VT.getSizeInBits();
8690
8691 // Adjust IndicesVec to match VT size.
8692 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8693 "Illegal variable permute mask size");
8694 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8695 // Narrow/widen the indices vector to the correct size.
8696 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8697 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8698 NumElts * VT.getScalarSizeInBits());
8699 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8700 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8701 SDLoc(IndicesVec), SizeInBits);
8702 // Zero-extend the index elements within the vector.
8703 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8704 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8705 IndicesVT, IndicesVec);
8706 }
8707 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8708
8709 // Handle a SrcVec that doesn't match the VT size.
8710 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8711 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8712 // Handle larger SrcVec by treating it as a larger permute.
8713 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8714 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8715 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8716 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8717 Subtarget, DAG, SDLoc(IndicesVec));
8718 SDValue NewSrcVec =
8719 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8720 if (NewSrcVec)
8721 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8722 return SDValue();
8723 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8724 // Widen smaller SrcVec to match VT.
8725 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8726 } else
8727 return SDValue();
8728 }
8729
8730 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8731 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8732 EVT SrcVT = Idx.getValueType();
8733 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8734 uint64_t IndexScale = 0;
8735 uint64_t IndexOffset = 0;
8736
8737 // If we're scaling a smaller permute op, then we need to repeat the
8738 // indices, scaling and offsetting them as well.
8739 // e.g. v4i32 -> v16i8 (Scale = 4)
8740 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8741 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8742 for (uint64_t i = 0; i != Scale; ++i) {
8743 IndexScale |= Scale << (i * NumDstBits);
8744 IndexOffset |= i << (i * NumDstBits);
8745 }
8746
8747 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8748 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8749 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8750 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8751 return Idx;
8752 };
8753
8754 unsigned Opcode = 0;
8755 switch (VT.SimpleTy) {
8756 default:
8757 break;
8758 case MVT::v16i8:
8759 if (Subtarget.hasSSSE3())
8760 Opcode = X86ISD::PSHUFB;
8761 break;
8762 case MVT::v8i16:
8763 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8764 Opcode = X86ISD::VPERMV;
8765 else if (Subtarget.hasSSSE3()) {
8766 Opcode = X86ISD::PSHUFB;
8767 ShuffleVT = MVT::v16i8;
8768 }
8769 break;
8770 case MVT::v4f32:
8771 case MVT::v4i32:
8772 if (Subtarget.hasAVX()) {
8773 Opcode = X86ISD::VPERMILPV;
8774 ShuffleVT = MVT::v4f32;
8775 } else if (Subtarget.hasSSSE3()) {
8776 Opcode = X86ISD::PSHUFB;
8777 ShuffleVT = MVT::v16i8;
8778 }
8779 break;
8780 case MVT::v2f64:
8781 case MVT::v2i64:
8782 if (Subtarget.hasAVX()) {
8783 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8784 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8785 Opcode = X86ISD::VPERMILPV;
8786 ShuffleVT = MVT::v2f64;
8787 } else if (Subtarget.hasSSE41()) {
8788 // SSE41 can compare v2i64 - select between indices 0 and 1.
8789 return DAG.getSelectCC(
8790 DL, IndicesVec,
8791 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8792 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8793 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8794 ISD::CondCode::SETEQ);
8795 }
8796 break;
8797 case MVT::v32i8:
8798 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8799 Opcode = X86ISD::VPERMV;
8800 else if (Subtarget.hasXOP()) {
8801 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8802 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8803 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8804 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8805 return DAG.getNode(
8806 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8807 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8808 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8809 } else if (Subtarget.hasAVX()) {
8810 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8811 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8812 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8813 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8814 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8815 ArrayRef<SDValue> Ops) {
8816 // Permute Lo and Hi and then select based on index range.
8817 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8818 // care about bit[7] as it's just an index vector.
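// For example, index 20: PSHUFB sees bits[3:0] = 4, so the LoLo and HiHi
// permutes both fetch byte 4 of their 16-byte half, and the (Idx > 15)
// select below picks the HiHi result, i.e. original byte 20.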
8819 SDValue Idx = Ops[2];
8820 EVT VT = Idx.getValueType();
8821 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8822 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8823 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8824 ISD::SETGT);
8825 };
8826 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8827 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8828 PSHUFBBuilder);
8829 }
8830 break;
8831 case MVT::v16i16:
8832 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8833 Opcode = X86ISD::VPERMV;
8834 else if (Subtarget.hasAVX()) {
8835 // Scale to v32i8 and perform as v32i8.
8836 IndicesVec = ScaleIndices(IndicesVec, 2);
8837 return DAG.getBitcast(
8838 VT, createVariablePermute(
8839 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8840 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8841 }
8842 break;
8843 case MVT::v8f32:
8844 case MVT::v8i32:
8845 if (Subtarget.hasAVX2())
8846 Opcode = X86ISD::VPERMV;
8847 else if (Subtarget.hasAVX()) {
8848 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8849 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8850 {0, 1, 2, 3, 0, 1, 2, 3});
8851 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8852 {4, 5, 6, 7, 4, 5, 6, 7});
8853 if (Subtarget.hasXOP())
8854 return DAG.getBitcast(
8855 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8856 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8857 // Permute Lo and Hi and then select based on index range.
8858 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
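// For example, index 6: VPERMILPS reads bits[1:0] = 2, so the LoLo and HiHi
// permutes both fetch lane-relative element 2, and the (index > 3) select
// below picks the HiHi result, which holds original element 6.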
8859 SDValue Res = DAG.getSelectCC(
8860 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8861 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8862 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8863 ISD::SETGT);
8864 return DAG.getBitcast(VT, Res);
8865 }
8866 break;
8867 case MVT::v4i64:
8868 case MVT::v4f64:
8869 if (Subtarget.hasAVX512()) {
8870 if (!Subtarget.hasVLX()) {
8871 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8872 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8873 SDLoc(SrcVec));
8874 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8875 DAG, SDLoc(IndicesVec));
8876 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8877 DAG, Subtarget);
8878 return extract256BitVector(Res, 0, DAG, DL);
8879 }
8880 Opcode = X86ISD::VPERMV;
8881 } else if (Subtarget.hasAVX()) {
8882 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8883 SDValue LoLo =
8884 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8885 SDValue HiHi =
8886 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8887 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8888 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8889 if (Subtarget.hasXOP())
8890 return DAG.getBitcast(
8891 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8892 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8893 // Permute Lo and Hi and then select based on index range.
8894 // This works as VPERMILPD only uses index bit[1] to permute elements.
8895 SDValue Res = DAG.getSelectCC(
8896 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8897 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8898 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8899 ISD::SETGT);
8900 return DAG.getBitcast(VT, Res);
8901 }
8902 break;
8903 case MVT::v64i8:
8904 if (Subtarget.hasVBMI())
8905 Opcode = X86ISD::VPERMV;
8906 break;
8907 case MVT::v32i16:
8908 if (Subtarget.hasBWI())
8909 Opcode = X86ISD::VPERMV;
8910 break;
8911 case MVT::v16f32:
8912 case MVT::v16i32:
8913 case MVT::v8f64:
8914 case MVT::v8i64:
8915 if (Subtarget.hasAVX512())
8916 Opcode = X86ISD::VPERMV;
8917 break;
8918 }
8919 if (!Opcode)
8920 return SDValue();
8921
8922 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8923 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8924 "Illegal variable permute shuffle type");
8925
8926 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8927 if (Scale > 1)
8928 IndicesVec = ScaleIndices(IndicesVec, Scale);
8929
8930 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8931 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8932
8933 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8934 SDValue Res = Opcode == X86ISD::VPERMV
8935 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8936 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8937 return DAG.getBitcast(VT, Res);
8938}
8939
8940// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8941// reasoned to be a permutation of a vector by indices in a non-constant vector.
8942// (build_vector (extract_elt V, (extract_elt I, 0)),
8943// (extract_elt V, (extract_elt I, 1)),
8944// ...
8945// ->
8946// (vpermv I, V)
8947//
8948// TODO: Handle undefs
8949// TODO: Utilize pshufb and zero mask blending to support more efficient
8950// construction of vectors with constant-0 elements.
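// In scalar terms the pattern being matched is roughly (illustrative sketch,
// element count and types are arbitrary):
//
//   for (unsigned i = 0; i != NumElts; ++i)
//     Result[i] = SrcVec[IndicesVec[i]];
//
// i.e. a gather of SrcVec elements by a common, non-constant index vector,
// which createVariablePermute can turn into a single VPERMV/PSHUFB-style op.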
8951static SDValue
8952 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8953 SelectionDAG &DAG,
8954 const X86Subtarget &Subtarget) {
8955 SDValue SrcVec, IndicesVec;
8956 // Check for a match of the permute source vector and permute index elements.
8957 // This is done by checking that the i-th build_vector operand is of the form:
8958 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8959 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8960 SDValue Op = V.getOperand(Idx);
8961 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8962 return SDValue();
8963
8964 // If this is the first extract encountered in V, set the source vector,
8965 // otherwise verify the extract is from the previously defined source
8966 // vector.
8967 if (!SrcVec)
8968 SrcVec = Op.getOperand(0);
8969 else if (SrcVec != Op.getOperand(0))
8970 return SDValue();
8971 SDValue ExtractedIndex = Op->getOperand(1);
8972 // Peek through extends.
8973 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8974 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8975 ExtractedIndex = ExtractedIndex.getOperand(0);
8976 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8977 return SDValue();
8978
8979 // If this is the first extract from the index vector candidate, set the
8980 // indices vector, otherwise verify the extract is from the previously
8981 // defined indices vector.
8982 if (!IndicesVec)
8983 IndicesVec = ExtractedIndex.getOperand(0);
8984 else if (IndicesVec != ExtractedIndex.getOperand(0))
8985 return SDValue();
8986
8987 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8988 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8989 return SDValue();
8990 }
8991
8992 MVT VT = V.getSimpleValueType();
8993 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8994}
8995
8996SDValue
8997X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8998 SDLoc dl(Op);
8999
9000 MVT VT = Op.getSimpleValueType();
9001 MVT EltVT = VT.getVectorElementType();
9002 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
9003 unsigned NumElems = Op.getNumOperands();
9004
9005 // Generate vectors for predicate vectors.
9006 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
9007 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
9008
9009 if (VT.getVectorElementType() == MVT::bf16 &&
9010 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
9011 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
9012
9013 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9014 return VectorCst;
9015
9016 unsigned EVTBits = EltVT.getSizeInBits();
9017 APInt UndefMask = APInt::getZero(NumElems);
9018 APInt FrozenUndefMask = APInt::getZero(NumElems);
9019 APInt ZeroMask = APInt::getZero(NumElems);
9020 APInt NonZeroMask = APInt::getZero(NumElems);
9021 bool IsAllConstants = true;
9022 bool OneUseFrozenUndefs = true;
9023 SmallSet<SDValue, 8> Values;
9024 unsigned NumConstants = NumElems;
9025 for (unsigned i = 0; i < NumElems; ++i) {
9026 SDValue Elt = Op.getOperand(i);
9027 if (Elt.isUndef()) {
9028 UndefMask.setBit(i);
9029 continue;
9030 }
9031 if (ISD::isFreezeUndef(Elt.getNode())) {
9032 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9033 FrozenUndefMask.setBit(i);
9034 continue;
9035 }
9036 Values.insert(Elt);
9037 if (!isIntOrFPConstant(Elt)) {
9038 IsAllConstants = false;
9039 NumConstants--;
9040 }
9041 if (X86::isZeroNode(Elt)) {
9042 ZeroMask.setBit(i);
9043 } else {
9044 NonZeroMask.setBit(i);
9045 }
9046 }
9047
9048 // All undef vector. Return an UNDEF.
9049 if (UndefMask.isAllOnes())
9050 return DAG.getUNDEF(VT);
9051
9052 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9053 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9054 return DAG.getFreeze(DAG.getUNDEF(VT));
9055
9056 // All undef/freeze(undef)/zero vector. Return a zero vector.
9057 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9058 return getZeroVector(VT, Subtarget, DAG, dl);
9059
9060 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9061 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9062 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9063 // and blend the FREEZE-UNDEF operands back in.
9064 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
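// For example, v4i32 (x, freeze(undef), y, freeze(undef)) becomes
//   EltsBV    = (x, undef, y, undef)
//   BlendMask = {0, 5, 2, 7}
// with lanes 1 and 3 taken from the single freeze(undef) splat BUILD_VECTOR.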
9065 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9066 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9067 SmallVector<int, 16> BlendMask(NumElems, -1);
9068 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9069 for (unsigned i = 0; i < NumElems; ++i) {
9070 if (UndefMask[i]) {
9071 BlendMask[i] = -1;
9072 continue;
9073 }
9074 BlendMask[i] = i;
9075 if (!FrozenUndefMask[i])
9076 Elts[i] = Op.getOperand(i);
9077 else
9078 BlendMask[i] += NumElems;
9079 }
9080 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9081 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9082 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9083 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9084 }
9085
9086 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9087
9088 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9089 // be better off lowering to a smaller build vector and padding with
9090 // undef/zero.
9091 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9092 !isFoldableUseOfShuffle(BV)) {
9093 unsigned UpperElems = NumElems / 2;
9094 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9095 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9096 if (NumUpperUndefsOrZeros >= UpperElems) {
9097 if (VT.is512BitVector() &&
9098 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9099 UpperElems = NumElems - (NumElems / 4);
9100 // If freeze(undef) is in any upper elements, force to zero.
9101 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9102 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9103 SDValue NewBV =
9104 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9105 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9106 }
9107 }
9108
9109 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9110 return AddSub;
9111 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9112 return HorizontalOp;
9113 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9114 return Broadcast;
9115 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9116 return BitOp;
9117
9118 unsigned NumZero = ZeroMask.popcount();
9119 unsigned NumNonZero = NonZeroMask.popcount();
9120
9121 // If we are inserting one variable into a vector of non-zero constants, try
9122 // to avoid loading each constant element as a scalar. Load the constants as a
9123 // vector and then insert the variable scalar element. If insertion is not
9124 // supported, fall back to a shuffle to get the scalar blended with the
9125 // constants. Insertion into a zero vector is handled as a special-case
9126 // somewhere below here.
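// For example, (1.0, 2.0, x, 4.0) becomes a constant-pool load of
// <1.0, 2.0, undef, 4.0> followed by an insert of x at index 2 (or a shuffle
// blend when the insertion point lies in the upper half of a wide vector).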
9127 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9128 FrozenUndefMask.isZero() &&
9129 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9130 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9131 // Create an all-constant vector. The variable element in the old
9132 // build vector is replaced by undef in the constant vector. Save the
9133 // variable scalar element and its index for use in the insertelement.
9134 LLVMContext &Context = *DAG.getContext();
9135 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9136 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9137 SDValue VarElt;
9138 SDValue InsIndex;
9139 for (unsigned i = 0; i != NumElems; ++i) {
9140 SDValue Elt = Op.getOperand(i);
9141 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9142 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9143 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9144 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9145 else if (!Elt.isUndef()) {
9146 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9147 "Expected one variable element in this vector");
9148 VarElt = Elt;
9149 InsIndex = DAG.getVectorIdxConstant(i, dl);
9150 }
9151 }
9152 Constant *CV = ConstantVector::get(ConstVecOps);
9153 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9154
9155 // The constants we just created may not be legal (eg, floating point). We
9156 // must lower the vector right here because we can not guarantee that we'll
9157 // legalize it before loading it. This is also why we could not just create
9158 // a new build vector here. If the build vector contains illegal constants,
9159 // it could get split back up into a series of insert elements.
9160 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9161 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9162 MachinePointerInfo MPI =
9163 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9164 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9165 unsigned InsertC = InsIndex->getAsZExtVal();
9166 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9167 if (InsertC < NumEltsInLow128Bits)
9168 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9169
9170 // There's no good way to insert into the high elements of a >128-bit
9171 // vector, so use shuffles to avoid an extract/insert sequence.
9172 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9173 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9174 SmallVector<int, 8> ShuffleMask;
9175 unsigned NumElts = VT.getVectorNumElements();
9176 for (unsigned i = 0; i != NumElts; ++i)
9177 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9178 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9179 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9180 }
9181
9182 // Special case for single non-zero, non-undef, element.
9183 if (NumNonZero == 1) {
9184 unsigned Idx = NonZeroMask.countr_zero();
9185 SDValue Item = Op.getOperand(Idx);
9186
9187 // If we have a constant or non-constant insertion into the low element of
9188 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9189 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9190 // depending on what the source datatype is.
9191 if (Idx == 0) {
9192 if (NumZero == 0)
9193 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9194
9195 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9196 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9197 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9198 assert((VT.is128BitVector() || VT.is256BitVector() ||
9199 VT.is512BitVector()) &&
9200 "Expected an SSE value type!");
9201 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9202 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9203 // zero vector.
9204 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9205 }
9206
9207 // We can't directly insert an i8 or i16 into a vector, so zero extend
9208 // it to i32 first.
9209 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9210 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9211 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9212 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9213 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9214 return DAG.getBitcast(VT, Item);
9215 }
9216 }
9217
9218 // Is it a vector logical left shift?
9219 if (NumElems == 2 && Idx == 1 &&
9220 X86::isZeroNode(Op.getOperand(0)) &&
9221 !X86::isZeroNode(Op.getOperand(1))) {
9222 unsigned NumBits = VT.getSizeInBits();
9223 return getVShift(true, VT,
9224 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9225 VT, Op.getOperand(1)),
9226 NumBits/2, DAG, *this, dl);
9227 }
9228
9229 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9230 return SDValue();
9231
9232 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9233 // is a non-constant being inserted into an element other than the low one,
9234 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9235 // movd/movss) to move this into the low element, then shuffle it into
9236 // place.
9237 if (EVTBits == 32) {
9238 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9239 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9240 }
9241 }
9242
9243 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9244 if (Values.size() == 1) {
9245 if (EVTBits == 32) {
9246 // Instead of a shuffle like this:
9247 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9248 // Check if it's possible to issue this instead.
9249 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
9250 unsigned Idx = NonZeroMask.countr_zero();
9251 SDValue Item = Op.getOperand(Idx);
9252 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9253 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9254 }
9255 return SDValue();
9256 }
9257
9258 // A vector full of immediates; various special cases are already
9259 // handled, so this is best done with a single constant-pool load.
9260 if (IsAllConstants)
9261 return SDValue();
9262
9263 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9264 return V;
9265
9266 // See if we can use a vector load to get all of the elements.
9267 {
9268 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9269 if (SDValue LD =
9270 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9271 return LD;
9272 }
9273
9274 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9275 // build_vector and broadcast it.
9276 // TODO: We could probably generalize this more.
9277 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9278 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9279 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9280 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9281 // Make sure all the even/odd operands match.
9282 for (unsigned i = 2; i != NumElems; ++i)
9283 if (Ops[i % 2] != Op.getOperand(i))
9284 return false;
9285 return true;
9286 };
9287 if (CanSplat(Op, NumElems, Ops)) {
9288 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9289 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9290 // Create a new build vector and cast to v2i64/v2f64.
9291 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9292 DAG.getBuildVector(NarrowVT, dl, Ops));
9293 // Broadcast from v2i64/v2f64 and cast to final VT.
9294 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9295 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9296 NewBV));
9297 }
9298 }
9299
9300 // For AVX-length vectors, build the individual 128-bit pieces and use
9301 // shuffles to put them in place.
9302 if (VT.getSizeInBits() > 128) {
9303 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9304
9305 // Build both the lower and upper subvector.
9306 SDValue Lower =
9307 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9308 SDValue Upper = DAG.getBuildVector(
9309 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9310
9311 // Recreate the wider vector with the lower and upper part.
9312 return concatSubVectors(Lower, Upper, DAG, dl);
9313 }
9314
9315 // Let legalizer expand 2-wide build_vectors.
9316 if (EVTBits == 64) {
9317 if (NumNonZero == 1) {
9318 // One half is zero or undef.
9319 unsigned Idx = NonZeroMask.countr_zero();
9320 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9321 Op.getOperand(Idx));
9322 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9323 }
9324 return SDValue();
9325 }
9326
9327 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9328 if (EVTBits == 8 && NumElems == 16)
9329 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9330 NumZero, DAG, Subtarget))
9331 return V;
9332
9333 if (EltVT == MVT::i16 && NumElems == 8)
9334 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9335 NumZero, DAG, Subtarget))
9336 return V;
9337
9338 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
9339 if (EVTBits == 32 && NumElems == 4)
9340 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9341 return V;
9342
9343 // If element VT is == 32 bits, turn it into a number of shuffles.
9344 if (NumElems == 4 && NumZero > 0) {
9345 SmallVector<SDValue, 8> Ops(NumElems);
9346 for (unsigned i = 0; i < 4; ++i) {
9347 bool isZero = !NonZeroMask[i];
9348 if (isZero)
9349 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9350 else
9351 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9352 }
9353
9354 for (unsigned i = 0; i < 2; ++i) {
9355 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9356 default: llvm_unreachable("Unexpected NonZero count");
9357 case 0:
9358 Ops[i] = Ops[i*2]; // Must be a zero vector.
9359 break;
9360 case 1:
9361 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9362 break;
9363 case 2:
9364 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9365 break;
9366 case 3:
9367 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9368 break;
9369 }
9370 }
9371
9372 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9373 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9374 int MaskVec[] = {
9375 Reverse1 ? 1 : 0,
9376 Reverse1 ? 0 : 1,
9377 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9378 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9379 };
9380 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9381 }
9382
9383 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9384
9385 // Check for a build vector from mostly shuffle plus few inserting.
9386 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9387 return Sh;
9388
9389 // For SSE 4.1, use insertps to put the high elements into the low element.
9390 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9391 SDValue Result;
9392 if (!Op.getOperand(0).isUndef())
9393 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9394 else
9395 Result = DAG.getUNDEF(VT);
9396
9397 for (unsigned i = 1; i < NumElems; ++i) {
9398 if (Op.getOperand(i).isUndef()) continue;
9399 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9400 Op.getOperand(i), DAG.getVectorIdxConstant(i, dl));
9401 }
9402 return Result;
9403 }
9404
9405 // Otherwise, expand into a number of unpckl*, start by extending each of
9406 // our (non-undef) elements to the full vector width with the element in the
9407 // bottom slot of the vector (which generates no code for SSE).
9408 SmallVector<SDValue, 8> Ops(NumElems);
9409 for (unsigned i = 0; i < NumElems; ++i) {
9410 if (!Op.getOperand(i).isUndef())
9411 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9412 else
9413 Ops[i] = DAG.getUNDEF(VT);
9414 }
9415
9416 // Next, we iteratively mix elements, e.g. for v4f32:
9417 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9418 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9419 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9420 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9421 // Generate scaled UNPCKL shuffle mask.
9422 SmallVector<int, 16> Mask;
9423 for(unsigned i = 0; i != Scale; ++i)
9424 Mask.push_back(i);
9425 for (unsigned i = 0; i != Scale; ++i)
9426 Mask.push_back(NumElems+i);
9427 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9428
9429 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9430 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9431 }
9432 return Ops[0];
9433}
9434
9435// 256-bit AVX can use the vinsertf128 instruction
9436// to create 256-bit vectors from two other 128-bit ones.
9437// TODO: Detect subvector broadcast here instead of DAG combine?
9438 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9439 const X86Subtarget &Subtarget) {
9440 SDLoc dl(Op);
9441 MVT ResVT = Op.getSimpleValueType();
9442
9443 assert((ResVT.is256BitVector() ||
9444 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9445
9446 unsigned NumOperands = Op.getNumOperands();
9447 unsigned NumFreezeUndef = 0;
9448 unsigned NumZero = 0;
9449 unsigned NumNonZero = 0;
9450 unsigned NonZeros = 0;
9451 for (unsigned i = 0; i != NumOperands; ++i) {
9452 SDValue SubVec = Op.getOperand(i);
9453 if (SubVec.isUndef())
9454 continue;
9455 if (ISD::isFreezeUndef(SubVec.getNode())) {
9456 // If the freeze(undef) has multiple uses then we must fold to zero.
9457 if (SubVec.hasOneUse())
9458 ++NumFreezeUndef;
9459 else
9460 ++NumZero;
9461 }
9462 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9463 ++NumZero;
9464 else {
9465 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9466 NonZeros |= 1 << i;
9467 ++NumNonZero;
9468 }
9469 }
9470
9471 // If we have more than 2 non-zeros, build each half separately.
9472 if (NumNonZero > 2) {
9473 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9474 ArrayRef<SDUse> Ops = Op->ops();
9475 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9476 Ops.slice(0, NumOperands/2));
9477 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9478 Ops.slice(NumOperands/2));
9479 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9480 }
9481
9482 // Otherwise, build it up through insert_subvectors.
9483 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9484 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9485 : DAG.getUNDEF(ResVT));
9486
9487 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9488 unsigned NumSubElems = SubVT.getVectorNumElements();
9489 for (unsigned i = 0; i != NumOperands; ++i) {
9490 if ((NonZeros & (1 << i)) == 0)
9491 continue;
9492
9493 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(i),
9494 DAG.getVectorIdxConstant(i * NumSubElems, dl));
9495 }
9496
9497 return Vec;
9498}
9499
9500// Returns true if the given node is a type promotion (by concatenating i1
9501// zeros) of the result of a node that already zeros all upper bits of
9502// k-register.
9503// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9504 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9505 const X86Subtarget &Subtarget,
9506 SelectionDAG & DAG) {
9507 SDLoc dl(Op);
9508 MVT ResVT = Op.getSimpleValueType();
9509 unsigned NumOperands = Op.getNumOperands();
9510
9511 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9512 "Unexpected number of operands in CONCAT_VECTORS");
9513
9514 uint64_t Zeros = 0;
9515 uint64_t NonZeros = 0;
9516 for (unsigned i = 0; i != NumOperands; ++i) {
9517 SDValue SubVec = Op.getOperand(i);
9518 if (SubVec.isUndef())
9519 continue;
9520 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9521 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9522 Zeros |= (uint64_t)1 << i;
9523 else
9524 NonZeros |= (uint64_t)1 << i;
9525 }
9526
9527 unsigned NumElems = ResVT.getVectorNumElements();
9528
9529 // If we are inserting non-zero vector and there are zeros in LSBs and undef
9530 // in the MSBs we need to emit a KSHIFTL. The generic lowering to
9531 // insert_subvector will give us two kshifts.
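// For example, a v16i1 concat of (zero, X, undef, undef) with v4i1 operands:
// widen X to the shift type, KSHIFTL it by Idx * SubVecNumElts = 4, and
// extract the low v16i1, rather than paying for two kshifts.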
9532 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9533 Log2_64(NonZeros) != NumOperands - 1) {
9534 unsigned Idx = Log2_64(NonZeros);
9535 SDValue SubVec = Op.getOperand(Idx);
9536 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9537 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9538 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9539 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9540 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9542 DAG.getVectorIdxConstant(0, dl));
9543 }
9544
9545 // If there are zero or one non-zeros we can handle this very simply.
9546 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9547 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9548 if (!NonZeros)
9549 return Vec;
9550 unsigned Idx = Log2_64(NonZeros);
9551 SDValue SubVec = Op.getOperand(Idx);
9552 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9553 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9554 DAG.getVectorIdxConstant(Idx * SubVecNumElts, dl));
9555 }
9556
9557 if (NumOperands > 2) {
9558 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9559 ArrayRef<SDUse> Ops = Op->ops();
9560 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9561 Ops.slice(0, NumOperands / 2));
9562 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9563 Ops.slice(NumOperands / 2));
9564 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9565 }
9566
9567 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9568
9569 if (ResVT.getVectorNumElements() >= 16)
9570 return Op; // The operation is legal with KUNPCK
9571
9572 SDValue Vec =
9573 DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, DAG.getUNDEF(ResVT),
9574 Op.getOperand(0), DAG.getVectorIdxConstant(0, dl));
9575 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9576 DAG.getVectorIdxConstant(NumElems / 2, dl));
9577}
9578
9579 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9580 const X86Subtarget &Subtarget,
9581 SelectionDAG &DAG) {
9582 MVT VT = Op.getSimpleValueType();
9583 if (VT.getVectorElementType() == MVT::i1)
9584 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9585
9586 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9587 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9588 Op.getNumOperands() == 4)));
9589
9590 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9591 // from two other 128-bit ones.
9592
9593 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9594 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9595}
9596
9597//===----------------------------------------------------------------------===//
9598// Vector shuffle lowering
9599//
9600// This is an experimental code path for lowering vector shuffles on x86. It is
9601// designed to handle arbitrary vector shuffles and blends, gracefully
9602// degrading performance as necessary. It works hard to recognize idiomatic
9603// shuffles and lower them to optimal instruction patterns without leaving
9604// a framework that allows reasonably efficient handling of all vector shuffle
9605// patterns.
9606//===----------------------------------------------------------------------===//
9607
9608/// Tiny helper function to identify a no-op mask.
9609///
9610/// This is a somewhat boring predicate function. It checks whether the mask
9611/// array input, which is assumed to be a single-input shuffle mask of the kind
9612/// used by the X86 shuffle instructions (not a fully general
9613/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9614/// in-place shuffle are 'no-op's.
9615 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9616 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9617 assert(Mask[i] >= -1 && "Out of bound mask element!");
9618 if (Mask[i] >= 0 && Mask[i] != i)
9619 return false;
9620 }
9621 return true;
9622}
9623
9624/// Test whether there are elements crossing LaneSizeInBits lanes in this
9625/// shuffle mask.
9626///
9627/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9628/// and we routinely test for these.
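/// For example, the v4i64 mask <2, 3, 0, 1> swaps the two 128-bit halves and
/// is lane-crossing, whereas <1, 0, 3, 2> only swaps elements within each
/// 128-bit lane and is not.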
9629static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9630 unsigned ScalarSizeInBits,
9631 ArrayRef<int> Mask) {
9632 assert(LaneSizeInBits && ScalarSizeInBits &&
9633 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9634 "Illegal shuffle lane size");
9635 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9636 int Size = Mask.size();
9637 for (int i = 0; i < Size; ++i)
9638 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9639 return true;
9640 return false;
9641}
9642
9643/// Test whether there are elements crossing 128-bit lanes in this
9644/// shuffle mask.
9645 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9646 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9647}
9648
9649/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9650/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9651/// better support 'repeated mask + lane permute' style shuffles.
9652static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9653 unsigned ScalarSizeInBits,
9654 ArrayRef<int> Mask) {
9655 assert(LaneSizeInBits && ScalarSizeInBits &&
9656 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9657 "Illegal shuffle lane size");
9658 int NumElts = Mask.size();
9659 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9660 int NumLanes = NumElts / NumEltsPerLane;
9661 if (NumLanes > 1) {
9662 for (int i = 0; i != NumLanes; ++i) {
9663 int SrcLane = -1;
9664 for (int j = 0; j != NumEltsPerLane; ++j) {
9665 int M = Mask[(i * NumEltsPerLane) + j];
9666 if (M < 0)
9667 continue;
9668 int Lane = (M % NumElts) / NumEltsPerLane;
9669 if (SrcLane >= 0 && SrcLane != Lane)
9670 return true;
9671 SrcLane = Lane;
9672 }
9673 }
9674 }
9675 return false;
9676}
9677
9678/// Test whether a shuffle mask is equivalent within each sub-lane.
9679///
9680/// This checks a shuffle mask to see if it is performing the same
9681/// lane-relative shuffle in each sub-lane. This trivially implies
9682/// that it is also not lane-crossing. It may however involve a blend from the
9683/// same lane of a second vector.
9684///
9685/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9686/// non-trivial to compute in the face of undef lanes. The representation is
9687/// suitable for use with existing 128-bit shuffles as entries from the second
9688/// vector have been remapped to [LaneSize, 2*LaneSize).
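/// For example, the v8f32 mask <0, 9, 2, 11, 4, 13, 6, 15> repeats per
/// 128-bit lane and yields RepeatedMask = <0, 5, 2, 7>, with second-vector
/// entries remapped into [4, 8).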
9689static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9690 ArrayRef<int> Mask,
9691 SmallVectorImpl<int> &RepeatedMask) {
9692 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9693 RepeatedMask.assign(LaneSize, -1);
9694 int Size = Mask.size();
9695 for (int i = 0; i < Size; ++i) {
9696 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9697 if (Mask[i] < 0)
9698 continue;
9699 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9700 // This entry crosses lanes, so there is no way to model this shuffle.
9701 return false;
9702
9703 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9704 // Adjust second vector indices to start at LaneSize instead of Size.
9705 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9706 : Mask[i] % LaneSize + LaneSize;
9707 if (RepeatedMask[i % LaneSize] < 0)
9708 // This is the first non-undef entry in this slot of a 128-bit lane.
9709 RepeatedMask[i % LaneSize] = LocalM;
9710 else if (RepeatedMask[i % LaneSize] != LocalM)
9711 // Found a mismatch with the repeated mask.
9712 return false;
9713 }
9714 return true;
9715}
9716
9717/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9718static bool
9719 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9720 SmallVectorImpl<int> &RepeatedMask) {
9721 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9722}
9723
9724static bool
9725 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9726 SmallVector<int, 32> RepeatedMask;
9727 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9728}
9729
9730/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9731static bool
9732 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9733 SmallVectorImpl<int> &RepeatedMask) {
9734 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9735}
9736
9737/// Test whether a target shuffle mask is equivalent within each sub-lane.
9738/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9739static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9740 unsigned EltSizeInBits,
9741 ArrayRef<int> Mask,
9742 SmallVectorImpl<int> &RepeatedMask) {
9743 int LaneSize = LaneSizeInBits / EltSizeInBits;
9744 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9745 int Size = Mask.size();
9746 for (int i = 0; i < Size; ++i) {
9747 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9748 if (Mask[i] == SM_SentinelUndef)
9749 continue;
9750 if (Mask[i] == SM_SentinelZero) {
9751 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9752 return false;
9753 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9754 continue;
9755 }
9756 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9757 // This entry crosses lanes, so there is no way to model this shuffle.
9758 return false;
9759
9760 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9761 // later vector indices to start at multiples of LaneSize instead of Size.
9762 int LaneM = Mask[i] / Size;
9763 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9764 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9765 // This is the first non-undef entry in this slot of a 128-bit lane.
9766 RepeatedMask[i % LaneSize] = LocalM;
9767 else if (RepeatedMask[i % LaneSize] != LocalM)
9768 // Found a mismatch with the repeated mask.
9769 return false;
9770 }
9771 return true;
9772}
9773
9774/// Test whether a target shuffle mask is equivalent within each sub-lane.
9775/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9776static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9777 ArrayRef<int> Mask,
9778 SmallVectorImpl<int> &RepeatedMask) {
9779 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9780 Mask, RepeatedMask);
9781}
9782
9783/// Checks whether the vector elements referenced by two shuffle masks are
9784/// equivalent.
9785static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9786 int Idx, int ExpectedIdx) {
9787 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9788 ExpectedIdx < MaskSize && "Out of range element index");
9789 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9790 return false;
9791
9792 switch (Op.getOpcode()) {
9793 case ISD::BUILD_VECTOR:
9794 // If the values are build vectors, we can look through them to find
9795 // equivalent inputs that make the shuffles equivalent.
9796 // TODO: Handle MaskSize != Op.getNumOperands()?
9797 if (MaskSize == (int)Op.getNumOperands() &&
9798 MaskSize == (int)ExpectedOp.getNumOperands())
9799 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9800 break;
9801 case X86ISD::VBROADCAST:
9802 case X86ISD::VBROADCAST_LOAD:
9803 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9804 return (Op == ExpectedOp &&
9805 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9806 case X86ISD::HADD:
9807 case X86ISD::HSUB:
9808 case X86ISD::FHADD:
9809 case X86ISD::FHSUB:
9810 case X86ISD::PACKSS:
9811 case X86ISD::PACKUS:
9812 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9813 // TODO: Handle MaskSize != NumElts?
9814 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9815 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9816 MVT VT = Op.getSimpleValueType();
9817 int NumElts = VT.getVectorNumElements();
9818 if (MaskSize == NumElts) {
9819 int NumLanes = VT.getSizeInBits() / 128;
9820 int NumEltsPerLane = NumElts / NumLanes;
9821 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9822 bool SameLane =
9823 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9824 bool SameElt =
9825 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9826 return SameLane && SameElt;
9827 }
9828 }
9829 break;
9830 }
9831
9832 return false;
9833}
9834
9835/// Checks whether a shuffle mask is equivalent to an explicit list of
9836/// arguments.
9837///
9838/// This is a fast way to test a shuffle mask against a fixed pattern:
9839///
9840 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9841///
9842/// It returns true if the mask is exactly as wide as the argument list, and
9843/// each element of the mask is either -1 (signifying undef) or the value given
9844/// in the argument.
9845static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9846 SDValue V1 = SDValue(),
9847 SDValue V2 = SDValue()) {
9848 int Size = Mask.size();
9849 if (Size != (int)ExpectedMask.size())
9850 return false;
9851
9852 for (int i = 0; i < Size; ++i) {
9853 assert(Mask[i] >= -1 && "Out of bound mask element!");
9854 int MaskIdx = Mask[i];
9855 int ExpectedIdx = ExpectedMask[i];
9856 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9857 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9858 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9859 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9860 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9861 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9862 return false;
9863 }
9864 }
9865 return true;
9866}
9867
9868/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9869///
9870/// The masks must be exactly the same width.
9871///
9872/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9873/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9874///
9875/// SM_SentinelZero is accepted as a valid negative index but must match in
9876/// both, or via a known bits test.
9877 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9878 ArrayRef<int> ExpectedMask,
9879 const SelectionDAG &DAG,
9880 SDValue V1 = SDValue(),
9881 SDValue V2 = SDValue()) {
9882 int Size = Mask.size();
9883 if (Size != (int)ExpectedMask.size())
9884 return false;
9885 assert(llvm::all_of(ExpectedMask,
9886 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9887 "Illegal target shuffle mask");
9888
9889 // Check for out-of-range target shuffle mask indices.
9890 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9891 return false;
9892
9893 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9894 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9895 !V1.getValueType().isVector()))
9896 V1 = SDValue();
9897 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9898 !V2.getValueType().isVector()))
9899 V2 = SDValue();
9900
9901 APInt ZeroV1 = APInt::getZero(Size);
9902 APInt ZeroV2 = APInt::getZero(Size);
9903
9904 for (int i = 0; i < Size; ++i) {
9905 int MaskIdx = Mask[i];
9906 int ExpectedIdx = ExpectedMask[i];
9907 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9908 continue;
9909 if (MaskIdx == SM_SentinelZero) {
9910 // If we need this expected index to be a zero element, then update the
9911 // relevant zero mask and perform the known bits at the end to minimize
9912 // repeated computes.
9913 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9914 if (ExpectedV &&
9915 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9916 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9917 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9918 ZeroMask.setBit(BitIdx);
9919 continue;
9920 }
9921 }
9922 if (MaskIdx >= 0) {
9923 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9924 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9925 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9926 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9927 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9928 continue;
9929 }
9930 return false;
9931 }
9932 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9933 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9934}
9935
9936// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9937// instructions.
9938 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9939 const SelectionDAG &DAG) {
9940 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9941 return false;
9942
9943 SmallVector<int, 8> Unpcklwd;
9944 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9945 /* Unary = */ false);
9946 SmallVector<int, 8> Unpckhwd;
9947 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9948 /* Unary = */ false);
9949 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9950 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9951 return IsUnpackwdMask;
9952}
9953
9954 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9955 const SelectionDAG &DAG) {
9956 // Create 128-bit vector type based on mask size.
9957 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9958 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9959
9960 // We can't assume a canonical shuffle mask, so try the commuted version too.
9961 SmallVector<int, 4> CommutedMask(Mask);
9962 ShuffleVectorSDNode::commuteMask(CommutedMask);
9963 
9964 // Match any of unary/binary or low/high.
9965 for (unsigned i = 0; i != 4; ++i) {
9966 SmallVector<int, 16> UnpackMask;
9967 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9968 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9969 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9970 return true;
9971 }
9972 return false;
9973}
9974
9975/// Return true if a shuffle mask chooses elements identically in its top and
9976/// bottom halves. For example, any splat mask has the same top and bottom
9977/// halves. If an element is undefined in only one half of the mask, the halves
9978/// are not considered identical.
9979 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9980 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9981 unsigned HalfSize = Mask.size() / 2;
9982 for (unsigned i = 0; i != HalfSize; ++i) {
9983 if (Mask[i] != Mask[i + HalfSize])
9984 return false;
9985 }
9986 return true;
9987}
9988
9989/// Get a 4-lane 8-bit shuffle immediate for a mask.
9990///
9991/// This helper function produces an 8-bit shuffle immediate corresponding to
9992/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9993/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9994/// example.
9995///
9996/// NB: We rely heavily on "undef" masks preserving the input lane.
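/// For example, the identity mask <0, 1, 2, 3> encodes as 0xE4 (0b11100100)
/// and the reversal <3, 2, 1, 0> as 0x1B, with element i stored in bits
/// [2*i+1 : 2*i] of the immediate.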
9997static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
10000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
10001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
10002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
10003
10004 // If the mask only uses one non-undef element, then fully 'splat' it to
10005 // improve later broadcast matching.
10006 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10007 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
10008
10009 int FirstElt = Mask[FirstIndex];
10010 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
10011 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10012
10013 unsigned Imm = 0;
10014 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10015 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10016 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10017 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10018 return Imm;
10019}
10020
10021 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10022 SelectionDAG &DAG) {
10023 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10024}
10025
10026// Canonicalize SHUFPD mask to improve chances of further folding.
10027// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
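// For example, the v2f64 mask <1, 0> encodes as 0b01, while a defined splat
// such as <1, 1> is fully splatted to 0b11 (bit I holds the lo/hi selector
// for element I).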
10028static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10029 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10030 "Unexpected SHUFPD mask size");
10031 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10032 "Unexpected SHUFPD mask elements");
10033
10034 // If the mask only uses one non-undef element, then fully 'splat' it to
10035 // improve later broadcast matching.
10036 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10037 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10038 "All undef shuffle mask");
10039
10040 int FirstElt = Mask[FirstIndex];
10041 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10042 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10043 unsigned Imm = 0;
10044 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10045 Imm |= FirstElt << I;
10046 return Imm;
10047 }
10048
10049 // Attempt to keep any undef elements in place to improve chances of the
10050 // shuffle becoming a (commutative) blend.
10051 unsigned Imm = 0;
10052 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10053 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10054
10055 return Imm;
10056}
10057
10058 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10059 SelectionDAG &DAG) {
10060 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10061}
10062
10063 // The shuffle result takes the form:
10064 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
10065 // Each element of Zeroable corresponds to a particular element of Mask,
10066 // as described in the computeZeroableShuffleElements function.
10067 //
10068 // The function looks for a sub-mask whose nonzero elements are in
10069 // increasing order. If such a sub-mask exists, the function returns true.
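// For example, with a v4i32 mask <0, z, 1, z> (z marking elements known to be
// zero), the nonzero elements 0, 1 are in increasing order, so the shuffle
// can be lowered to a VEXPAND of source elements 0 and 1 into result lanes 0
// and 2 with zeroing elsewhere.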
10070static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10071 ArrayRef<int> Mask, const EVT &VectorType,
10072 bool &IsZeroSideLeft) {
10073 int NextElement = -1;
10074 // Check if the Mask's nonzero elements are in increasing order.
10075 for (int i = 0, e = Mask.size(); i < e; i++) {
10076 // Checks that the mask's zero elements are built from only zeros.
10077 assert(Mask[i] >= -1 && "Out of bound mask element!");
10078 if (Mask[i] < 0)
10079 return false;
10080 if (Zeroable[i])
10081 continue;
10082 // Find the lowest non zero element
10083 if (NextElement < 0) {
10084 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10085 IsZeroSideLeft = NextElement != 0;
10086 }
10087 // Exit if the mask's non zero elements are not in increasing order.
10088 if (NextElement != Mask[i])
10089 return false;
10090 NextElement++;
10091 }
10092 return true;
10093}
10094
10095/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
10096 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10097 ArrayRef<int> Mask, SDValue V1,
10098 SDValue V2, const APInt &Zeroable,
10099 const X86Subtarget &Subtarget,
10100 SelectionDAG &DAG) {
10101 int Size = Mask.size();
10102 int LaneSize = 128 / VT.getScalarSizeInBits();
10103 const int NumBytes = VT.getSizeInBits() / 8;
10104 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10105
10106 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10107 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10108 (Subtarget.hasBWI() && VT.is512BitVector()));
10109
10110 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10111 // Sign bit set in i8 mask means zero element.
10112 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10113
10114 SDValue V;
10115 for (int i = 0; i < NumBytes; ++i) {
10116 int M = Mask[i / NumEltBytes];
10117 if (M < 0) {
10118 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10119 continue;
10120 }
10121 if (Zeroable[i / NumEltBytes]) {
10122 PSHUFBMask[i] = ZeroMask;
10123 continue;
10124 }
10125
10126 // We can only use a single input of V1 or V2.
10127 SDValue SrcV = (M >= Size ? V2 : V1);
10128 if (V && V != SrcV)
10129 return SDValue();
10130 V = SrcV;
10131 M %= Size;
10132
10133 // PSHUFB can't cross lanes, ensure this doesn't happen.
10134 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10135 return SDValue();
10136
10137 M = M % LaneSize;
10138 M = M * NumEltBytes + (i % NumEltBytes);
10139 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10140 }
10141 assert(V && "Failed to find a source input");
10142
10143 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10144 return DAG.getBitcast(
10145 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10146 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10147}
10148
10149static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10150 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10151 const SDLoc &dl);
10152
10153// X86 has dedicated shuffle that can be lowered to VEXPAND
10154 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10155 SDValue V2, ArrayRef<int> Mask,
10156 const APInt &Zeroable,
10157 const X86Subtarget &Subtarget,
10158 SelectionDAG &DAG) {
10159 bool IsLeftZeroSide = true;
10160 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10161 IsLeftZeroSide))
10162 return SDValue();
10163 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10164 MVT IntegerType =
10165 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10166 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10167 unsigned NumElts = VT.getVectorNumElements();
10168 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10169 "Unexpected number of vector elements");
10170 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10171 Subtarget, DAG, DL);
10172 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10173 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10174 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10175}
10176
10177static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10178 unsigned &UnpackOpcode, bool IsUnary,
10179 ArrayRef<int> TargetMask, const SDLoc &DL,
10180 SelectionDAG &DAG,
10181 const X86Subtarget &Subtarget) {
10182 int NumElts = VT.getVectorNumElements();
10183
10184 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10185 for (int i = 0; i != NumElts; i += 2) {
10186 int M1 = TargetMask[i + 0];
10187 int M2 = TargetMask[i + 1];
10188 Undef1 &= (SM_SentinelUndef == M1);
10189 Undef2 &= (SM_SentinelUndef == M2);
10190 Zero1 &= isUndefOrZero(M1);
10191 Zero2 &= isUndefOrZero(M2);
10192 }
10193 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10194 "Zeroable shuffle detected");
10195
10196 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10197 SmallVector<int, 64> Unpckl, Unpckh;
10198 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10199 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10200 (IsUnary ? V1 : V2))) {
10201 UnpackOpcode = X86ISD::UNPCKL;
10202 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10203 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10204 return true;
10205 }
10206
10207 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10208 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10209 (IsUnary ? V1 : V2))) {
10210 UnpackOpcode = X86ISD::UNPCKH;
10211 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10212 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10213 return true;
10214 }
10215
10216 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
10217 if (IsUnary && (Zero1 || Zero2)) {
10218 // Don't bother if we can blend instead.
10219 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10220 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10221 return false;
10222
10223 bool MatchLo = true, MatchHi = true;
10224 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10225 int M = TargetMask[i];
10226
10227 // Ignore if the input is known to be zero or the index is undef.
10228 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10229 (M == SM_SentinelUndef))
10230 continue;
10231
10232 MatchLo &= (M == Unpckl[i]);
10233 MatchHi &= (M == Unpckh[i]);
10234 }
10235
10236 if (MatchLo || MatchHi) {
10237 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10238 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10239 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10240 return true;
10241 }
10242 }
10243
10244 // If a binary shuffle, commute and try again.
10245 if (!IsUnary) {
10246 ShuffleVectorSDNode::commuteMask(Unpckl);
10247 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10248 UnpackOpcode = X86ISD::UNPCKL;
10249 std::swap(V1, V2);
10250 return true;
10251 }
10252
10253 ShuffleVectorSDNode::commuteMask(Unpckh);
10254 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10255 UnpackOpcode = X86ISD::UNPCKH;
10256 std::swap(V1, V2);
10257 return true;
10258 }
10259 }
10260
10261 return false;
10262}
10263
10264// X86 has dedicated unpack instructions that can handle specific blend
10265// operations: UNPCKH and UNPCKL.
10266 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10267                                      SDValue V2, ArrayRef<int> Mask,
10268 SelectionDAG &DAG) {
10269 SmallVector<int, 8> Unpckl;
10270 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10271 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10272 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10273
10274 SmallVector<int, 8> Unpckh;
10275 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10276 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10277 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10278
10279 // Commute and try again.
10280   ShuffleVectorSDNode::commuteMask(Unpckl);
10281   if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10282 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10283
10284   ShuffleVectorSDNode::commuteMask(Unpckh);
10285   if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10286 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10287
10288 return SDValue();
10289}
10290
10291/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10292/// followed by unpack 256-bit.
10293 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10294                                         SDValue V2, ArrayRef<int> Mask,
10295 SelectionDAG &DAG) {
10296 SmallVector<int, 32> Unpckl, Unpckh;
10297 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10298 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10299
10300 unsigned UnpackOpcode;
10301 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10302 UnpackOpcode = X86ISD::UNPCKL;
10303 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10304 UnpackOpcode = X86ISD::UNPCKH;
10305 else
10306 return SDValue();
10307
10308 // This is a "natural" unpack operation (rather than the 128-bit sectored
10309 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10310 // input in order to use the x86 instruction.
10311 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10312 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10313 V1 = DAG.getBitcast(VT, V1);
10314 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10315}
10316
10317// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10318// source into the lower elements and zeroing the upper elements.
10319static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10320 ArrayRef<int> Mask, const APInt &Zeroable,
10321 const X86Subtarget &Subtarget) {
10322 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10323 return false;
10324
10325 unsigned NumElts = Mask.size();
10326 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10327 unsigned MaxScale = 64 / EltSizeInBits;
10328
10329 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10330 unsigned SrcEltBits = EltSizeInBits * Scale;
10331 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10332 continue;
10333 unsigned NumSrcElts = NumElts / Scale;
10334 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10335 continue;
10336 unsigned UpperElts = NumElts - NumSrcElts;
10337 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10338 continue;
10339 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10340 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10341 DstVT = MVT::getIntegerVT(EltSizeInBits);
10342 if ((NumSrcElts * EltSizeInBits) >= 128) {
10343 // ISD::TRUNCATE
10344 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10345 } else {
10346 // X86ISD::VTRUNC
10347 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10348 }
10349 return true;
10350 }
10351
10352 return false;
10353}
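// Illustrative note (not in the upstream source): for VT == MVT::v16i8 and
// Scale == 2, the match above requires Mask to begin <0,2,4,6,8,10,12,14>
// with the upper 8 elements zeroable, i.e. a v8i16 -> v8i8 truncation whose
// result is zero-extended into the 128-bit destination (SrcVT = v8i16,
// DstVT = v16i8 via X86ISD::VTRUNC).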
10354
10355// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10356// element padding to the final DstVT.
10357static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10358 const X86Subtarget &Subtarget,
10359 SelectionDAG &DAG, bool ZeroUppers) {
10360 MVT SrcVT = Src.getSimpleValueType();
10361 MVT DstSVT = DstVT.getScalarType();
10362 unsigned NumDstElts = DstVT.getVectorNumElements();
10363 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10364 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10365
10366 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10367 return SDValue();
10368
10369 // Perform a direct ISD::TRUNCATE if possible.
10370 if (NumSrcElts == NumDstElts)
10371 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10372
10373 if (NumSrcElts > NumDstElts) {
10374 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10375 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10376 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10377 }
10378
10379 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10380 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10381 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10382 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10383 DstVT.getSizeInBits());
10384 }
10385
10386 // Non-VLX targets must truncate from a 512-bit type, so we need to
10387 // widen, truncate and then possibly extract the original subvector.
10388 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10389 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10390 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10391 }
10392
10393 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10394 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10395 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10396 if (DstVT != TruncVT)
10397 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10398 DstVT.getSizeInBits());
10399 return Trunc;
10400}
10401
10402// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10403//
10404// An example is the following:
10405//
10406// t0: ch = EntryToken
10407// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10408// t25: v4i32 = truncate t2
10409// t41: v8i16 = bitcast t25
10410// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10411// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10412// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10413// t18: v2i64 = bitcast t51
10414//
10415 // One can just use a single vpmovdw instruction; without avx512vl we need to
10416 // use the zmm variant and extract the lower subvector, padding with zeroes.
10417// TODO: Merge with lowerShuffleAsVTRUNC.
10418 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10419                                      SDValue V2, ArrayRef<int> Mask,
10420 const APInt &Zeroable,
10421 const X86Subtarget &Subtarget,
10422 SelectionDAG &DAG) {
10423 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10424 if (!Subtarget.hasAVX512())
10425 return SDValue();
10426
10427 unsigned NumElts = VT.getVectorNumElements();
10428 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10429 unsigned MaxScale = 64 / EltSizeInBits;
10430 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10431 unsigned SrcEltBits = EltSizeInBits * Scale;
10432 unsigned NumSrcElts = NumElts / Scale;
10433 unsigned UpperElts = NumElts - NumSrcElts;
10434 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10435 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10436 continue;
10437
10438     // Attempt to find a matching source truncation, but as a fallback VLX
10439 // cases can use the VPMOV directly.
10440 SDValue Src = peekThroughBitcasts(V1);
10441 if (Src.getOpcode() == ISD::TRUNCATE &&
10442 Src.getScalarValueSizeInBits() == SrcEltBits) {
10443 Src = Src.getOperand(0);
10444 } else if (Subtarget.hasVLX()) {
10445 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10446 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10447 Src = DAG.getBitcast(SrcVT, Src);
10448 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10449 if (Scale == 2 &&
10450 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10451 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10452 return SDValue();
10453 } else
10454 return SDValue();
10455
10456 // VPMOVWB is only available with avx512bw.
10457 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10458 return SDValue();
10459
10460 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10461 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10462 }
10463
10464 return SDValue();
10465}
10466
10467// Attempt to match binary shuffle patterns as a truncate.
10468 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10469                                     SDValue V2, ArrayRef<int> Mask,
10470 const APInt &Zeroable,
10471 const X86Subtarget &Subtarget,
10472 SelectionDAG &DAG) {
10473 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10474 "Unexpected VTRUNC type");
10475 if (!Subtarget.hasAVX512())
10476 return SDValue();
10477
10478 unsigned NumElts = VT.getVectorNumElements();
10479 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10480 unsigned MaxScale = 64 / EltSizeInBits;
10481 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10482 // TODO: Support non-BWI VPMOVWB truncations?
10483 unsigned SrcEltBits = EltSizeInBits * Scale;
10484 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10485 continue;
10486
10487 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10488 // Bail if the V2 elements are undef.
10489 unsigned NumHalfSrcElts = NumElts / Scale;
10490 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10491 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10492 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10493 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10494 continue;
10495
10496 // The elements beyond the truncation must be undef/zero.
10497 unsigned UpperElts = NumElts - NumSrcElts;
10498 if (UpperElts > 0 &&
10499 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10500 continue;
10501 bool UndefUppers =
10502 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10503
10504 // For offset truncations, ensure that the concat is cheap.
10505 if (Offset) {
10506 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10507 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10508 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10509 return Lo.getOperand(0) == Hi.getOperand(0);
10510 if (ISD::isNormalLoad(Lo.getNode()) &&
10511 ISD::isNormalLoad(Hi.getNode())) {
10512 auto *LDLo = cast<LoadSDNode>(Lo);
10513 auto *LDHi = cast<LoadSDNode>(Hi);
10514             return DAG.areNonVolatileConsecutiveLoads(
10515                 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10516 }
10517 return false;
10518 };
10519 if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10520 continue;
10521 }
10522
10523       // As we're using both sources, we need to concat them together
10524 // and truncate from the double-sized src.
10525 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10526 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10527
10528 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10529 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10530 Src = DAG.getBitcast(SrcVT, Src);
10531
10532 // Shift the offset'd elements into place for the truncation.
10533 // TODO: Use getTargetVShiftByConstNode.
10534 if (Offset)
10535 Src = DAG.getNode(
10536 X86ISD::VSRLI, DL, SrcVT, Src,
10537 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10538
10539 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10540 }
10541 }
10542
10543 return SDValue();
10544}
10545
10546/// Check whether a compaction lowering can be done by dropping even/odd
10547/// elements and compute how many times even/odd elements must be dropped.
10548///
10549/// This handles shuffles which take every Nth element where N is a power of
10550/// two. Example shuffle masks:
10551///
10552/// (even)
10553/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10554/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10555/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10556/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10557/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10558/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10559///
10560/// (odd)
10561/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10562/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10563///
10564/// Any of these lanes can of course be undef.
10565///
10566/// This routine only supports N <= 3.
10567/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10568/// for larger N.
10569///
10570/// \returns N above, or the number of times even/odd elements must be dropped
10571/// if there is such a number. Otherwise returns zero.
10572static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10573 bool IsSingleInput) {
10574 // The modulus for the shuffle vector entries is based on whether this is
10575 // a single input or not.
10576 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10577 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10578 "We should only be called with masks with a power-of-2 size!");
10579
10580 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10581 int Offset = MatchEven ? 0 : 1;
10582
10583 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10584 // and 2^3 simultaneously. This is because we may have ambiguity with
10585 // partially undef inputs.
10586 bool ViableForN[3] = {true, true, true};
10587
10588 for (int i = 0, e = Mask.size(); i < e; ++i) {
10589 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10590 // want.
10591 if (Mask[i] < 0)
10592 continue;
10593
10594 bool IsAnyViable = false;
10595 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10596 if (ViableForN[j]) {
10597 uint64_t N = j + 1;
10598
10599 // The shuffle mask must be equal to (i * 2^N) % M.
10600 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10601 IsAnyViable = true;
10602 else
10603 ViableForN[j] = false;
10604 }
10605 // Early exit if we exhaust the possible powers of two.
10606 if (!IsAnyViable)
10607 break;
10608 }
10609
10610 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10611 if (ViableForN[j])
10612 return j + 1;
10613
10614 // Return 0 as there is no viable power of two.
10615 return 0;
10616}
10617
10618// X86 has dedicated pack instructions that can handle specific truncation
10619// operations: PACKSS and PACKUS.
10620// Checks for compaction shuffle masks if MaxStages > 1.
10621// TODO: Add support for matching multiple PACKSS/PACKUS stages.
10622static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10623 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10624 const SelectionDAG &DAG,
10625 const X86Subtarget &Subtarget,
10626 unsigned MaxStages = 1) {
10627 unsigned NumElts = VT.getVectorNumElements();
10628 unsigned BitSize = VT.getScalarSizeInBits();
10629 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10630 "Illegal maximum compaction");
10631
10632 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10633 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10634 unsigned NumPackedBits = NumSrcBits - BitSize;
10635 N1 = peekThroughBitcasts(N1);
10636 N2 = peekThroughBitcasts(N2);
10637 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10638 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10639 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10640 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10641 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10642 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10643 return false;
10644 if (Subtarget.hasSSE41() || BitSize == 8) {
10645 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10646 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10647 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10648 V1 = N1;
10649 V2 = N2;
10650 SrcVT = PackVT;
10651 PackOpcode = X86ISD::PACKUS;
10652 return true;
10653 }
10654 }
10655 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10656 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10657 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10658 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10659 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10660 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10661 V1 = N1;
10662 V2 = N2;
10663 SrcVT = PackVT;
10664 PackOpcode = X86ISD::PACKSS;
10665 return true;
10666 }
10667 return false;
10668 };
10669
10670 // Attempt to match against wider and wider compaction patterns.
10671 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10672 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10673 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10674
10675 // Try binary shuffle.
10676 SmallVector<int, 32> BinaryMask;
10677 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10678 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10679 if (MatchPACK(V1, V2, PackVT))
10680 return true;
10681
10682 // Try unary shuffle.
10683 SmallVector<int, 32> UnaryMask;
10684 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10685 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10686 if (MatchPACK(V1, V1, PackVT))
10687 return true;
10688 }
10689
10690 return false;
10691}
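// Illustrative note (not in the upstream source): for VT == MVT::v16i8 a
// single-stage compaction mask <0,2,4,...,30> over the concatenation of two
// v8i16 sources matches PACKUSWB when the upper byte of every word is known
// zero, or PACKSSWB when each word has more than 8 sign bits, since both
// instructions then act as a plain truncation.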
10692
10693 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10694                                     SDValue V2, ArrayRef<int> Mask,
10695 const X86Subtarget &Subtarget,
10696 SelectionDAG &DAG) {
10697 MVT PackVT;
10698 unsigned PackOpcode;
10699 unsigned SizeBits = VT.getSizeInBits();
10700 unsigned EltBits = VT.getScalarSizeInBits();
10701 unsigned MaxStages = Log2_32(64 / EltBits);
10702 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10703 Subtarget, MaxStages))
10704 return SDValue();
10705
10706 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10707 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10708
10709 // Don't lower multi-stage packs on AVX512, truncation is better.
10710 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10711 return SDValue();
10712
10713 // Pack to the largest type possible:
10714 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10715 unsigned MaxPackBits = 16;
10716 if (CurrentEltBits > 16 &&
10717 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10718 MaxPackBits = 32;
10719
10720 // Repeatedly pack down to the target size.
10721 SDValue Res;
10722 for (unsigned i = 0; i != NumStages; ++i) {
10723 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10724 unsigned NumSrcElts = SizeBits / SrcEltBits;
10725 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10726 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10727 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10728 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10729 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10730 DAG.getBitcast(SrcVT, V2));
10731 V1 = V2 = Res;
10732 CurrentEltBits /= 2;
10733 }
10734 assert(Res && Res.getValueType() == VT &&
10735 "Failed to lower compaction shuffle");
10736 return Res;
10737}
10738
10739/// Try to emit a bitmask instruction for a shuffle.
10740///
10741/// This handles cases where we can model a blend exactly as a bitmask due to
10742/// one of the inputs being zeroable.
10743 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10744                                      SDValue V2, ArrayRef<int> Mask,
10745 const APInt &Zeroable,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 MVT MaskVT = VT;
10749 MVT EltVT = VT.getVectorElementType();
10750 SDValue Zero, AllOnes;
10751 // Use f64 if i64 isn't legal.
10752 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10753 EltVT = MVT::f64;
10754 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10755 }
10756
10757 MVT LogicVT = VT;
10758 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10759 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10760 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
10761 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10762 LogicVT =
10763 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10764 } else {
10765 Zero = DAG.getConstant(0, DL, EltVT);
10766 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10767 }
10768
10769 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10770 SDValue V;
10771 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10772 if (Zeroable[i])
10773 continue;
10774 if (Mask[i] % Size != i)
10775 return SDValue(); // Not a blend.
10776 if (!V)
10777 V = Mask[i] < Size ? V1 : V2;
10778 else if (V != (Mask[i] < Size ? V1 : V2))
10779 return SDValue(); // Can only let one input through the mask.
10780
10781 VMaskOps[i] = AllOnes;
10782 }
10783 if (!V)
10784 return SDValue(); // No non-zeroable elements!
10785
10786 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10787 VMask = DAG.getBitcast(LogicVT, VMask);
10788 V = DAG.getBitcast(LogicVT, V);
10789 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10790 return DAG.getBitcast(VT, And);
10791}
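// Illustrative note (not in the upstream source): for a v4i32 shuffle
// <0,5,2,7> where V2 is zero, elements 1 and 3 are zeroable while elements 0
// and 2 pass straight through from V1, so the blend is emitted as
// AND(V1, <-1,0,-1,0>).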
10792
10793/// Try to emit a blend instruction for a shuffle using bit math.
10794///
10795/// This is used as a fallback approach when first class blend instructions are
10796/// unavailable. Currently it is only suitable for integer vectors, but could
10797/// be generalized for floating point vectors if desirable.
10798 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10799                                       SDValue V2, ArrayRef<int> Mask,
10800 SelectionDAG &DAG) {
10801 assert(VT.isInteger() && "Only supports integer vector types!");
10802 MVT EltVT = VT.getVectorElementType();
10803 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10804 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10805   SmallVector<SDValue, 16> MaskOps;
10806   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10807 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10808 return SDValue(); // Shuffled input!
10809 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10810 }
10811
10812 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10813 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10814}
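// Illustrative note (not in the upstream source): the same v4i32 mask
// <0,5,2,7> with a live V2 builds the per-element select mask <-1,0,-1,0>
// and lowers, via getBitSelect, to roughly (V1 & M) | (V2 & ~M).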
10815
10816 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10817                                     SDValue PreservedSrc,
10818 const X86Subtarget &Subtarget,
10819 SelectionDAG &DAG);
10820
10821 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10822                                 MutableArrayRef<int> Mask,
10823                                 const APInt &Zeroable, bool &ForceV1Zero,
10824 bool &ForceV2Zero, uint64_t &BlendMask) {
10825 bool V1IsZeroOrUndef =
10826       V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10827   bool V2IsZeroOrUndef =
10828 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10829
10830 BlendMask = 0;
10831 ForceV1Zero = false, ForceV2Zero = false;
10832 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10833
10834 int NumElts = Mask.size();
10835 int NumLanes = VT.getSizeInBits() / 128;
10836 int NumEltsPerLane = NumElts / NumLanes;
10837 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10838
10839 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10840 // then ensure the blend mask part for that lane just references that input.
10841 bool ForceWholeLaneMasks =
10842 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10843
10844 // Attempt to generate the binary blend mask. If an input is zero then
10845 // we can use any lane.
10846 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10847 // Keep track of the inputs used per lane.
10848 bool LaneV1InUse = false;
10849 bool LaneV2InUse = false;
10850 uint64_t LaneBlendMask = 0;
10851 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10852 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10853 int M = Mask[Elt];
10854 if (M == SM_SentinelUndef)
10855 continue;
10856 if (M == Elt || (0 <= M && M < NumElts &&
10857 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10858 Mask[Elt] = Elt;
10859 LaneV1InUse = true;
10860 continue;
10861 }
10862 if (M == (Elt + NumElts) ||
10863 (NumElts <= M &&
10864 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10865 LaneBlendMask |= 1ull << LaneElt;
10866 Mask[Elt] = Elt + NumElts;
10867 LaneV2InUse = true;
10868 continue;
10869 }
10870 if (Zeroable[Elt]) {
10871 if (V1IsZeroOrUndef) {
10872 ForceV1Zero = true;
10873 Mask[Elt] = Elt;
10874 LaneV1InUse = true;
10875 continue;
10876 }
10877 if (V2IsZeroOrUndef) {
10878 ForceV2Zero = true;
10879 LaneBlendMask |= 1ull << LaneElt;
10880 Mask[Elt] = Elt + NumElts;
10881 LaneV2InUse = true;
10882 continue;
10883 }
10884 }
10885 return false;
10886 }
10887
10888 // If we only used V2 then splat the lane blend mask to avoid any demanded
10889 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10890 // blend mask bit).
10891 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10892 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10893
10894 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10895 }
10896 return true;
10897}
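// Illustrative note (not in the upstream source): for a v8i32 shuffle
// <0,9,2,11,4,13,6,15>, elements 1, 3, 5 and 7 come from V2, so the routine
// above produces BlendMask = 0b10101010 (0xAA) with neither input forced to
// zero.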
10898
10899/// Try to emit a blend instruction for a shuffle.
10900///
10901/// This doesn't do any checks for the availability of instructions for blending
10902/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10903/// be matched in the backend with the type given. What it does check for is
10904/// that the shuffle mask is a blend, or convertible into a blend with zero.
10905 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10906                                    SDValue V2, ArrayRef<int> Original,
10907 const APInt &Zeroable,
10908 const X86Subtarget &Subtarget,
10909 SelectionDAG &DAG) {
10910 uint64_t BlendMask = 0;
10911 bool ForceV1Zero = false, ForceV2Zero = false;
10912 SmallVector<int, 64> Mask(Original);
10913 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10914 BlendMask))
10915 return SDValue();
10916
10917 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10918 if (ForceV1Zero)
10919 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10920 if (ForceV2Zero)
10921 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10922
10923 unsigned NumElts = VT.getVectorNumElements();
10924
10925 switch (VT.SimpleTy) {
10926 case MVT::v4i64:
10927 case MVT::v8i32:
10928 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10929 [[fallthrough]];
10930 case MVT::v4f64:
10931 case MVT::v8f32:
10932 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10933 [[fallthrough]];
10934 case MVT::v2f64:
10935 case MVT::v2i64:
10936 case MVT::v4f32:
10937 case MVT::v4i32:
10938 case MVT::v8i16:
10939 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10940 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10941 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10942 case MVT::v16i16: {
10943 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10944 SmallVector<int, 8> RepeatedMask;
10945 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10946 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10947 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10948 BlendMask = 0;
10949 for (int i = 0; i < 8; ++i)
10950 if (RepeatedMask[i] >= 8)
10951 BlendMask |= 1ull << i;
10952 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10953 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10954 }
10955 // Use PBLENDW for lower/upper lanes and then blend lanes.
10956 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10957 // merge to VSELECT where useful.
10958 uint64_t LoMask = BlendMask & 0xFF;
10959 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10960 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10961 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10962 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10963 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10964 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10965 return DAG.getVectorShuffle(
10966 MVT::v16i16, DL, Lo, Hi,
10967 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10968 }
10969 [[fallthrough]];
10970 }
10971 case MVT::v32i8:
10972 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10973 [[fallthrough]];
10974 case MVT::v16i8: {
10975 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10976
10977 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10978 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10979 Subtarget, DAG))
10980 return Masked;
10981
10982 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10983 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10984 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10985 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10986 }
10987
10988 // If we have VPTERNLOG, we can use that as a bit blend.
10989 if (Subtarget.hasVLX())
10990 if (SDValue BitBlend =
10991 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10992 return BitBlend;
10993
10994 // Scale the blend by the number of bytes per element.
10995 int Scale = VT.getScalarSizeInBits() / 8;
10996
10997 // This form of blend is always done on bytes. Compute the byte vector
10998 // type.
10999 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11000
11001 // x86 allows load folding with blendvb from the 2nd source operand. But
11002 // we are still using LLVM select here (see comment below), so that's V1.
11003 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
11004 // allow that load-folding possibility.
11005 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
11006       ShuffleVectorSDNode::commuteMask(Mask);
11007       std::swap(V1, V2);
11008 }
11009
11010 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
11011 // mix of LLVM's code generator and the x86 backend. We tell the code
11012 // generator that boolean values in the elements of an x86 vector register
11013 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11014 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11015 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11016 // of the element (the remaining are ignored) and 0 in that high bit would
11017 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11018 // the LLVM model for boolean values in vector elements gets the relevant
11019 // bit set, it is set backwards and over constrained relative to x86's
11020 // actual model.
11021 SmallVector<SDValue, 32> VSELECTMask;
11022 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11023 for (int j = 0; j < Scale; ++j)
11024 VSELECTMask.push_back(
11025 Mask[i] < 0
11026 ? DAG.getUNDEF(MVT::i8)
11027 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11028
11029 V1 = DAG.getBitcast(BlendVT, V1);
11030 V2 = DAG.getBitcast(BlendVT, V2);
11031 return DAG.getBitcast(
11032 VT,
11033 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11034 V1, V2));
11035 }
11036 case MVT::v16f32:
11037 case MVT::v8f64:
11038 case MVT::v8i64:
11039 case MVT::v16i32:
11040 case MVT::v32i16:
11041 case MVT::v64i8: {
11042 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11043 bool OptForSize = DAG.shouldOptForSize();
11044 if (!OptForSize) {
11045 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11046 Subtarget, DAG))
11047 return Masked;
11048 }
11049
11050 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11051 // masked move.
11052 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11053 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11054 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11055 }
11056 default:
11057 llvm_unreachable("Not a supported integer vector type!");
11058 }
11059}
11060
11061/// Try to lower as a blend of elements from two inputs followed by
11062/// a single-input permutation.
11063///
11064/// This matches the pattern where we can blend elements from two inputs and
11065/// then reduce the shuffle to a single-input permutation.
11066 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11067                                              SDValue V1, SDValue V2,
11068 ArrayRef<int> Mask,
11069 SelectionDAG &DAG,
11070 bool ImmBlends = false) {
11071 // We build up the blend mask while checking whether a blend is a viable way
11072 // to reduce the shuffle.
11073 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11074 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11075
11076 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11077 if (Mask[i] < 0)
11078 continue;
11079
11080 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11081
11082 if (BlendMask[Mask[i] % Size] < 0)
11083 BlendMask[Mask[i] % Size] = Mask[i];
11084 else if (BlendMask[Mask[i] % Size] != Mask[i])
11085 return SDValue(); // Can't blend in the needed input!
11086
11087 PermuteMask[i] = Mask[i] % Size;
11088 }
11089
11090 // If only immediate blends, then bail if the blend mask can't be widened to
11091 // i16.
11092 unsigned EltSize = VT.getScalarSizeInBits();
11093 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11094 return SDValue();
11095
11096 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11097 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11098}
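// Illustrative note (not in the upstream source): a v4i32 shuffle <2,1,0,7>
// decomposes into the blend <0,1,2,7> of V1/V2 followed by the single-input
// permute <2,1,0,3>.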
11099
11100/// Try to lower as an unpack of elements from two inputs followed by
11101/// a single-input permutation.
11102///
11103/// This matches the pattern where we can unpack elements from two inputs and
11104/// then reduce the shuffle to a single-input (wider) permutation.
11105 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11106                                              SDValue V1, SDValue V2,
11107 ArrayRef<int> Mask,
11108 SelectionDAG &DAG) {
11109 int NumElts = Mask.size();
11110 int NumLanes = VT.getSizeInBits() / 128;
11111 int NumLaneElts = NumElts / NumLanes;
11112 int NumHalfLaneElts = NumLaneElts / 2;
11113
11114 bool MatchLo = true, MatchHi = true;
11115 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11116
11117 // Determine UNPCKL/UNPCKH type and operand order.
11118 for (int Elt = 0; Elt != NumElts; ++Elt) {
11119 int M = Mask[Elt];
11120 if (M < 0)
11121 continue;
11122
11123 // Normalize the mask value depending on whether it's V1 or V2.
11124 int NormM = M;
11125 SDValue &Op = Ops[Elt & 1];
11126 if (M < NumElts && (Op.isUndef() || Op == V1))
11127 Op = V1;
11128 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11129 Op = V2;
11130 NormM -= NumElts;
11131 } else
11132 return SDValue();
11133
11134 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11135 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11136 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11137 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11138 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11139 if (MatchLoAnyLane || MatchHiAnyLane) {
11140 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11141 "Failed to match UNPCKLO/UNPCKHI");
11142 break;
11143 }
11144 }
11145 MatchLo &= MatchLoAnyLane;
11146 MatchHi &= MatchHiAnyLane;
11147 if (!MatchLo && !MatchHi)
11148 return SDValue();
11149 }
11150 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11151
11152 // Element indices have changed after unpacking. Calculate permute mask
11153 // so that they will be put back to the position as dictated by the
11154 // original shuffle mask indices.
11155 SmallVector<int, 32> PermuteMask(NumElts, -1);
11156 for (int Elt = 0; Elt != NumElts; ++Elt) {
11157 int M = Mask[Elt];
11158 if (M < 0)
11159 continue;
11160 int NormM = M;
11161 if (NumElts <= M)
11162 NormM -= NumElts;
11163 bool IsFirstOp = M < NumElts;
11164 int BaseMaskElt =
11165 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11166 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11167 PermuteMask[Elt] = BaseMaskElt;
11168 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11169 PermuteMask[Elt] = BaseMaskElt + 1;
11170 assert(PermuteMask[Elt] != -1 &&
11171 "Input mask element is defined but failed to assign permute mask");
11172 }
11173
11174 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11175 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11176 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11177}
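// Illustrative note (not in the upstream source): a v4i32 shuffle <1,4,0,5>
// takes its even elements from the low half of V1 and its odd elements from
// the low half of V2, so it becomes UNPCKL(V1,V2) = <a0,b0,a1,b1> followed by
// the permute <2,1,0,3>.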
11178
11179/// Try to lower a shuffle as a permute of the inputs followed by an
11180/// UNPCK instruction.
11181///
11182/// This specifically targets cases where we end up with alternating between
11183/// the two inputs, and so can permute them into something that feeds a single
11184/// UNPCK instruction. Note that this routine only targets integer vectors
11185/// because for floating point vectors we have a generalized SHUFPS lowering
11186/// strategy that handles everything that doesn't *exactly* match an unpack,
11187/// making this clever lowering unnecessary.
11188 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11189                                               SDValue V1, SDValue V2,
11190 ArrayRef<int> Mask,
11191 const X86Subtarget &Subtarget,
11192 SelectionDAG &DAG) {
11193 int Size = Mask.size();
11194 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11195
11196 // This routine only supports 128-bit integer dual input vectors.
11197 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11198 return SDValue();
11199
11200 int NumLoInputs =
11201 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11202 int NumHiInputs =
11203 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11204
11205 bool UnpackLo = NumLoInputs >= NumHiInputs;
11206
11207 auto TryUnpack = [&](int ScalarSize, int Scale) {
11208 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11209 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11210
11211 for (int i = 0; i < Size; ++i) {
11212 if (Mask[i] < 0)
11213 continue;
11214
11215 // Each element of the unpack contains Scale elements from this mask.
11216 int UnpackIdx = i / Scale;
11217
11218 // We only handle the case where V1 feeds the first slots of the unpack.
11219 // We rely on canonicalization to ensure this is the case.
11220 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11221 return SDValue();
11222
11223 // Setup the mask for this input. The indexing is tricky as we have to
11224 // handle the unpack stride.
11225 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11226 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11227 Mask[i] % Size;
11228 }
11229
11230 // If we will have to shuffle both inputs to use the unpack, check whether
11231 // we can just unpack first and shuffle the result. If so, skip this unpack.
11232 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11233 !isNoopShuffleMask(V2Mask))
11234 return SDValue();
11235
11236 // Shuffle the inputs into place.
11237 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11238 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11239
11240 // Cast the inputs to the type we will use to unpack them.
11241 MVT UnpackVT =
11242 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11243 V1 = DAG.getBitcast(UnpackVT, V1);
11244 V2 = DAG.getBitcast(UnpackVT, V2);
11245
11246 // Unpack the inputs and cast the result back to the desired type.
11247 return DAG.getBitcast(
11248 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11249 UnpackVT, V1, V2));
11250 };
11251
11252 // We try each unpack from the largest to the smallest to try and find one
11253 // that fits this mask.
11254 int OrigScalarSize = VT.getScalarSizeInBits();
11255 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11256 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11257 return Unpack;
11258
11259 // If we're shuffling with a zero vector then we're better off not doing
11260 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11261   if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11262       ISD::isBuildVectorAllZeros(V2.getNode()))
11263 return SDValue();
11264
11265 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11266 // initial unpack.
11267 if (NumLoInputs == 0 || NumHiInputs == 0) {
11268 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11269 "We have to have *some* inputs!");
11270 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11271
11272 // FIXME: We could consider the total complexity of the permute of each
11273 // possible unpacking. Or at the least we should consider how many
11274 // half-crossings are created.
11275 // FIXME: We could consider commuting the unpacks.
11276
11277 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11278 for (int i = 0; i < Size; ++i) {
11279 if (Mask[i] < 0)
11280 continue;
11281
11282 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11283
11284 PermMask[i] =
11285 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11286 }
11287 return DAG.getVectorShuffle(
11288 VT, DL,
11289 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11290 V1, V2),
11291 DAG.getUNDEF(VT), PermMask);
11292 }
11293
11294 return SDValue();
11295}
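// Illustrative note (not in the upstream source): a v8i16 shuffle
// <1,8,3,10,5,12,7,14> is handled by pre-shuffling V1 to <1,3,5,7,...> and V2
// to <0,2,4,6,...> and then emitting a single UNPCKL, which interleaves those
// two low halves into the requested order.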
11296
11297/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11298/// permuting the elements of the result in place.
11299 static SDValue lowerShuffleAsByteRotateAndPermute(
11300     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11301 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11302 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11303 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11304 (VT.is512BitVector() && !Subtarget.hasBWI()))
11305 return SDValue();
11306
11307 // We don't currently support lane crossing permutes.
11308 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11309 return SDValue();
11310
11311 int Scale = VT.getScalarSizeInBits() / 8;
11312 int NumLanes = VT.getSizeInBits() / 128;
11313 int NumElts = VT.getVectorNumElements();
11314 int NumEltsPerLane = NumElts / NumLanes;
11315
11316 // Determine range of mask elts.
11317 bool Blend1 = true;
11318 bool Blend2 = true;
11319 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11320 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11321 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11322 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11323 int M = Mask[Lane + Elt];
11324 if (M < 0)
11325 continue;
11326 if (M < NumElts) {
11327 Blend1 &= (M == (Lane + Elt));
11328 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11329 M = M % NumEltsPerLane;
11330 Range1.first = std::min(Range1.first, M);
11331 Range1.second = std::max(Range1.second, M);
11332 } else {
11333 M -= NumElts;
11334 Blend2 &= (M == (Lane + Elt));
11335 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11336 M = M % NumEltsPerLane;
11337 Range2.first = std::min(Range2.first, M);
11338 Range2.second = std::max(Range2.second, M);
11339 }
11340 }
11341 }
11342
11343 // Bail if we don't need both elements.
11344 // TODO - it might be worth doing this for unary shuffles if the permute
11345 // can be widened.
11346 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11347 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11348 return SDValue();
11349
11350 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11351 return SDValue();
11352
11353 // Rotate the 2 ops so we can access both ranges, then permute the result.
11354 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11355 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11356 SDValue Rotate = DAG.getBitcast(
11357 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11358 DAG.getBitcast(ByteVT, Lo),
11359 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11360 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11361 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11362 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11363 int M = Mask[Lane + Elt];
11364 if (M < 0)
11365 continue;
11366 if (M < NumElts)
11367 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11368 else
11369 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11370 }
11371 }
11372 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11373 };
11374
11375 // Check if the ranges are small enough to rotate from either direction.
11376 if (Range2.second < Range1.first)
11377 return RotateAndPermute(V1, V2, Range1.first, 0);
11378 if (Range1.second < Range2.first)
11379 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11380 return SDValue();
11381}
11382
11383 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11384   return isUndefOrEqual(Mask, 0);
11385}
11386
11387 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11388   return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11389}
11390
11391/// Check if the Mask consists of the same element repeated multiple times.
11393 size_t NumUndefs = 0;
11394 std::optional<int> UniqueElt;
11395 for (int Elt : Mask) {
11396 if (Elt == SM_SentinelUndef) {
11397 NumUndefs++;
11398 continue;
11399 }
11400 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11401 return false;
11402 UniqueElt = Elt;
11403 }
11404 // Make sure the element is repeated enough times by checking the number of
11405 // undefs is small.
11406 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11407}
11408
11409/// Generic routine to decompose a shuffle and blend into independent
11410/// blends and permutes.
11411///
11412/// This matches the extremely common pattern for handling combined
11413/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11414/// operations. It will try to pick the best arrangement of shuffles and
11415/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
11416 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11417     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11418 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11419 int NumElts = Mask.size();
11420 int NumLanes = VT.getSizeInBits() / 128;
11421 int NumEltsPerLane = NumElts / NumLanes;
11422
11423 // Shuffle the input elements into the desired positions in V1 and V2 and
11424 // unpack/blend them together.
11425 bool IsAlternating = true;
11426 bool V1Zero = true, V2Zero = true;
11427 SmallVector<int, 32> V1Mask(NumElts, -1);
11428 SmallVector<int, 32> V2Mask(NumElts, -1);
11429 SmallVector<int, 32> FinalMask(NumElts, -1);
11430 for (int i = 0; i < NumElts; ++i) {
11431 int M = Mask[i];
11432 if (M >= 0 && M < NumElts) {
11433 V1Mask[i] = M;
11434 FinalMask[i] = i;
11435 V1Zero &= Zeroable[i];
11436 IsAlternating &= (i & 1) == 0;
11437 } else if (M >= NumElts) {
11438 V2Mask[i] = M - NumElts;
11439 FinalMask[i] = i + NumElts;
11440 V2Zero &= Zeroable[i];
11441 IsAlternating &= (i & 1) == 1;
11442 }
11443 }
11444
11445   // If we effectively demand only the 0'th element of \p Input (though not
11446   // necessarily just in the 0'th position), then broadcast said input
11447   // and change \p InputMask to be a no-op (identity) mask.
11448 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11449 &DAG](SDValue &Input,
11450 MutableArrayRef<int> InputMask) {
11451 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11452 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11453 !X86::mayFoldLoad(Input, Subtarget)))
11454 return;
11455 if (isNoopShuffleMask(InputMask))
11456 return;
11457 assert(isBroadcastShuffleMask(InputMask) &&
11458 "Expected to demand only the 0'th element.");
11459 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11460 for (auto I : enumerate(InputMask)) {
11461 int &InputMaskElt = I.value();
11462 if (InputMaskElt >= 0)
11463 InputMaskElt = I.index();
11464 }
11465 };
11466
11467 // Currently, we may need to produce one shuffle per input, and blend results.
11468 // It is possible that the shuffle for one of the inputs is already a no-op.
11469 // See if we can simplify non-no-op shuffles into broadcasts,
11470 // which we consider to be strictly better than an arbitrary shuffle.
11471 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11472       isNoopOrBroadcastShuffleMask(V2Mask)) {
11473     canonicalizeBroadcastableInput(V1, V1Mask);
11474 canonicalizeBroadcastableInput(V2, V2Mask);
11475 }
11476
11477 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11478 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11479 // the shuffle may be able to fold with a load or other benefit. However, when
11480 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11481 // pre-shuffle first is a better strategy.
11482 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11483 // Only prefer immediate blends to unpack/rotate.
11484 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11485 DAG, true))
11486 return BlendPerm;
11487 // If either input vector provides only a single element which is repeated
11488 // multiple times, unpacking from both input vectors would generate worse
11489 // code. e.g. for
11490 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11491 // it is better to process t4 first to create a vector of t4[0], then unpack
11492 // that vector with t2.
11493 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11494         !isSingleElementRepeatedMask(V2Mask)) {
11495       if (SDValue UnpackPerm =
11496 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11497 return UnpackPerm;
11498       if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11499               DL, VT, V1, V2, Mask, Subtarget, DAG))
11500 return RotatePerm;
11501 // Unpack/rotate failed - try again with variable blends.
11502 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11503 DAG))
11504 return BlendPerm;
11505 if (VT.getScalarSizeInBits() >= 32)
11506 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11507 DL, VT, V1, V2, Mask, Subtarget, DAG))
11508 return PermUnpack;
11509 }
11510
11511 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11512 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11513 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11514 // than half the elements coming from each source.
11515 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11516 V1Mask.assign(NumElts, -1);
11517 V2Mask.assign(NumElts, -1);
11518 FinalMask.assign(NumElts, -1);
11519 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11520 for (int j = 0; j != NumEltsPerLane; ++j) {
11521 int M = Mask[i + j];
11522 if (M >= 0 && M < NumElts) {
11523 V1Mask[i + (j / 2)] = M;
11524 FinalMask[i + j] = i + (j / 2);
11525 } else if (M >= NumElts) {
11526 V2Mask[i + (j / 2)] = M - NumElts;
11527 FinalMask[i + j] = i + (j / 2) + NumElts;
11528 }
11529 }
11530 }
11531
11532 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11533 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11534 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11535}
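// Illustrative note (not in the upstream source): a v4i32 shuffle <3,5,1,6>
// is decomposed above into V1Mask = <3,-1,1,-1>, V2Mask = <-1,1,-1,2> and the
// final blend mask <0,5,2,7>, i.e. each input is shuffled into place first
// and the results are then blended element-wise.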
11536
11537static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11538 const X86Subtarget &Subtarget,
11539 ArrayRef<int> Mask) {
11540 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11541 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11542
11543 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11544 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11545 int MaxSubElts = 64 / EltSizeInBits;
11546 unsigned RotateAmt, NumSubElts;
11547 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11548 MaxSubElts, NumSubElts, RotateAmt))
11549 return -1;
11550 unsigned NumElts = Mask.size();
11551 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11552 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11553 return RotateAmt;
11554}
11555
11556/// Lower shuffle using X86ISD::VROTLI rotations.
11557 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11558                                        ArrayRef<int> Mask,
11559 const X86Subtarget &Subtarget,
11560 SelectionDAG &DAG) {
11561 // Only XOP + AVX512 targets have bit rotation instructions.
11562 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11563 bool IsLegal =
11564 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11565 if (!IsLegal && Subtarget.hasSSE3())
11566 return SDValue();
11567
11568 MVT RotateVT;
11569 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11570 Subtarget, Mask);
11571 if (RotateAmt < 0)
11572 return SDValue();
11573
11574 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11575 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11576   // widen to vXi16 or more then the existing lowering will be better.
11577 if (!IsLegal) {
11578 if ((RotateAmt % 16) == 0)
11579 return SDValue();
11580 // TODO: Use getTargetVShiftByConstNode.
11581 unsigned ShlAmt = RotateAmt;
11582 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11583 V1 = DAG.getBitcast(RotateVT, V1);
11584 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11585 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11586 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11587 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11588 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11589 return DAG.getBitcast(VT, Rot);
11590 }
11591
11592 SDValue Rot =
11593 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11594 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11595 return DAG.getBitcast(VT, Rot);
11596}
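// Illustrative note (not in the upstream source): a v16i8 shuffle
// <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14> rotates each 4-byte group, so it
// should match with RotateVT = v4i32 and RotateAmt = 8 and lower to a single
// X86ISD::VROTLI on XOP/AVX512 targets.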
11597
11598/// Try to match a vector shuffle as an element rotation.
11599///
11600 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11601 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11602                                        ArrayRef<int> Mask) {
11603 int NumElts = Mask.size();
11604
11605 // We need to detect various ways of spelling a rotation:
11606 // [11, 12, 13, 14, 15, 0, 1, 2]
11607 // [-1, 12, 13, 14, -1, -1, 1, -1]
11608 // [-1, -1, -1, -1, -1, -1, 1, 2]
11609 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11610 // [-1, 4, 5, 6, -1, -1, 9, -1]
11611 // [-1, 4, 5, 6, -1, -1, -1, -1]
11612 int Rotation = 0;
11613 SDValue Lo, Hi;
11614 for (int i = 0; i < NumElts; ++i) {
11615 int M = Mask[i];
11616 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11617 "Unexpected mask index.");
11618 if (M < 0)
11619 continue;
11620
11621 // Determine where a rotated vector would have started.
11622 int StartIdx = i - (M % NumElts);
11623 if (StartIdx == 0)
11624 // The identity rotation isn't interesting, stop.
11625 return -1;
11626
11627 // If we found the tail of a vector the rotation must be the missing
11628 // front. If we found the head of a vector, it must be how much of the
11629 // head.
11630 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11631
11632 if (Rotation == 0)
11633 Rotation = CandidateRotation;
11634 else if (Rotation != CandidateRotation)
11635 // The rotations don't match, so we can't match this mask.
11636 return -1;
11637
11638 // Compute which value this mask is pointing at.
11639 SDValue MaskV = M < NumElts ? V1 : V2;
11640
11641 // Compute which of the two target values this index should be assigned
11642 // to. This reflects whether the high elements are remaining or the low
11643 // elements are remaining.
11644 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11645
11646 // Either set up this value if we've not encountered it before, or check
11647 // that it remains consistent.
11648 if (!TargetV)
11649 TargetV = MaskV;
11650 else if (TargetV != MaskV)
11651 // This may be a rotation, but it pulls from the inputs in some
11652 // unsupported interleaving.
11653 return -1;
11654 }
11655
11656 // Check that we successfully analyzed the mask, and normalize the results.
11657 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11658 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11659 if (!Lo)
11660 Lo = Hi;
11661 else if (!Hi)
11662 Hi = Lo;
11663
11664 V1 = Lo;
11665 V2 = Hi;
11666
11667 return Rotation;
11668}
11669
11670/// Try to lower a vector shuffle as a byte rotation.
11671///
11672/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11673/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11674/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11675 /// try to generically lower a vector shuffle through such a pattern. It
11676/// does not check for the profitability of lowering either as PALIGNR or
11677/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11678/// This matches shuffle vectors that look like:
11679///
11680/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11681///
11682/// Essentially it concatenates V1 and V2, shifts right by some number of
11683/// elements, and takes the low elements as the result. Note that while this is
11684/// specified as a *right shift* because x86 is little-endian, it is a *left
11685/// rotate* of the vector lanes.
11686 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11687                                     ArrayRef<int> Mask) {
11688 // Don't accept any shuffles with zero elements.
11689 if (isAnyZero(Mask))
11690 return -1;
11691
11692 // PALIGNR works on 128-bit lanes.
11693 SmallVector<int, 16> RepeatedMask;
11694 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11695 return -1;
11696
11697 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11698 if (Rotation <= 0)
11699 return -1;
11700
11701 // PALIGNR rotates bytes, so we need to scale the
11702 // rotation based on how many bytes are in the vector lane.
11703 int NumElts = RepeatedMask.size();
11704 int Scale = 16 / NumElts;
11705 return Rotation * Scale;
11706}
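// Illustrative note (not in the upstream source): for the v8i16 mask
// [11,12,13,14,15,0,1,2] shown earlier, matchShuffleAsElementRotate returns a
// rotation of 3 elements with Lo = V1 and Hi = V2, which is scaled here to a
// PALIGNR byte immediate of 6.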
11707
11708 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11709                                         SDValue V2, ArrayRef<int> Mask,
11710 const X86Subtarget &Subtarget,
11711 SelectionDAG &DAG) {
11712 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11713
11714 SDValue Lo = V1, Hi = V2;
11715 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11716 if (ByteRotation <= 0)
11717 return SDValue();
11718
11719 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11720 // PSLLDQ/PSRLDQ.
11721 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11722 Lo = DAG.getBitcast(ByteVT, Lo);
11723 Hi = DAG.getBitcast(ByteVT, Hi);
11724
11725 // SSSE3 targets can use the palignr instruction.
11726 if (Subtarget.hasSSSE3()) {
11727 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11728 "512-bit PALIGNR requires BWI instructions");
11729 return DAG.getBitcast(
11730 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11731 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11732 }
11733
11734 assert(VT.is128BitVector() &&
11735 "Rotate-based lowering only supports 128-bit lowering!");
11736 assert(Mask.size() <= 16 &&
11737 "Can shuffle at most 16 bytes in a 128-bit vector!");
11738 assert(ByteVT == MVT::v16i8 &&
11739 "SSE2 rotate lowering only needed for v16i8!");
11740
11741 // Default SSE2 implementation
11742 int LoByteShift = 16 - ByteRotation;
11743 int HiByteShift = ByteRotation;
11744
11745 SDValue LoShift =
11746 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11747 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11748 SDValue HiShift =
11749 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11750 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11751 return DAG.getBitcast(VT,
11752 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11753}
11754
11755/// Try to lower a vector shuffle as a dword/qword rotation.
11756///
11757/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11758/// rotation of the concatenation of two vectors; this routine will
11759/// try to generically lower a vector shuffle through such a pattern.
11760///
11761/// Essentially it concatenates V1 and V2, shifts right by some number of
11762/// elements, and takes the low elements as the result. Note that while this is
11763/// specified as a *right shift* because x86 is little-endian, it is a *left
11764/// rotate* of the vector lanes.
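/// For example, a v8i32 mask <3, 4, 5, 6, 7, 8, 9, 10> matches an element
/// rotation of 3 and can be lowered to a single VALIGND with immediate 3.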
11765static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11766 SDValue V2, ArrayRef<int> Mask,
11767 const APInt &Zeroable,
11768 const X86Subtarget &Subtarget,
11769 SelectionDAG &DAG) {
11770 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11771 "Only 32-bit and 64-bit elements are supported!");
11772
11773 // 128/256-bit vectors are only supported with VLX.
11774 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11775 && "VLX required for 128/256-bit vectors");
11776
11777 SDValue Lo = V1, Hi = V2;
11778 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11779 if (0 < Rotation)
11780 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11781 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11782
11783 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11784 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11785 // TODO: We can probably make this more aggressive and use shift-pairs like
11786 // lowerShuffleAsByteShiftMask.
11787 unsigned NumElts = Mask.size();
11788 unsigned ZeroLo = Zeroable.countr_one();
11789 unsigned ZeroHi = Zeroable.countl_one();
11790 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11791 if (!ZeroLo && !ZeroHi)
11792 return SDValue();
11793
11794 if (ZeroLo) {
11795 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11796 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11797 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11798 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11799 getZeroVector(VT, Subtarget, DAG, DL),
11800 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11801 }
11802
11803 if (ZeroHi) {
11804 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11805 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11806 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11807 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11808 getZeroVector(VT, Subtarget, DAG, DL), Src,
11809 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11810 }
11811
11812 return SDValue();
11813}
11814
11815/// Try to lower a vector shuffle as a byte shift sequence.
11816static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11817 SDValue V2, ArrayRef<int> Mask,
11818 const APInt &Zeroable,
11819 const X86Subtarget &Subtarget,
11820 SelectionDAG &DAG) {
11821 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11822 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11823
11824 // We need a shuffle that has zeros at one/both ends and a sequential
11825 // shuffle from one source within.
11826 unsigned ZeroLo = Zeroable.countr_one();
11827 unsigned ZeroHi = Zeroable.countl_one();
11828 if (!ZeroLo && !ZeroHi)
11829 return SDValue();
11830
11831 unsigned NumElts = Mask.size();
11832 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11833 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11834 return SDValue();
11835
11836 unsigned Scale = VT.getScalarSizeInBits() / 8;
11837 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11838 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11839 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11840 return SDValue();
11841
11842 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11843 Res = DAG.getBitcast(MVT::v16i8, Res);
11844
11845 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11846 // inner sequential set of elements, possibly offset:
11847 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11848 // 01234567 --> 4567zzzz --> zzzzz456
11849 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11850 if (ZeroLo == 0) {
11851 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11852 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11853 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11854 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11855 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11856 } else if (ZeroHi == 0) {
11857 unsigned Shift = Mask[ZeroLo] % NumElts;
11858 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11859 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11860 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11861 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11862 } else if (!Subtarget.hasSSSE3()) {
11863 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11864 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11865 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11866 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11867 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11868 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11869 Shift += Mask[ZeroLo] % NumElts;
11870 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11871 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11872 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11873 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11874 } else
11875 return SDValue();
11876
11877 return DAG.getBitcast(VT, Res);
11878}
11879
11880/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11881///
11882/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11883/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11884/// matches elements from one of the input vectors shuffled to the left or
11885/// right with zeroable elements 'shifted in'. It handles both the strictly
11886/// bit-wise element shifts and the byte shift across an entire 128-bit double
11887/// quad word lane.
11888///
11889/// PSLL : (little-endian) left bit shift.
11890/// [ zz, 0, zz, 2 ]
11891/// [ -1, 4, zz, -1 ]
11892/// PSRL : (little-endian) right bit shift.
11893/// [ 1, zz, 3, zz]
11894/// [ -1, -1, 7, zz]
11895/// PSLLDQ : (little-endian) left byte shift
11896/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11897/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11898/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11899/// PSRLDQ : (little-endian) right byte shift
11900/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11901/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11902/// [ 1, 2, -1, -1, -1, -1, zz, zz]
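/// The matched amount is returned in bits for VSHLI/VSRLI and in bytes for
/// VSHLDQ/VSRLDQ; e.g. the v4i32 PSRL mask [ 1, zz, 3, zz ] above is matched
/// as a 32-bit VSRLI on v2i64.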
11903static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11904 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11905 int MaskOffset, const APInt &Zeroable,
11906 const X86Subtarget &Subtarget) {
11907 int Size = Mask.size();
11908 unsigned SizeInBits = Size * ScalarSizeInBits;
11909
11910 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11911 for (int i = 0; i < Size; i += Scale)
11912 for (int j = 0; j < Shift; ++j)
11913 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11914 return false;
11915
11916 return true;
11917 };
11918
11919 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11920 for (int i = 0; i != Size; i += Scale) {
11921 unsigned Pos = Left ? i + Shift : i;
11922 unsigned Low = Left ? i : i + Shift;
11923 unsigned Len = Scale - Shift;
11924 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11925 return -1;
11926 }
11927
11928 int ShiftEltBits = ScalarSizeInBits * Scale;
11929 bool ByteShift = ShiftEltBits > 64;
11930 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11931 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11932 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11933
11934 // Normalize the scale for byte shifts to still produce an i64 element
11935 // type.
11936 Scale = ByteShift ? Scale / 2 : Scale;
11937
11938 // We need to round trip through the appropriate type for the shift.
11939 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11940 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11941 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11942 return (int)ShiftAmt;
11943 };
11944
11945 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11946 // keep doubling the size of the integer elements up to that. We can
11947 // then shift the elements of the integer vector by whole multiples of
11948 // their width within the elements of the larger integer vector. Test each
11949 // multiple to see if we can find a match with the moved element indices
11950 // and that the shifted in elements are all zeroable.
11951 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11952 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11953 for (int Shift = 1; Shift != Scale; ++Shift)
11954 for (bool Left : {true, false})
11955 if (CheckZeros(Shift, Scale, Left)) {
11956 int ShiftAmt = MatchShift(Shift, Scale, Left);
11957 if (0 < ShiftAmt)
11958 return ShiftAmt;
11959 }
11960
11961 // no match
11962 return -1;
11963}
11964
11965static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11966 SDValue V2, ArrayRef<int> Mask,
11967 const APInt &Zeroable,
11968 const X86Subtarget &Subtarget,
11969 SelectionDAG &DAG, bool BitwiseOnly) {
11970 int Size = Mask.size();
11971 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11972
11973 MVT ShiftVT;
11974 SDValue V = V1;
11975 unsigned Opcode;
11976
11977 // Try to match shuffle against V1 shift.
11978 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11979 Mask, 0, Zeroable, Subtarget);
11980
11981 // If V1 failed, try to match shuffle against V2 shift.
11982 if (ShiftAmt < 0) {
11983 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11984 Mask, Size, Zeroable, Subtarget);
11985 V = V2;
11986 }
11987
11988 if (ShiftAmt < 0)
11989 return SDValue();
11990
11991 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11992 return SDValue();
11993
11994 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11995 "Illegal integer vector type");
11996 V = DAG.getBitcast(ShiftVT, V);
11997 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11998 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11999 return DAG.getBitcast(VT, V);
12000}
12001
12002// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
12003// Remainder of lower half result is zero and upper half is all undef.
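// For example, a v8i16 mask [ 2, 3, zz, zz, -1, -1, -1, -1 ] extracts elements
// 2..3 of the source into the bottom of the result (BitIdx == 32, BitLen == 32).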
12004static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
12005 ArrayRef<int> Mask, uint64_t &BitLen,
12006 uint64_t &BitIdx, const APInt &Zeroable) {
12007 int Size = Mask.size();
12008 int HalfSize = Size / 2;
12009 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12010 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
12011
12012 // Upper half must be undefined.
12013 if (!isUndefUpperHalf(Mask))
12014 return false;
12015
12016 // Determine the extraction length from the part of the
12017 // lower half that isn't zeroable.
12018 int Len = HalfSize;
12019 for (; Len > 0; --Len)
12020 if (!Zeroable[Len - 1])
12021 break;
12022 assert(Len > 0 && "Zeroable shuffle mask");
12023
12024 // Attempt to match first Len sequential elements from the lower half.
12025 SDValue Src;
12026 int Idx = -1;
12027 for (int i = 0; i != Len; ++i) {
12028 int M = Mask[i];
12029 if (M == SM_SentinelUndef)
12030 continue;
12031 SDValue &V = (M < Size ? V1 : V2);
12032 M = M % Size;
12033
12034 // The extracted elements must start at a valid index and all mask
12035 // elements must be in the lower half.
12036 if (i > M || M >= HalfSize)
12037 return false;
12038
12039 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12040 Src = V;
12041 Idx = M - i;
12042 continue;
12043 }
12044 return false;
12045 }
12046
12047 if (!Src || Idx < 0)
12048 return false;
12049
12050 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12051 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12052 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12053 V1 = Src;
12054 return true;
12055}
12056
12057// INSERTQ: Extract lowest Len elements from lower half of second source and
12058// insert over first source, starting at Idx.
12059// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
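// For example, a v8i16 mask [ 0, 8, 9, 3, -1, -1, -1, -1 ] matches with
// Idx == 1 and Len == 2 (BitIdx == 16, BitLen == 32).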
12060static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12061 ArrayRef<int> Mask, uint64_t &BitLen,
12062 uint64_t &BitIdx) {
12063 int Size = Mask.size();
12064 int HalfSize = Size / 2;
12065 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12066
12067 // Upper half must be undefined.
12068 if (!isUndefUpperHalf(Mask))
12069 return false;
12070
12071 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12072 SDValue Base;
12073
12074 // Attempt to match first source from mask before insertion point.
12075 if (isUndefInRange(Mask, 0, Idx)) {
12076 /* EMPTY */
12077 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12078 Base = V1;
12079 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12080 Base = V2;
12081 } else {
12082 continue;
12083 }
12084
12085 // Extend the extraction length looking to match both the insertion of
12086 // the second source and the remaining elements of the first.
12087 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12088 SDValue Insert;
12089 int Len = Hi - Idx;
12090
12091 // Match insertion.
12092 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12093 Insert = V1;
12094 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12095 Insert = V2;
12096 } else {
12097 continue;
12098 }
12099
12100 // Match the remaining elements of the lower half.
12101 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12102 /* EMPTY */
12103 } else if ((!Base || (Base == V1)) &&
12104 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12105 Base = V1;
12106 } else if ((!Base || (Base == V2)) &&
12107 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12108 Size + Hi)) {
12109 Base = V2;
12110 } else {
12111 continue;
12112 }
12113
12114 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12115 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12116 V1 = Base;
12117 V2 = Insert;
12118 return true;
12119 }
12120 }
12121
12122 return false;
12123}
12124
12125/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12126static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12127 SDValue V2, ArrayRef<int> Mask,
12128 const APInt &Zeroable, SelectionDAG &DAG) {
12129 uint64_t BitLen, BitIdx;
12130 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12131 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12132 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12133 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12134
12135 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12136 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12137 V2 ? V2 : DAG.getUNDEF(VT),
12138 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12139 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12140
12141 return SDValue();
12142}
12143
12144/// Lower a vector shuffle as a zero or any extension.
12145///
12146/// Given a specific number of elements, element bit width, and extension
12147/// stride, produce either a zero or any extension based on the available
12148/// features of the subtarget. The extended elements are consecutive and
12149/// can start at a non-zero element index in the input; to avoid excess
12150/// shuffling, the offset must either be in the bottom lane
12151/// or at the start of a higher lane. All extended elements must be from
12152/// the same lane.
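/// For example, a zero extension with Scale == 4 on a v16i8 input widens bytes
/// 0..3 into the four i32 lanes, typically via PMOVZXBD on SSE4.1 targets or a
/// pair of unpacks against zero otherwise.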
12153static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12154 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12155 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12156 assert(Scale > 1 && "Need a scale to extend.");
12157 int EltBits = VT.getScalarSizeInBits();
12158 int NumElements = VT.getVectorNumElements();
12159 int NumEltsPerLane = 128 / EltBits;
12160 int OffsetLane = Offset / NumEltsPerLane;
12161 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12162 "Only 8, 16, and 32 bit elements can be extended.");
12163 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12164 assert(0 <= Offset && "Extension offset must be positive.");
12165 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12166 "Extension offset must be in the first lane or start an upper lane.");
12167
12168 // Check that an index is in same lane as the base offset.
12169 auto SafeOffset = [&](int Idx) {
12170 return OffsetLane == (Idx / NumEltsPerLane);
12171 };
12172
12173 // Shift along an input so that the offset base moves to the first element.
12174 auto ShuffleOffset = [&](SDValue V) {
12175 if (!Offset)
12176 return V;
12177
12178 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12179 for (int i = 0; i * Scale < NumElements; ++i) {
12180 int SrcIdx = i + Offset;
12181 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12182 }
12183 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12184 };
12185
12186 // Found a valid a/zext mask! Try various lowering strategies based on the
12187 // input type and available ISA extensions.
12188 if (Subtarget.hasSSE41()) {
12189 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12190 // PUNPCK will catch this in a later shuffle match.
12191 if (Offset && Scale == 2 && VT.is128BitVector())
12192 return SDValue();
12193 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12194 NumElements / Scale);
12195 InputV = DAG.getBitcast(VT, InputV);
12196 InputV = ShuffleOffset(InputV);
12197 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
12198 DL, ExtVT, InputV, DAG);
12199 return DAG.getBitcast(VT, InputV);
12200 }
12201
12202 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12203 InputV = DAG.getBitcast(VT, InputV);
12204
12205 // For any extends we can cheat for larger element sizes and use shuffle
12206 // instructions that can fold with a load and/or copy.
12207 if (AnyExt && EltBits == 32) {
12208 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12209 -1};
12210 return DAG.getBitcast(
12211 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12212 DAG.getBitcast(MVT::v4i32, InputV),
12213 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12214 }
12215 if (AnyExt && EltBits == 16 && Scale > 2) {
12216 int PSHUFDMask[4] = {Offset / 2, -1,
12217 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12218 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12219 DAG.getBitcast(MVT::v4i32, InputV),
12220 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12221 int PSHUFWMask[4] = {1, -1, -1, -1};
12222 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12223 return DAG.getBitcast(
12224 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12225 DAG.getBitcast(MVT::v8i16, InputV),
12226 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12227 }
12228
12229 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12230 // to 64-bits.
12231 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12232 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12233 assert(VT.is128BitVector() && "Unexpected vector width!");
12234
12235 int LoIdx = Offset * EltBits;
12236 SDValue Lo = DAG.getBitcast(
12237 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12238 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12239 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12240
12241 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12242 return DAG.getBitcast(VT, Lo);
12243
12244 int HiIdx = (Offset + 1) * EltBits;
12245 SDValue Hi = DAG.getBitcast(
12246 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12247 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12248 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12249 return DAG.getBitcast(VT,
12250 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12251 }
12252
12253 // If this would require more than 2 unpack instructions to expand, use
12254 // pshufb when available. We can only use more than 2 unpack instructions
12255 // when zero extending i8 elements which also makes it easier to use pshufb.
12256 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12257 assert(NumElements == 16 && "Unexpected byte vector width!");
12258 SDValue PSHUFBMask[16];
12259 for (int i = 0; i < 16; ++i) {
12260 int Idx = Offset + (i / Scale);
12261 if ((i % Scale == 0 && SafeOffset(Idx))) {
12262 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12263 continue;
12264 }
12265 PSHUFBMask[i] =
12266 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12267 }
12268 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12269 return DAG.getBitcast(
12270 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12271 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12272 }
12273
12274 // If we are extending from an offset, ensure we start on a boundary that
12275 // we can unpack from.
12276 int AlignToUnpack = Offset % (NumElements / Scale);
12277 if (AlignToUnpack) {
12278 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12279 for (int i = AlignToUnpack; i < NumElements; ++i)
12280 ShMask[i - AlignToUnpack] = i;
12281 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12282 Offset -= AlignToUnpack;
12283 }
12284
12285 // Otherwise emit a sequence of unpacks.
12286 do {
12287 unsigned UnpackLoHi = X86ISD::UNPCKL;
12288 if (Offset >= (NumElements / 2)) {
12289 UnpackLoHi = X86ISD::UNPCKH;
12290 Offset -= (NumElements / 2);
12291 }
12292
12293 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12294 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12295 : getZeroVector(InputVT, Subtarget, DAG, DL);
12296 InputV = DAG.getBitcast(InputVT, InputV);
12297 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12298 Scale /= 2;
12299 EltBits *= 2;
12300 NumElements /= 2;
12301 } while (Scale > 1);
12302 return DAG.getBitcast(VT, InputV);
12303}
12304
12305/// Try to lower a vector shuffle as a zero extension on any microarch.
12306///
12307/// This routine will try to do everything in its power to cleverly lower
12308/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12309/// check for the profitability of this lowering, it tries to aggressively
12310/// match this pattern. It will use all of the micro-architectural details it
12311/// can to emit an efficient lowering. It handles both blends with all-zero
12312/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12313/// masking out later).
12314///
12315/// The reason we have dedicated lowering for zext-style shuffles is that they
12316/// are both incredibly common and often quite performance sensitive.
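/// For example, a v8i16 mask [ 0, zz, 1, zz, 2, zz, 3, zz ] is recognized as a
/// Scale == 2 zero extension of the low half of V1.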
12317static SDValue lowerShuffleAsZeroOrAnyExtend(
12318 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12319 const APInt &Zeroable, const X86Subtarget &Subtarget,
12320 SelectionDAG &DAG) {
12321 int Bits = VT.getSizeInBits();
12322 int NumLanes = Bits / 128;
12323 int NumElements = VT.getVectorNumElements();
12324 int NumEltsPerLane = NumElements / NumLanes;
12325 assert(VT.getScalarSizeInBits() <= 32 &&
12326 "Exceeds 32-bit integer zero extension limit");
12327 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12328
12329 // Define a helper function to check a particular ext-scale and lower to it if
12330 // valid.
12331 auto Lower = [&](int Scale) -> SDValue {
12332 SDValue InputV;
12333 bool AnyExt = true;
12334 int Offset = 0;
12335 int Matches = 0;
12336 for (int i = 0; i < NumElements; ++i) {
12337 int M = Mask[i];
12338 if (M < 0)
12339 continue; // Valid anywhere but doesn't tell us anything.
12340 if (i % Scale != 0) {
12341 // Each of the extended elements needs to be zeroable.
12342 if (!Zeroable[i])
12343 return SDValue();
12344
12345 // We no longer are in the anyext case.
12346 AnyExt = false;
12347 continue;
12348 }
12349
12350 // Each of the base elements needs to be consecutive indices into the
12351 // same input vector.
12352 SDValue V = M < NumElements ? V1 : V2;
12353 M = M % NumElements;
12354 if (!InputV) {
12355 InputV = V;
12356 Offset = M - (i / Scale);
12357 } else if (InputV != V)
12358 return SDValue(); // Flip-flopping inputs.
12359
12360 // Offset must start in the lowest 128-bit lane or at the start of an
12361 // upper lane.
12362 // FIXME: Is it ever worth allowing a negative base offset?
12363 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12364 (Offset % NumEltsPerLane) == 0))
12365 return SDValue();
12366
12367 // If we are offsetting, all referenced entries must come from the same
12368 // lane.
12369 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12370 return SDValue();
12371
12372 if ((M % NumElements) != (Offset + (i / Scale)))
12373 return SDValue(); // Non-consecutive strided elements.
12374 Matches++;
12375 }
12376
12377 // If we fail to find an input, we have a zero-shuffle which should always
12378 // have already been handled.
12379 // FIXME: Maybe handle this here in case during blending we end up with one?
12380 if (!InputV)
12381 return SDValue();
12382
12383 // If we are offsetting, don't extend if we only match a single input, we
12384 // can always do better by using a basic PSHUF or PUNPCK.
12385 if (Offset != 0 && Matches < 2)
12386 return SDValue();
12387
12388 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12389 InputV, Mask, Subtarget, DAG);
12390 };
12391
12392 // The widest scale possible for extending is to a 64-bit integer.
12393 assert(Bits % 64 == 0 &&
12394 "The number of bits in a vector must be divisible by 64 on x86!");
12395 int NumExtElements = Bits / 64;
12396
12397 // Each iteration, try extending the elements half as much, but into twice as
12398 // many elements.
12399 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12400 assert(NumElements % NumExtElements == 0 &&
12401 "The input vector size must be divisible by the extended size.");
12402 if (SDValue V = Lower(NumElements / NumExtElements))
12403 return V;
12404 }
12405
12406 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12407 if (Bits != 128)
12408 return SDValue();
12409
12410 // Returns one of the source operands if the shuffle can be reduced to a
12411 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12412 auto CanZExtLowHalf = [&]() {
12413 for (int i = NumElements / 2; i != NumElements; ++i)
12414 if (!Zeroable[i])
12415 return SDValue();
12416 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12417 return V1;
12418 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12419 return V2;
12420 return SDValue();
12421 };
12422
12423 if (SDValue V = CanZExtLowHalf()) {
12424 V = DAG.getBitcast(MVT::v2i64, V);
12425 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12426 return DAG.getBitcast(VT, V);
12427 }
12428
12429 // No viable ext lowering found.
12430 return SDValue();
12431}
12432
12433/// Try to get a scalar value for a specific element of a vector.
12434///
12435/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12436static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12437 SelectionDAG &DAG) {
12438 MVT VT = V.getSimpleValueType();
12439 MVT EltVT = VT.getVectorElementType();
12440 V = peekThroughBitcasts(V);
12441
12442 // If the bitcasts shift the element size, we can't extract an equivalent
12443 // element from it.
12444 MVT NewVT = V.getSimpleValueType();
12445 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12446 return SDValue();
12447
12448 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12449 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12450 // Ensure the scalar operand is the same size as the destination.
12451 // FIXME: Add support for scalar truncation where possible.
12452 SDValue S = V.getOperand(Idx);
12453 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12454 return DAG.getBitcast(EltVT, S);
12455 }
12456
12457 return SDValue();
12458}
12459
12460/// Helper to test for a load that can be folded with x86 shuffles.
12461///
12462/// This is particularly important because the set of instructions varies
12463/// significantly based on whether the operand is a load or not.
12464static bool isShuffleFoldableLoad(SDValue V) {
12465 return V->hasOneUse() &&
12466 ISD::isNormalLoad(V.getNode());
12467}
12468
12469template<typename T>
12470static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12471 T EltVT = VT.getScalarType();
12472 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12473 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12474}
12475
12476/// Try to lower insertion of a single element into a zero vector.
12477///
12478/// This is a common pattern for which we have especially efficient lowerings
12479/// across all subtarget feature sets.
12480static SDValue lowerShuffleAsElementInsertion(
12481 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12482 const APInt &Zeroable, const X86Subtarget &Subtarget,
12483 SelectionDAG &DAG) {
12484 MVT ExtVT = VT;
12485 MVT EltVT = VT.getVectorElementType();
12486 unsigned NumElts = VT.getVectorNumElements();
12487 unsigned EltBits = VT.getScalarSizeInBits();
12488
12489 if (isSoftF16(EltVT, Subtarget))
12490 return SDValue();
12491
12492 int V2Index =
12493 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12494 Mask.begin();
12495 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12496 bool IsV1Zeroable = true;
12497 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12498 if (i != V2Index && !Zeroable[i]) {
12499 IsV1Zeroable = false;
12500 break;
12501 }
12502
12503 // Bail if a non-zero V1 isn't used in place.
12504 if (!IsV1Zeroable) {
12505 SmallVector<int, 8> V1Mask(Mask);
12506 V1Mask[V2Index] = -1;
12507 if (!isNoopShuffleMask(V1Mask))
12508 return SDValue();
12509 }
12510
12511 // Check for a single input from a SCALAR_TO_VECTOR node.
12512 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12513 // all the smarts here sunk into that routine. However, the current
12514 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12515 // vector shuffle lowering is dead.
12516 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12517 DAG);
12518 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12519 // We need to zext the scalar if it is smaller than an i32.
12520 V2S = DAG.getBitcast(EltVT, V2S);
12521 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12522 // Using zext to expand a narrow element won't work for non-zero
12523 // insertions. But we can use a masked constant vector if we're
12524 // inserting V2 into the bottom of V1.
12525 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12526 return SDValue();
12527
12528 // Zero-extend directly to i32.
12529 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12530 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12531
12532 // If we're inserting into a constant, mask off the inserted index
12533 // and OR with the zero-extended scalar.
12534 if (!IsV1Zeroable) {
12535 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12536 Bits[V2Index] = APInt::getZero(EltBits);
12537 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12538 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12539 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12540 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12541 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12542 }
12543 }
12544 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12545 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12546 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12547 // Either not inserting from the low element of the input or the input
12548 // element size is too small to use VZEXT_MOVL to clear the high bits.
12549 return SDValue();
12550 }
12551
12552 if (!IsV1Zeroable) {
12553 // If V1 can't be treated as a zero vector we have fewer options to lower
12554 // this. We can't support integer vectors or non-zero targets cheaply.
12555 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12556 if (!VT.isFloatingPoint() || V2Index != 0)
12557 return SDValue();
12558 if (!VT.is128BitVector())
12559 return SDValue();
12560
12561 // Otherwise, use MOVSD, MOVSS or MOVSH.
12562 unsigned MovOpc = 0;
12563 if (EltVT == MVT::f16)
12564 MovOpc = X86ISD::MOVSH;
12565 else if (EltVT == MVT::f32)
12566 MovOpc = X86ISD::MOVSS;
12567 else if (EltVT == MVT::f64)
12568 MovOpc = X86ISD::MOVSD;
12569 else
12570 llvm_unreachable("Unsupported floating point element type to handle!");
12571 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12572 }
12573
12574 // This lowering only works for the low element with floating point vectors.
12575 if (VT.isFloatingPoint() && V2Index != 0)
12576 return SDValue();
12577
12578 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12579 if (ExtVT != VT)
12580 V2 = DAG.getBitcast(VT, V2);
12581
12582 if (V2Index != 0) {
12583 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12584 // the desired position. Otherwise it is more efficient to do a vector
12585 // shift left. We know that we can do a vector shift left because all
12586 // the inputs are zero.
12587 if (VT.isFloatingPoint() || NumElts <= 4) {
12588 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12589 V2Shuffle[V2Index] = 0;
12590 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12591 } else {
12592 V2 = DAG.getBitcast(MVT::v16i8, V2);
12593 V2 = DAG.getNode(
12594 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12595 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12596 V2 = DAG.getBitcast(VT, V2);
12597 }
12598 }
12599 return V2;
12600}
12601
12602/// Try to lower broadcast of a single - truncated - integer element,
12603/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12604///
12605/// This assumes we have AVX2.
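/// For example, broadcasting i16 element 1 when \p V0 is a build_vector of i32
/// scalars shifts scalar 0 right by 16 bits, truncates it to i16, and emits a
/// VBROADCAST of the result.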
12606static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12607 int BroadcastIdx,
12608 const X86Subtarget &Subtarget,
12609 SelectionDAG &DAG) {
12610 assert(Subtarget.hasAVX2() &&
12611 "We can only lower integer broadcasts with AVX2!");
12612
12613 MVT EltVT = VT.getVectorElementType();
12614 MVT V0VT = V0.getSimpleValueType();
12615
12616 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12617 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12618
12619 MVT V0EltVT = V0VT.getVectorElementType();
12620 if (!V0EltVT.isInteger())
12621 return SDValue();
12622
12623 const unsigned EltSize = EltVT.getSizeInBits();
12624 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12625
12626 // This is only a truncation if the original element type is larger.
12627 if (V0EltSize <= EltSize)
12628 return SDValue();
12629
12630 assert(((V0EltSize % EltSize) == 0) &&
12631 "Scalar type sizes must all be powers of 2 on x86!");
12632
12633 const unsigned V0Opc = V0.getOpcode();
12634 const unsigned Scale = V0EltSize / EltSize;
12635 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12636
12637 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12638 V0Opc != ISD::BUILD_VECTOR)
12639 return SDValue();
12640
12641 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12642
12643 // If we're extracting non-least-significant bits, shift so we can truncate.
12644 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12645 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12646 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12647 if (const int OffsetIdx = BroadcastIdx % Scale)
12648 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12649 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12650
12651 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12652 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12653}
12654
12655/// Test whether this can be lowered with a single SHUFPS instruction.
12656///
12657/// This is used to disable more specialized lowerings when the shufps lowering
12658/// will happen to be efficient.
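/// For example, [ 0, 2, 4, 6 ] (low half from V1, high half from V2) can be a
/// single SHUFPS, while [ 0, 4, 1, 5 ] cannot.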
12659static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12660 // This routine only handles 128-bit shufps.
12661 assert(Mask.size() == 4 && "Unsupported mask size!");
12662 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12663 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12664 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12665 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12666
12667 // To lower with a single SHUFPS we need to have the low half and high half
12668 // each requiring a single input.
12669 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12670 return false;
12671 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12672 return false;
12673
12674 return true;
12675}
12676
12677/// Test whether the specified input (0 or 1) is in-place blended by the
12678/// given mask.
12679///
12680/// This returns true if the elements from a particular input are already in the
12681/// slot required by the given mask and require no permutation.
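/// For example, with Input == 1 and the mask [ 0, 5, 2, 7 ], the V2 elements
/// (5 and 7) already sit in their final slots, so this returns true.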
12682static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12683 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12684 int Size = Mask.size();
12685 for (int i = 0; i < Size; ++i)
12686 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12687 return false;
12688
12689 return true;
12690}
12691
12692/// If we are extracting two 128-bit halves of a vector and shuffling the
12693/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12694/// multi-shuffle lowering.
12695static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12696 SDValue N1, ArrayRef<int> Mask,
12697 SelectionDAG &DAG) {
12698 MVT VT = N0.getSimpleValueType();
12699 assert((VT.is128BitVector() &&
12700 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12701 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12702
12703 // Check that both sources are extracts of the same source vector.
12704 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12705 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12706 N0.getOperand(0) != N1.getOperand(0) ||
12707 !N0.hasOneUse() || !N1.hasOneUse())
12708 return SDValue();
12709
12710 SDValue WideVec = N0.getOperand(0);
12711 MVT WideVT = WideVec.getSimpleValueType();
12712 if (!WideVT.is256BitVector())
12713 return SDValue();
12714
12715 // Match extracts of each half of the wide source vector. Commute the shuffle
12716 // if the extract of the low half is N1.
12717 unsigned NumElts = VT.getVectorNumElements();
12718 SmallVector<int, 4> NewMask(Mask);
12719 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12720 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12721 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12722 ShuffleVectorSDNode::commuteMask(NewMask);
12723 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12724 return SDValue();
12725
12726 // Final bailout: if the mask is simple, we are better off using an extract
12727 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12728 // because that avoids a constant load from memory.
12729 if (NumElts == 4 &&
12730 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12731 return SDValue();
12732
12733 // Extend the shuffle mask with undef elements.
12734 NewMask.append(NumElts, -1);
12735
12736 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12737 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12738 NewMask);
12739 // This is free: ymm -> xmm.
12740 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12741 DAG.getVectorIdxConstant(0, DL));
12742}
12743
12744/// Try to lower broadcast of a single element.
12745///
12746/// For convenience, this code also bundles all of the subtarget feature set
12747/// filtering. While a little annoying to re-dispatch on type here, there isn't
12748/// a convenient way to factor it out.
12749static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12750 SDValue V2, ArrayRef<int> Mask,
12751 const X86Subtarget &Subtarget,
12752 SelectionDAG &DAG) {
12753 MVT EltVT = VT.getVectorElementType();
12754 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12755 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12756 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12757 return SDValue();
12758
12759 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12760 // we can only broadcast from a register with AVX2.
12761 unsigned NumEltBits = VT.getScalarSizeInBits();
12762 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12763 ? X86ISD::MOVDDUP
12764 : X86ISD::VBROADCAST;
12765 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12766
12767 // Check that the mask is a broadcast.
12768 int BroadcastIdx = getSplatIndex(Mask);
12769 if (BroadcastIdx < 0)
12770 return SDValue();
12771 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12772 "a sorted mask where the broadcast "
12773 "comes from V1.");
12774 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
12775
12776 // Go up the chain of (vector) values to find a scalar load that we can
12777 // combine with the broadcast.
12778 // TODO: Combine this logic with findEltLoadSrc() used by
12779 // EltsFromConsecutiveLoads().
12780 int BitOffset = BroadcastIdx * NumEltBits;
12781 SDValue V = V1;
12782 for (;;) {
12783 switch (V.getOpcode()) {
12784 case ISD::BITCAST: {
12785 V = V.getOperand(0);
12786 continue;
12787 }
12788 case ISD::CONCAT_VECTORS: {
12789 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12790 int OpIdx = BitOffset / OpBitWidth;
12791 V = V.getOperand(OpIdx);
12792 BitOffset %= OpBitWidth;
12793 continue;
12794 }
12795 case ISD::EXTRACT_SUBVECTOR: {
12796 // The extraction index adds to the existing offset.
12797 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12798 unsigned Idx = V.getConstantOperandVal(1);
12799 unsigned BeginOffset = Idx * EltBitWidth;
12800 BitOffset += BeginOffset;
12801 V = V.getOperand(0);
12802 continue;
12803 }
12804 case ISD::INSERT_SUBVECTOR: {
12805 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12806 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12807 int Idx = (int)V.getConstantOperandVal(2);
12808 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12809 int BeginOffset = Idx * EltBitWidth;
12810 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12811 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12812 BitOffset -= BeginOffset;
12813 V = VInner;
12814 } else {
12815 V = VOuter;
12816 }
12817 continue;
12818 }
12819 }
12820 break;
12821 }
12822 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12823 BroadcastIdx = BitOffset / NumEltBits;
12824
12825 // Do we need to bitcast the source to retrieve the original broadcast index?
12826 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12827
12828 // Check if this is a broadcast of a scalar. We special case lowering
12829 // for scalars so that we can more effectively fold with loads.
12830 // If the original value has a larger element type than the shuffle, the
12831 // broadcast element is in essence truncated. Make that explicit to ease
12832 // folding.
12833 if (BitCastSrc && VT.isInteger())
12834 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12835 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12836 return TruncBroadcast;
12837
12838 // Also check the simpler case, where we can directly reuse the scalar.
12839 if (!BitCastSrc &&
12840 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12841 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12842 V = V.getOperand(BroadcastIdx);
12843
12844 // If we can't broadcast from a register, check that the input is a load.
12845 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12846 return SDValue();
12847 } else if (ISD::isNormalLoad(V.getNode()) &&
12848 cast<LoadSDNode>(V)->isSimple()) {
12849 // We do not check for one-use of the vector load because a broadcast load
12850 // is expected to be a win for code size, register pressure, and possibly
12851 // uops even if the original vector load is not eliminated.
12852
12853 // Reduce the vector load and shuffle to a broadcasted scalar load.
12854 LoadSDNode *Ld = cast<LoadSDNode>(V);
12855 SDValue BaseAddr = Ld->getOperand(1);
12856 MVT SVT = VT.getScalarType();
12857 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12858 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12859 SDValue NewAddr =
12860 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12861
12862 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12863 // than MOVDDUP.
12864 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12865 if (Opcode == X86ISD::VBROADCAST) {
12866 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12867 SDValue Ops[] = {Ld->getChain(), NewAddr};
12868 V = DAG.getMemIntrinsicNode(
12869 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12870 DAG.getMachineFunction().getMachineMemOperand(
12871 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12872 DAG.makeEquivalentMemoryOrdering(Ld, V);
12873 return DAG.getBitcast(VT, V);
12874 }
12875 assert(SVT == MVT::f64 && "Unexpected VT!");
12876 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12877 DAG.getMachineFunction().getMachineMemOperand(
12878 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12879 DAG.makeEquivalentMemoryOrdering(Ld, V);
12880 } else if (!BroadcastFromReg) {
12881 // We can't broadcast from a vector register.
12882 return SDValue();
12883 } else if (BitOffset != 0) {
12884 // We can only broadcast from the zero-element of a vector register,
12885 // but it can be advantageous to broadcast from the zero-element of a
12886 // subvector.
12887 if (!VT.is256BitVector() && !VT.is512BitVector())
12888 return SDValue();
12889
12890 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12891 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12892 return SDValue();
12893
12894 // If we are broadcasting an element from the lowest 128-bit subvector, try
12895 // to move the element in position.
12896 if (BitOffset < 128 && NumActiveElts > 1 &&
12897 V.getScalarValueSizeInBits() == NumEltBits) {
12898 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12899 "Unexpected bit-offset");
12900 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12901 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12902 V = extractSubVector(V, 0, DAG, DL, 128);
12903 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12904 } else {
12905 // Only broadcast the zero-element of a 128-bit subvector.
12906 if ((BitOffset % 128) != 0)
12907 return SDValue();
12908
12909 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12910 "Unexpected bit-offset");
12911 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12912 "Unexpected vector size");
12913 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12914 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12915 }
12916 }
12917
12918 // On AVX we can use VBROADCAST directly for scalar sources.
12919 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12920 V = DAG.getBitcast(MVT::f64, V);
12921 if (Subtarget.hasAVX()) {
12922 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12923 return DAG.getBitcast(VT, V);
12924 }
12925 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12926 }
12927
12928 // If this is a scalar, do the broadcast on this type and bitcast.
12929 if (!V.getValueType().isVector()) {
12930 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12931 "Unexpected scalar size");
12932 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12933 VT.getVectorNumElements());
12934 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12935 }
12936
12937 // We only support broadcasting from 128-bit vectors to minimize the
12938 // number of patterns we need to deal with in isel. So extract down to
12939 // 128-bits, removing as many bitcasts as possible.
12940 if (V.getValueSizeInBits() > 128)
12941 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
12942 // Otherwise cast V to a vector with the same element type as VT, but
12943 // Otherwise cast V to a vector with the same element type as VT, but
12944 // possibly narrower than VT. Then perform the broadcast.
12945 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12946 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12947 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12948}
12949
12950// Check for whether we can use INSERTPS to perform the shuffle. We only use
12951// INSERTPS when the V1 elements are already in the correct locations
12952// because otherwise we can just always use two SHUFPS instructions which
12953// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12954// perform INSERTPS if a single V1 element is out of place and all V2
12955// elements are zeroable.
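// For example, a v4f32 mask [ 4, 1, 2, zz ] matches with V2's element 0
// inserted into lane 0 and lane 3 zeroed, giving an INSERTPS immediate of 0x08.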
12956static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12957 unsigned &InsertPSMask,
12958 const APInt &Zeroable,
12959 ArrayRef<int> Mask, SelectionDAG &DAG) {
12960 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12961 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12962 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12963
12964 // Attempt to match INSERTPS with one element from VA or VB being
12965 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12966 // are updated.
12967 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12968 ArrayRef<int> CandidateMask) {
12969 unsigned ZMask = 0;
12970 int VADstIndex = -1;
12971 int VBDstIndex = -1;
12972 bool VAUsedInPlace = false;
12973
12974 for (int i = 0; i < 4; ++i) {
12975 // Synthesize a zero mask from the zeroable elements (includes undefs).
12976 if (Zeroable[i]) {
12977 ZMask |= 1 << i;
12978 continue;
12979 }
12980
12981 // Flag if we use any VA inputs in place.
12982 if (i == CandidateMask[i]) {
12983 VAUsedInPlace = true;
12984 continue;
12985 }
12986
12987 // We can only insert a single non-zeroable element.
12988 if (VADstIndex >= 0 || VBDstIndex >= 0)
12989 return false;
12990
12991 if (CandidateMask[i] < 4) {
12992 // VA input out of place for insertion.
12993 VADstIndex = i;
12994 } else {
12995 // VB input for insertion.
12996 VBDstIndex = i;
12997 }
12998 }
12999
13000 // Don't bother if we have no (non-zeroable) element for insertion.
13001 if (VADstIndex < 0 && VBDstIndex < 0)
13002 return false;
13003
13004 // Determine element insertion src/dst indices. The src index is from the
13005 // start of the inserted vector, not the start of the concatenated vector.
13006 unsigned VBSrcIndex = 0;
13007 if (VADstIndex >= 0) {
13008 // If we have a VA input out of place, we use VA as the V2 element
13009 // insertion and don't use the original V2 at all.
13010 VBSrcIndex = CandidateMask[VADstIndex];
13011 VBDstIndex = VADstIndex;
13012 VB = VA;
13013 } else {
13014 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13015 }
13016
13017 // If no V1 inputs are used in place, then the result is created only from
13018 // the zero mask and the V2 insertion - so remove V1 dependency.
13019 if (!VAUsedInPlace)
13020 VA = DAG.getUNDEF(MVT::v4f32);
13021
13022 // Update V1, V2 and InsertPSMask accordingly.
13023 V1 = VA;
13024 V2 = VB;
13025
13026 // Insert the V2 element into the desired position.
13027 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13028 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13029 return true;
13030 };
13031
13032 if (matchAsInsertPS(V1, V2, Mask))
13033 return true;
13034
13035 // Commute and try again.
13036 SmallVector<int, 4> CommutedMask(Mask);
13037 ShuffleVectorSDNode::commuteMask(CommutedMask);
13038 if (matchAsInsertPS(V2, V1, CommutedMask))
13039 return true;
13040
13041 return false;
13042}
13043
13044static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13045 ArrayRef<int> Mask, const APInt &Zeroable,
13046 SelectionDAG &DAG) {
13047 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13048 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13049
13050 // Attempt to match the insertps pattern.
13051 unsigned InsertPSMask = 0;
13052 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13053 return SDValue();
13054
13055 // Insert the V2 element into the desired position.
13056 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13057 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13058}
13059
13060/// Handle lowering of 2-lane 64-bit floating point shuffles.
13061///
13062/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13063/// support for floating point shuffles but not integer shuffles. These
13064/// instructions will incur a domain crossing penalty on some chips though so
13065/// it is better to avoid lowering through this for integer vectors where
13066/// possible.
13067static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13068 const APInt &Zeroable, SDValue V1, SDValue V2,
13069 const X86Subtarget &Subtarget,
13070 SelectionDAG &DAG) {
13071 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13072 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13073 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13074
13075 if (V2.isUndef()) {
13076 // Check for being able to broadcast a single element.
13077 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13078 Mask, Subtarget, DAG))
13079 return Broadcast;
13080
13081 // Straight shuffle of a single input vector. Simulate this by using the
13082 // single input as both of the "inputs" to this instruction.
13083 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13084
13085 if (Subtarget.hasAVX()) {
13086 // If we have AVX, we can use VPERMILPD which will allow folding a load
13087 // into the shuffle.
13088 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13089 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13090 }
13091
13092 return DAG.getNode(
13093 X86ISD::SHUFP, DL, MVT::v2f64,
13094 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13095 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13096 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13097 }
13098 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13099 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13100 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13101 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13102
13103 if (Subtarget.hasAVX2())
13104 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13105 return Extract;
13106
13107 // When loading a scalar and then shuffling it into a vector we can often do
13108 // the insertion cheaply.
13109 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13110 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13111 return Insertion;
13112 // Try inverting the insertion since for v2 masks it is easy to do and we
13113 // can't reliably sort the mask one way or the other.
13114 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13115 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13116 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13117 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13118 return Insertion;
13119
13120 // Try to use one of the special instruction patterns to handle two common
13121 // blend patterns if a zero-blend above didn't work.
13122 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13123 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13124 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13125 // We can either use a special instruction to load over the low double or
13126 // to move just the low double.
13127 return DAG.getNode(
13128 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13129 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13130
13131 if (Subtarget.hasSSE41())
13132 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13133 Zeroable, Subtarget, DAG))
13134 return Blend;
13135
13136 // Use dedicated unpack instructions for masks that match their pattern.
13137 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13138 return V;
13139
13140 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
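// E.g. the canonical two-input mask <1, 3> (high element of V1, high element
// of V2) encodes as SHUFPD immediate 0b11.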
13141 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13142 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13143}
13144
13145/// Handle lowering of 2-lane 64-bit integer shuffles.
13146///
13147/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13148/// the integer unit to minimize domain crossing penalties. However, for blends
13149/// it falls back to the floating point shuffle operation with appropriate bit
13150/// casting.
13151static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13152 const APInt &Zeroable, SDValue V1, SDValue V2,
13153 const X86Subtarget &Subtarget,
13154 SelectionDAG &DAG) {
13155 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13156 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13157 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13158
13159 if (V2.isUndef()) {
13160 // Check for being able to broadcast a single element.
13161 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13162 Mask, Subtarget, DAG))
13163 return Broadcast;
13164
13165 // Straight shuffle of a single input vector. For everything from SSE2
13166 // onward this has a single fast instruction with no scary immediates.
13167 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13168 V1 = DAG.getBitcast(MVT::v4i32, V1);
13169 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13170 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13171 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13172 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
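// E.g. the v2i64 mask <1, 0> widens to the v4i32 mask <2, 3, 0, 1> so each
// 64-bit element moves as an adjacent pair of 32-bit elements.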
13173 return DAG.getBitcast(
13174 MVT::v2i64,
13175 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13176 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13177 }
13178 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13179 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13180 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13181 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13182
13183 if (Subtarget.hasAVX2())
13184 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13185 return Extract;
13186
13187 // Try to use shift instructions.
13188 if (SDValue Shift =
13189 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13190 DAG, /*BitwiseOnly*/ false))
13191 return Shift;
13192
13193 // When loading a scalar and then shuffling it into a vector we can often do
13194 // the insertion cheaply.
13195 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13196 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13197 return Insertion;
13198 // Try inverting the insertion since for v2 masks it is easy to do and we
13199 // can't reliably sort the mask one way or the other.
13200 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13201 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13202 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13203 return Insertion;
13204
13205 // We have different paths for blend lowering, but they all must use the
13206 // *exact* same predicate.
13207 bool IsBlendSupported = Subtarget.hasSSE41();
13208 if (IsBlendSupported)
13209 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13210 Zeroable, Subtarget, DAG))
13211 return Blend;
13212
13213 // Use dedicated unpack instructions for masks that match their pattern.
13214 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13215 return V;
13216
13217 // Try to use byte rotation instructions.
13218 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13219 if (Subtarget.hasSSSE3()) {
13220 if (Subtarget.hasVLX())
13221 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13222 Zeroable, Subtarget, DAG))
13223 return Rotate;
13224
13225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13226 Subtarget, DAG))
13227 return Rotate;
13228 }
13229
13230 // If we have direct support for blends, we should lower by decomposing into
13231 // a permute. That will be faster than the domain cross.
13232 if (IsBlendSupported)
13233 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13234 Zeroable, Subtarget, DAG);
13235
13236 // We implement this with SHUFPD which is pretty lame because it will likely
13237 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13238 // However, all the alternatives are still more cycles and newer chips don't
13239 // have this problem. It would be really nice if x86 had better shuffles here.
13240 V1 = DAG.getBitcast(MVT::v2f64, V1);
13241 V2 = DAG.getBitcast(MVT::v2f64, V2);
13242 return DAG.getBitcast(MVT::v2i64,
13243 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13244}
13245
13246/// Lower a vector shuffle using the SHUFPS instruction.
13247///
13248/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13249 /// It makes no assumptions about whether this is the *best* lowering; it simply
13250/// uses it.
13251static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13252 ArrayRef<int> Mask, SDValue V1,
13253 SDValue V2, SelectionDAG &DAG) {
13254 SDValue LowV = V1, HighV = V2;
13255 SmallVector<int, 4> NewMask(Mask);
13256 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13257
13258 if (NumV2Elements == 1) {
13259 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13260
13261 // Compute the index adjacent to V2Index and in the same half by toggling
13262 // the low bit.
13263 int V2AdjIndex = V2Index ^ 1;
13264
13265 if (Mask[V2AdjIndex] < 0) {
13266 // Handles all the cases where we have a single V2 element and an undef.
13267 // This will only ever happen in the high lanes because we commute the
13268 // vector otherwise.
13269 if (V2Index < 2)
13270 std::swap(LowV, HighV);
13271 NewMask[V2Index] -= 4;
13272 } else {
13273 // Handle the case where the V2 element ends up adjacent to a V1 element.
13274 // To make this work, blend them together as the first step.
13275 int V1Index = V2AdjIndex;
13276 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13277 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13278 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13279
13280 // Now proceed to reconstruct the final blend as we have the necessary
13281 // high or low half formed.
13282 if (V2Index < 2) {
13283 LowV = V2;
13284 HighV = V1;
13285 } else {
13286 HighV = V2;
13287 }
13288 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13289 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
13290 }
13291 } else if (NumV2Elements == 2) {
13292 if (Mask[0] < 4 && Mask[1] < 4) {
13293 // Handle the easy case where we have V1 in the low lanes and V2 in the
13294 // high lanes.
13295 NewMask[2] -= 4;
13296 NewMask[3] -= 4;
13297 } else if (Mask[2] < 4 && Mask[3] < 4) {
13298 // We also handle the reversed case because this utility may get called
13299 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13300 // arrange things in the right direction.
13301 NewMask[0] -= 4;
13302 NewMask[1] -= 4;
13303 HighV = V1;
13304 LowV = V2;
13305 } else {
13306 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13307 // trying to place elements directly, just blend them and set up the final
13308 // shuffle to place them.
13309
13310 // The first two blend mask elements are for V1, the second two are for
13311 // V2.
13312 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13313 Mask[2] < 4 ? Mask[2] : Mask[3],
13314 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13315 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13316 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13317 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13318
13319 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13320 // a blend.
13321 LowV = HighV = V1;
13322 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13323 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13324 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13325 NewMask[3] = Mask[2] < 4 ? 3 : 1;
13326 }
13327 } else if (NumV2Elements == 3) {
13328 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13329 // we can get here due to other paths (e.g. repeated mask matching) that we
13330 // don't want to do another round of lowerVECTOR_SHUFFLE.
13331 ShuffleVectorSDNode::commuteMask(NewMask);
13332 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13333 }
13334 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13335 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13336}
13337
13338/// Lower 4-lane 32-bit floating point shuffles.
13339///
13340/// Uses instructions exclusively from the floating point unit to minimize
13341/// domain crossing penalties, as these are sufficient to implement all v4f32
13342/// shuffles.
13343static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13344 const APInt &Zeroable, SDValue V1, SDValue V2,
13345 const X86Subtarget &Subtarget,
13346 SelectionDAG &DAG) {
13347 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13348 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13349 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13350
13351 if (Subtarget.hasSSE41())
13352 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13353 Zeroable, Subtarget, DAG))
13354 return Blend;
13355
13356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13357
13358 if (NumV2Elements == 0) {
13359 // Check for being able to broadcast a single element.
13360 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13361 Mask, Subtarget, DAG))
13362 return Broadcast;
13363
13364 // Use even/odd duplicate instructions for masks that match their pattern.
13365 if (Subtarget.hasSSE3()) {
13366 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13367 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13368 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13369 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13370 }
13371
13372 if (Subtarget.hasAVX()) {
13373 // If we have AVX, we can use VPERMILPS which will allow folding a load
13374 // into the shuffle.
13375 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13376 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13377 }
13378
13379 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13380 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13381 if (!Subtarget.hasSSE2()) {
13382 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13383 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13384 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13385 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13386 }
13387
13388 // Otherwise, use a straight shuffle of a single input vector. We pass the
13389 // input vector to both operands to simulate this with a SHUFPS.
13390 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13391 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13392 }
13393
13394 if (Subtarget.hasSSE2())
13395 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13396 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13397 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13398 return ZExt;
13399 }
13400
13401 if (Subtarget.hasAVX2())
13402 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13403 return Extract;
13404
13405 // There are special ways we can lower some single-element blends. However, we
13406 // have custom ways we can lower more complex single-element blends below that
13407 // we defer to if both this and BLENDPS fail to match, so restrict this to
13408 // when the V2 input is targeting element 0 of the mask -- that is the fast
13409 // case here.
13410 if (NumV2Elements == 1 && Mask[0] >= 4)
13411 if (SDValue V = lowerShuffleAsElementInsertion(
13412 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13413 return V;
13414
13415 if (Subtarget.hasSSE41()) {
13416 // Use INSERTPS if we can complete the shuffle efficiently.
13417 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13418 return V;
13419
13420 if (!isSingleSHUFPSMask(Mask))
13421 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13422 V2, Mask, DAG))
13423 return BlendPerm;
13424 }
13425
13426 // Use low/high mov instructions. These are only valid in SSE1 because
13427 // otherwise they are widened to v2f64 and never get here.
13428 if (!Subtarget.hasSSE2()) {
13429 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13430 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13431 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13432 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13433 }
13434
13435 // Use dedicated unpack instructions for masks that match their pattern.
13436 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13437 return V;
13438
13439 // Otherwise fall back to a SHUFPS lowering strategy.
13440 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13441}
13442
13443/// Lower 4-lane i32 vector shuffles.
13444///
13445/// We try to handle these with integer-domain shuffles where we can, but for
13446/// blends we use the floating point domain blend instructions.
13447static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13448 const APInt &Zeroable, SDValue V1, SDValue V2,
13449 const X86Subtarget &Subtarget,
13450 SelectionDAG &DAG) {
13451 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13452 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13453 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13454
13455 // Whenever we can lower this as a zext, that instruction is strictly faster
13456 // than any alternative. It also allows us to fold memory operands into the
13457 // shuffle in many cases.
13458 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13459 Zeroable, Subtarget, DAG))
13460 return ZExt;
13461
13462 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13463
13464 // Try to use shift instructions if fast.
13465 if (Subtarget.preferLowerShuffleAsShift()) {
13466 if (SDValue Shift =
13467 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13468 Subtarget, DAG, /*BitwiseOnly*/ true))
13469 return Shift;
13470 if (NumV2Elements == 0)
13471 if (SDValue Rotate =
13472 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13473 return Rotate;
13474 }
13475
13476 if (NumV2Elements == 0) {
13477 // Try to use broadcast unless the mask only has one non-undef element.
13478 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13479 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13480 Mask, Subtarget, DAG))
13481 return Broadcast;
13482 }
13483
13484 // Straight shuffle of a single input vector. For everything from SSE2
13485 // onward this has a single fast instruction with no scary immediates.
13486 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13487 // but we aren't actually going to use the UNPCK instruction because doing
13488 // so prevents folding a load into this instruction or making a copy.
13489 const int UnpackLoMask[] = {0, 0, 1, 1};
13490 const int UnpackHiMask[] = {2, 2, 3, 3};
13491 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13492 Mask = UnpackLoMask;
13493 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13494 Mask = UnpackHiMask;
13495
13496 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13497 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13498 }
13499
13500 if (Subtarget.hasAVX2())
13501 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13502 return Extract;
13503
13504 // Try to use shift instructions.
13505 if (SDValue Shift =
13506 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13507 DAG, /*BitwiseOnly*/ false))
13508 return Shift;
13509
13510 // There are special ways we can lower some single-element blends.
13511 if (NumV2Elements == 1)
13512 if (SDValue V = lowerShuffleAsElementInsertion(
13513 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13514 return V;
13515
13516 // We have different paths for blend lowering, but they all must use the
13517 // *exact* same predicate.
13518 bool IsBlendSupported = Subtarget.hasSSE41();
13519 if (IsBlendSupported)
13520 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13521 Zeroable, Subtarget, DAG))
13522 return Blend;
13523
13524 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13526 return Masked;
13527
13528 // Use dedicated unpack instructions for masks that match their pattern.
13529 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13530 return V;
13531
13532 // Try to use byte rotation instructions.
13533 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13534 if (Subtarget.hasSSSE3()) {
13535 if (Subtarget.hasVLX())
13536 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13537 Zeroable, Subtarget, DAG))
13538 return Rotate;
13539
13540 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13541 Subtarget, DAG))
13542 return Rotate;
13543 }
13544
13545 // Assume that a single SHUFPS is faster than an alternative sequence of
13546 // multiple instructions (even if the CPU has a domain penalty).
13547 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13548 if (!isSingleSHUFPSMask(Mask)) {
13549 // If we have direct support for blends, we should lower by decomposing into
13550 // a permute. That will be faster than the domain cross.
13551 if (IsBlendSupported)
13552 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13553 Zeroable, Subtarget, DAG);
13554
13555 // Try to lower by permuting the inputs into an unpack instruction.
13556 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13557 Mask, Subtarget, DAG))
13558 return Unpack;
13559 }
13560
13561 // We implement this with SHUFPS because it can blend from two vectors.
13562 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13563 // up the inputs, bypassing domain shift penalties that we would incur if we
13564 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13565 // relevant.
13566 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13567 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13568 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13569 return DAG.getBitcast(MVT::v4i32, ShufPS);
13570}
13571
13572/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13573/// shuffle lowering, and the most complex part.
13574///
13575/// The lowering strategy is to try to form pairs of input lanes which are
13576/// targeted at the same half of the final vector, and then use a dword shuffle
13577/// to place them onto the right half, and finally unpack the paired lanes into
13578/// their final position.
13579///
13580/// The exact breakdown of how to form these dword pairs and align them on the
13581/// correct sides is really tricky. See the comments within the function for
13582/// more of the details.
13583///
13584/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13585/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13586/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13587/// vector, form the analogous 128-bit 8-element Mask.
13588static SDValue lowerV8I16GeneralSingleInputShuffle(
13589 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13590 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13591 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13592 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13593
13594 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13595 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13596 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13597
13598 // Attempt to directly match PSHUFLW or PSHUFHW.
13599 if (isUndefOrInRange(LoMask, 0, 4) &&
13600 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13601 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13602 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13603 }
13604 if (isUndefOrInRange(HiMask, 4, 8) &&
13605 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
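// PSHUFHW's immediate indexes within the high half, so shift the 4..7 mask
// values down to 0..3 before forming the immediate.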
13606 for (int i = 0; i != 4; ++i)
13607 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13608 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13609 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13610 }
13611
13612 SmallVector<int, 4> LoInputs;
13613 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13614 array_pod_sort(LoInputs.begin(), LoInputs.end());
13615 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13616 SmallVector<int, 4> HiInputs;
13617 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13618 array_pod_sort(HiInputs.begin(), HiInputs.end());
13619 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13620 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13621 int NumHToL = LoInputs.size() - NumLToL;
13622 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13623 int NumHToH = HiInputs.size() - NumLToH;
13624 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13625 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13626 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13627 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13628
13629 // If we are shuffling values from one half - check how many different DWORD
13630 // pairs we need to create. If only 1 or 2 then we can perform this as a
13631 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
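// For example, the single-input mask <0, 1, 0, 1, 2, 3, 2, 3> only uses the
// word pairs (0,1) and (2,3), so it lowers to an identity PSHUFLW followed by
// a PSHUFD with mask <0, 0, 1, 1>.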
13632 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13633 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13634 V = DAG.getNode(ShufWOp, DL, VT, V,
13635 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13636 V = DAG.getBitcast(PSHUFDVT, V);
13637 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13638 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13639 return DAG.getBitcast(VT, V);
13640 };
13641
13642 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13643 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13644 SmallVector<std::pair<int, int>, 4> DWordPairs;
13645 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13646
13647 // Collect the different DWORD pairs.
13648 for (int DWord = 0; DWord != 4; ++DWord) {
13649 int M0 = Mask[2 * DWord + 0];
13650 int M1 = Mask[2 * DWord + 1];
13651 M0 = (M0 >= 0 ? M0 % 4 : M0);
13652 M1 = (M1 >= 0 ? M1 % 4 : M1);
13653 if (M0 < 0 && M1 < 0)
13654 continue;
13655
13656 bool Match = false;
13657 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13658 auto &DWordPair = DWordPairs[j];
13659 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13660 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13661 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13662 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13663 PSHUFDMask[DWord] = DOffset + j;
13664 Match = true;
13665 break;
13666 }
13667 }
13668 if (!Match) {
13669 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13670 DWordPairs.push_back(std::make_pair(M0, M1));
13671 }
13672 }
13673
13674 if (DWordPairs.size() <= 2) {
13675 DWordPairs.resize(2, std::make_pair(-1, -1));
13676 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13677 DWordPairs[1].first, DWordPairs[1].second};
13678 if ((NumHToL + NumHToH) == 0)
13679 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13680 if ((NumLToL + NumLToH) == 0)
13681 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13682 }
13683 }
13684
13685 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13686 // such inputs we can swap two of the dwords across the half mark and end up
13687 // with <= 2 inputs to each half from each half. Once there, we can fall through
13688 // to the generic code below. For example:
13689 //
13690 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13691 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13692 //
13693 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13694 // and an existing 2-into-2 on the other half. In this case we may have to
13695 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13696 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13697 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13698 // because any other situation (including a 3-into-1 or 1-into-3 in the other
13699 // half than the one we target for fixing) will be fixed when we re-enter this
13700 // path. We will also combine away any sequence of PSHUFD instructions that
13701 // result into a single instruction. Here is an example of the tricky case:
13702 //
13703 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13704 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13705 //
13706 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13707 //
13708 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13709 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13710 //
13711 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13712 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13713 //
13714 // The result is fine to be handled by the generic logic.
13715 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13716 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13717 int AOffset, int BOffset) {
13718 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13719 "Must call this with A having 3 or 1 inputs from the A half.");
13720 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13721 "Must call this with B having 1 or 3 inputs from the B half.");
13722 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13723 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13724
13725 bool ThreeAInputs = AToAInputs.size() == 3;
13726
13727 // Compute the index of dword with only one word among the three inputs in
13728 // a half by taking the sum of the half with three inputs and subtracting
13729 // the sum of the actual three inputs. The difference is the remaining
13730 // slot.
13731 int ADWord = 0, BDWord = 0;
13732 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13733 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13734 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13735 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13736 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13737 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13738 int TripleNonInputIdx =
13739 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13740 TripleDWord = TripleNonInputIdx / 2;
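// E.g. if the three same-half inputs are {0, 1, 3} with AOffset 0, then
// TripleInputSum is 6, TripleNonInputIdx is 2, and the spare slot of the
// triple half lives in dword 1.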
13741
13742 // We use xor with one to compute the adjacent DWord to whichever one the
13743 // OneInput is in.
13744 OneInputDWord = (OneInput / 2) ^ 1;
13745
13746 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13747 // and BToA inputs. If there is also such a problem with the BToB and AToB
13748 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13749 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13750 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13751 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13752 // Compute how many inputs will be flipped by swapping these DWords. We
13753 // need to balance this to ensure we don't form a 3-1 shuffle in the
13754 // other half.
13756 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13757 llvm::count(AToBInputs, 2 * ADWord + 1);
13758 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13759 llvm::count(BToBInputs, 2 * BDWord + 1);
13760 if ((NumFlippedAToBInputs == 1 &&
13761 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13762 (NumFlippedBToBInputs == 1 &&
13763 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13764 // We choose whether to fix the A half or B half based on whether that
13765 // half has zero flipped inputs. At zero, we may not be able to fix it
13766 // with that half. We also bias towards fixing the B half because that
13767 // will more commonly be the high half, and we have to bias one way.
13768 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13769 ArrayRef<int> Inputs) {
13770 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13771 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13772 // Determine whether the free index is in the flipped dword or the
13773 // unflipped dword based on where the pinned index is. We use this bit
13774 // in an xor to conditionally select the adjacent dword.
13775 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13776 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13777 if (IsFixIdxInput == IsFixFreeIdxInput)
13778 FixFreeIdx += 1;
13779 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13780 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13781 "We need to be changing the number of flipped inputs!");
13782 int PSHUFHalfMask[] = {0, 1, 2, 3};
13783 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13784 V = DAG.getNode(
13785 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13786 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13787 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13788
13789 for (int &M : Mask)
13790 if (M >= 0 && M == FixIdx)
13791 M = FixFreeIdx;
13792 else if (M >= 0 && M == FixFreeIdx)
13793 M = FixIdx;
13794 };
13795 if (NumFlippedBToBInputs != 0) {
13796 int BPinnedIdx =
13797 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13798 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13799 } else {
13800 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13801 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13802 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13803 }
13804 }
13805 }
13806
13807 int PSHUFDMask[] = {0, 1, 2, 3};
13808 PSHUFDMask[ADWord] = BDWord;
13809 PSHUFDMask[BDWord] = ADWord;
13810 V = DAG.getBitcast(
13811 VT,
13812 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13813 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13814
13815 // Adjust the mask to match the new locations of A and B.
13816 for (int &M : Mask)
13817 if (M >= 0 && M/2 == ADWord)
13818 M = 2 * BDWord + M % 2;
13819 else if (M >= 0 && M/2 == BDWord)
13820 M = 2 * ADWord + M % 2;
13821
13822 // Recurse back into this routine to re-compute state now that this isn't
13823 // a 3 and 1 problem.
13824 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13825 };
13826 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13827 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13828 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13829 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13830
13831 // At this point there are at most two inputs to the low and high halves from
13832 // each half. That means the inputs can always be grouped into dwords and
13833 // those dwords can then be moved to the correct half with a dword shuffle.
13834 // We use at most one low and one high word shuffle to collect these paired
13835 // inputs into dwords, and finally a dword shuffle to place them.
13836 int PSHUFLMask[4] = {-1, -1, -1, -1};
13837 int PSHUFHMask[4] = {-1, -1, -1, -1};
13838 int PSHUFDMask[4] = {-1, -1, -1, -1};
13839
13840 // First fix the masks for all the inputs that are staying in their
13841 // original halves. This will then dictate the targets of the cross-half
13842 // shuffles.
13843 auto fixInPlaceInputs =
13844 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13845 MutableArrayRef<int> SourceHalfMask,
13846 MutableArrayRef<int> HalfMask, int HalfOffset) {
13847 if (InPlaceInputs.empty())
13848 return;
13849 if (InPlaceInputs.size() == 1) {
13850 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13851 InPlaceInputs[0] - HalfOffset;
13852 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13853 return;
13854 }
13855 if (IncomingInputs.empty()) {
13856 // Just fix all of the in place inputs.
13857 for (int Input : InPlaceInputs) {
13858 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13859 PSHUFDMask[Input / 2] = Input / 2;
13860 }
13861 return;
13862 }
13863
13864 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13865 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13866 InPlaceInputs[0] - HalfOffset;
13867 // Put the second input next to the first so that they are packed into
13868 // a dword. We find the adjacent index by toggling the low bit.
13869 int AdjIndex = InPlaceInputs[0] ^ 1;
13870 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13871 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13872 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13873 };
13874 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13875 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13876
13877 // Now gather the cross-half inputs and place them into a free dword of
13878 // their target half.
13879 // FIXME: This operation could almost certainly be simplified dramatically to
13880 // look more like the 3-1 fixing operation.
13881 auto moveInputsToRightHalf = [&PSHUFDMask](
13882 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13883 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13884 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13885 int DestOffset) {
13886 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13887 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13888 };
13889 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13890 int Word) {
13891 int LowWord = Word & ~1;
13892 int HighWord = Word | 1;
13893 return isWordClobbered(SourceHalfMask, LowWord) ||
13894 isWordClobbered(SourceHalfMask, HighWord);
13895 };
13896
13897 if (IncomingInputs.empty())
13898 return;
13899
13900 if (ExistingInputs.empty()) {
13901 // Map any dwords with inputs from them into the right half.
13902 for (int Input : IncomingInputs) {
13903 // If the source half mask maps over the inputs, turn those into
13904 // swaps and use the swapped lane.
13905 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13906 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13907 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13908 Input - SourceOffset;
13909 // We have to swap the uses in our half mask in one sweep.
13910 for (int &M : HalfMask)
13911 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13912 M = Input;
13913 else if (M == Input)
13914 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13915 } else {
13916 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13917 Input - SourceOffset &&
13918 "Previous placement doesn't match!");
13919 }
13920 // Note that this correctly re-maps both when we do a swap and when
13921 // we observe the other side of the swap above. We rely on that to
13922 // avoid swapping the members of the input list directly.
13923 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13924 }
13925
13926 // Map the input's dword into the correct half.
13927 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13928 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13929 else
13930 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13931 Input / 2 &&
13932 "Previous placement doesn't match!");
13933 }
13934
13935 // And just directly shift any other-half mask elements to be same-half
13936 // as we will have mirrored the dword containing the element into the
13937 // same position within that half.
13938 for (int &M : HalfMask)
13939 if (M >= SourceOffset && M < SourceOffset + 4) {
13940 M = M - SourceOffset + DestOffset;
13941 assert(M >= 0 && "This should never wrap below zero!");
13942 }
13943 return;
13944 }
13945
13946 // Ensure we have the input in a viable dword of its current half. This
13947 // is particularly tricky because the original position may be clobbered
13948 // by inputs being moved and *staying* in that half.
13949 if (IncomingInputs.size() == 1) {
13950 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13951 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13952 SourceOffset;
13953 SourceHalfMask[InputFixed - SourceOffset] =
13954 IncomingInputs[0] - SourceOffset;
13955 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13956 InputFixed);
13957 IncomingInputs[0] = InputFixed;
13958 }
13959 } else if (IncomingInputs.size() == 2) {
13960 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13961 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13962 // We have two non-adjacent or clobbered inputs we need to extract from
13963 // the source half. To do this, we need to map them into some adjacent
13964 // dword slot in the source mask.
13965 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13966 IncomingInputs[1] - SourceOffset};
13967
13968 // If there is a free slot in the source half mask adjacent to one of
13969 // the inputs, place the other input in it. We use (Index XOR 1) to
13970 // compute an adjacent index.
13971 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13972 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13973 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13974 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13975 InputsFixed[1] = InputsFixed[0] ^ 1;
13976 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13977 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13978 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13979 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13980 InputsFixed[0] = InputsFixed[1] ^ 1;
13981 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13982 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13983 // The two inputs are in the same DWord but it is clobbered and the
13984 // adjacent DWord isn't used at all. Move both inputs to the free
13985 // slot.
13986 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13987 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13988 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13989 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13990 } else {
13991 // The only way we hit this point is if there is no clobbering
13992 // (because there are no off-half inputs to this half) and there is no
13993 // free slot adjacent to one of the inputs. In this case, we have to
13994 // swap an input with a non-input.
13995 for (int i = 0; i < 4; ++i)
13996 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13997 "We can't handle any clobbers here!");
13998 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13999 "Cannot have adjacent inputs here!");
14000
14001 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
14002 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
14003
14004 // We also have to update the final source mask in this case because
14005 // it may need to undo the above swap.
14006 for (int &M : FinalSourceHalfMask)
14007 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
14008 M = InputsFixed[1] + SourceOffset;
14009 else if (M == InputsFixed[1] + SourceOffset)
14010 M = (InputsFixed[0] ^ 1) + SourceOffset;
14011
14012 InputsFixed[1] = InputsFixed[0] ^ 1;
14013 }
14014
14015 // Point everything at the fixed inputs.
14016 for (int &M : HalfMask)
14017 if (M == IncomingInputs[0])
14018 M = InputsFixed[0] + SourceOffset;
14019 else if (M == IncomingInputs[1])
14020 M = InputsFixed[1] + SourceOffset;
14021
14022 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14023 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14024 }
14025 } else {
14026 llvm_unreachable("Unhandled input size!");
14027 }
14028
14029 // Now hoist the DWord down to the right half.
14030 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14031 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14032 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14033 for (int &M : HalfMask)
14034 for (int Input : IncomingInputs)
14035 if (M == Input)
14036 M = FreeDWord * 2 + Input % 2;
14037 };
14038 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14039 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14040 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14041 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14042
14043 // Now enact all the shuffles we've computed to move the inputs into their
14044 // target half.
14045 if (!isNoopShuffleMask(PSHUFLMask))
14046 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14047 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14048 if (!isNoopShuffleMask(PSHUFHMask))
14049 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14050 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14051 if (!isNoopShuffleMask(PSHUFDMask))
14052 V = DAG.getBitcast(
14053 VT,
14054 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14055 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14056
14057 // At this point, each half should contain all its inputs, and we can then
14058 // just shuffle them into their final position.
14059 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14060 "Failed to lift all the high half inputs to the low mask!");
14061 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14062 "Failed to lift all the low half inputs to the high mask!");
14063
14064 // Do a half shuffle for the low mask.
14065 if (!isNoopShuffleMask(LoMask))
14066 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14067 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14068
14069 // Do a half shuffle with the high mask after shifting its values down.
14070 for (int &M : HiMask)
14071 if (M >= 0)
14072 M -= 4;
14073 if (!isNoopShuffleMask(HiMask))
14074 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14075 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14076
14077 return V;
14078}
14079
14080/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14081/// blend if only one input is used.
14082static SDValue lowerShuffleAsBlendOfPSHUFBs(
14083 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14084 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14085 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
14086 "Lane crossing shuffle masks not supported");
14087
14088 int NumBytes = VT.getSizeInBits() / 8;
14089 int Size = Mask.size();
14090 int Scale = NumBytes / Size;
14091
14092 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14093 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14094 V1InUse = false;
14095 V2InUse = false;
14096
14097 for (int i = 0; i < NumBytes; ++i) {
14098 int M = Mask[i / Scale];
14099 if (M < 0)
14100 continue;
14101
14102 const int ZeroMask = 0x80;
14103 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14104 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
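// E.g. for a v8i16 shuffle (Scale == 2), mask element 9 expands to V2 byte
// indices 2 and 3, while the corresponding V1 bytes are zeroed via 0x80.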
14105 if (Zeroable[i / Scale])
14106 V1Idx = V2Idx = ZeroMask;
14107
14108 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14109 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14110 V1InUse |= (ZeroMask != V1Idx);
14111 V2InUse |= (ZeroMask != V2Idx);
14112 }
14113
14114 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14115 if (V1InUse)
14116 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14117 DAG.getBuildVector(ShufVT, DL, V1Mask));
14118 if (V2InUse)
14119 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14120 DAG.getBuildVector(ShufVT, DL, V2Mask));
14121
14122 // If we need shuffled inputs from both, blend the two.
14123 SDValue V;
14124 if (V1InUse && V2InUse)
14125 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14126 else
14127 V = V1InUse ? V1 : V2;
14128
14129 // Cast the result back to the correct type.
14130 return DAG.getBitcast(VT, V);
14131}
14132
14133/// Generic lowering of 8-lane i16 shuffles.
14134///
14135/// This handles both single-input shuffles and combined shuffle/blends with
14136/// two inputs. The single input shuffles are immediately delegated to
14137/// a dedicated lowering routine.
14138///
14139/// The blends are lowered in one of three fundamental ways. If there are few
14140/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14141/// of the input is significantly cheaper when lowered as an interleaving of
14142/// the two inputs, try to interleave them. Otherwise, blend the low and high
14143/// halves of the inputs separately (making them have relatively few inputs)
14144/// and then concatenate them.
14145static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14146 const APInt &Zeroable, SDValue V1, SDValue V2,
14147 const X86Subtarget &Subtarget,
14148 SelectionDAG &DAG) {
14149 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14150 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14151 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14152
14153 // Whenever we can lower this as a zext, that instruction is strictly faster
14154 // than any alternative.
14155 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14156 Zeroable, Subtarget, DAG))
14157 return ZExt;
14158
14159 // Try to lower using a truncation.
14160 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14161 Subtarget, DAG))
14162 return V;
14163
14164 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14165
14166 if (NumV2Inputs == 0) {
14167 // Try to use shift instructions.
14168 if (SDValue Shift =
14169 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14170 Subtarget, DAG, /*BitwiseOnly*/ false))
14171 return Shift;
14172
14173 // Check for being able to broadcast a single element.
14174 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14175 Mask, Subtarget, DAG))
14176 return Broadcast;
14177
14178 // Try to use bit rotation instructions.
14179 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14180 Subtarget, DAG))
14181 return Rotate;
14182
14183 // Use dedicated unpack instructions for masks that match their pattern.
14184 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14185 return V;
14186
14187 // Use dedicated pack instructions for masks that match their pattern.
14188 if (SDValue V =
14189 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14190 return V;
14191
14192 // Try to use byte rotation instructions.
14193 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14194 Subtarget, DAG))
14195 return Rotate;
14196
14197 // Make a copy of the mask so it can be modified.
14198 SmallVector<int, 8> MutableMask(Mask);
14199 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14200 Subtarget, DAG);
14201 }
14202
14203 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14204 "All single-input shuffles should be canonicalized to be V1-input "
14205 "shuffles.");
14206
14207 // Try to use shift instructions.
14208 if (SDValue Shift =
14209 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14210 DAG, /*BitwiseOnly*/ false))
14211 return Shift;
14212
14213 // See if we can use SSE4A Extraction / Insertion.
14214 if (Subtarget.hasSSE4A())
14215 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14216 Zeroable, DAG))
14217 return V;
14218
14219 // There are special ways we can lower some single-element blends.
14220 if (NumV2Inputs == 1)
14221 if (SDValue V = lowerShuffleAsElementInsertion(
14222 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14223 return V;
14224
14225 // We have different paths for blend lowering, but they all must use the
14226 // *exact* same predicate.
14227 bool IsBlendSupported = Subtarget.hasSSE41();
14228 if (IsBlendSupported)
14229 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14230 Zeroable, Subtarget, DAG))
14231 return Blend;
14232
14233 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14234 Zeroable, Subtarget, DAG))
14235 return Masked;
14236
14237 // Use dedicated unpack instructions for masks that match their pattern.
14238 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14239 return V;
14240
14241 // Use dedicated pack instructions for masks that match their pattern.
14242 if (SDValue V =
14243 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14244 return V;
14245
14246 // Try to lower using a truncation.
14247 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14248 Subtarget, DAG))
14249 return V;
14250
14251 // Try to use byte rotation instructions.
14252 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14253 Subtarget, DAG))
14254 return Rotate;
14255
14256 if (SDValue BitBlend =
14257 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14258 return BitBlend;
14259
14260 // Try to use byte shift instructions to mask.
14261 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14262 Zeroable, Subtarget, DAG))
14263 return V;
14264
14265 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14266 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
14267 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14268 !Subtarget.hasVLX()) {
14269 // Check if this is part of a 256-bit vector truncation.
14270 unsigned PackOpc = 0;
14271 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14272 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14273 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14274 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14275 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14276 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14277 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14278 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14279 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14280 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14281 PackOpc = X86ISD::PACKUS;
14282 } else if (Subtarget.hasSSE41()) {
14283 SmallVector<SDValue, 4> DWordClearOps(4,
14284 DAG.getConstant(0, DL, MVT::i32));
14285 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14286 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14287 SDValue DWordClearMask =
14288 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
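// With one even drop every dword keeps only its low 16 bits; with two, only
// dwords 0 and 2 do, so the PACKUS below compacts the surviving elements in
// one or two packing steps.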
14289 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14290 DWordClearMask);
14291 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14292 DWordClearMask);
14293 PackOpc = X86ISD::PACKUS;
14294 } else if (!Subtarget.hasSSSE3()) {
14295 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14296 V1 = DAG.getBitcast(MVT::v4i32, V1);
14297 V2 = DAG.getBitcast(MVT::v4i32, V2);
14298 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14299 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14300 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14301 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14302 PackOpc = X86ISD::PACKSS;
14303 }
14304 if (PackOpc) {
14305 // Now pack things back together.
14306 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14307 if (NumEvenDrops == 2) {
14308 Result = DAG.getBitcast(MVT::v4i32, Result);
14309 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14310 }
14311 return Result;
14312 }
14313 }
14314
14315 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14316 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
14317 if (NumOddDrops == 1) {
14318 bool HasSSE41 = Subtarget.hasSSE41();
14319 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14320 DAG.getBitcast(MVT::v4i32, V1),
14321 DAG.getTargetConstant(16, DL, MVT::i8));
14322 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14323 DAG.getBitcast(MVT::v4i32, V2),
14324 DAG.getTargetConstant(16, DL, MVT::i8));
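// The 16-bit right shift moves each odd element into the even slot below it;
// the logical shift (SSE4.1) leaves zeros for PACKUS, while the arithmetic
// shift keeps values in range for PACKSS saturation.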
14325 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14326 MVT::v8i16, V1, V2);
14327 }
14328
14329 // Try to lower by permuting the inputs into an unpack instruction.
14330 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14331 Mask, Subtarget, DAG))
14332 return Unpack;
14333
14334 // If we can't directly blend but can use PSHUFB, that will be better as it
14335 // can both shuffle and set up the inefficient blend.
14336 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14337 bool V1InUse, V2InUse;
14338 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14339 Zeroable, DAG, V1InUse, V2InUse);
14340 }
14341
14342 // We can always bit-blend if we have to so the fallback strategy is to
14343 // decompose into single-input permutes and blends/unpacks.
14344 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14345 Zeroable, Subtarget, DAG);
14346}
14347
14348/// Lower 8-lane 16-bit floating point shuffles.
14349static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14350 const APInt &Zeroable, SDValue V1, SDValue V2,
14351 const X86Subtarget &Subtarget,
14352 SelectionDAG &DAG) {
14353 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14354 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14355 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14356 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14357
14358 if (Subtarget.hasFP16()) {
14359 if (NumV2Elements == 0) {
14360 // Check for being able to broadcast a single element.
14361 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14362 Mask, Subtarget, DAG))
14363 return Broadcast;
14364 }
14365 if (NumV2Elements == 1 && Mask[0] >= 8)
14366 if (SDValue V = lowerShuffleAsElementInsertion(
14367 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14368 return V;
14369 }
14370
14371 V1 = DAG.getBitcast(MVT::v8i16, V1);
14372 V2 = DAG.getBitcast(MVT::v8i16, V2);
14373 return DAG.getBitcast(MVT::v8f16,
14374 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14375}
14376
14377 // Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14378// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
14379// the active subvector is extracted.
14380static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14381 ArrayRef<int> OriginalMask, SDValue V1,
14382 SDValue V2, const X86Subtarget &Subtarget,
14383 SelectionDAG &DAG) {
14384 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14385 SmallVector<int, 32> Mask(OriginalMask);
14386 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14387 !isShuffleFoldableLoad(V2)) {
14388 ShuffleVectorSDNode::commuteMask(Mask);
14389 std::swap(V1, V2);
14390 }
14391
14392 MVT MaskVT = VT.changeTypeToInteger();
14393 SDValue MaskNode;
14394 MVT ShuffleVT = VT;
14395 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14396 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14397 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14398 ShuffleVT = V1.getSimpleValueType();
14399
14400 // Adjust mask to correct indices for the second input.
14401 int NumElts = VT.getVectorNumElements();
14402 unsigned Scale = 512 / VT.getSizeInBits();
14403 SmallVector<int, 32> AdjustedMask(Mask);
14404 for (int &M : AdjustedMask)
14405 if (NumElts <= M)
14406 M += (Scale - 1) * NumElts;
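// E.g. a 256-bit shuffle widened here has Scale == 2, so a mask entry of
// NumElts + 1 (element 1 of V2) becomes 2 * NumElts + 1 and still addresses
// V2's element 1 in the widened index space.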
14407 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14408 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14409 } else {
14410 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14411 }
14412
14413 SDValue Result;
14414 if (V2.isUndef())
14415 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14416 else
14417 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14418
14419 if (VT != ShuffleVT)
14420 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14421
14422 return Result;
14423}
14424
14425/// Generic lowering of v16i8 shuffles.
14426///
14427/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14428 /// detect any complexity-reducing interleaving. If that doesn't help, it uses
14429/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14430/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14431/// back together.
14432static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14433 const APInt &Zeroable, SDValue V1, SDValue V2,
14434 const X86Subtarget &Subtarget,
14435 SelectionDAG &DAG) {
14436 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14437 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14438 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14439
14440 // Try to use shift instructions.
14441 if (SDValue Shift =
14442 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14443 DAG, /*BitwiseOnly*/ false))
14444 return Shift;
14445
14446 // Try to use byte rotation instructions.
14447 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14448 Subtarget, DAG))
14449 return Rotate;
14450
14451 // Use dedicated pack instructions for masks that match their pattern.
14452 if (SDValue V =
14453 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14454 return V;
14455
14456 // Try to use a zext lowering.
14457 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14458 Zeroable, Subtarget, DAG))
14459 return ZExt;
14460
14461 // Try to use lower using a truncation.
14462 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14463 Subtarget, DAG))
14464 return V;
14465
14466 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14467 Subtarget, DAG))
14468 return V;
14469
14470 // See if we can use SSE4A Extraction / Insertion.
14471 if (Subtarget.hasSSE4A())
14472 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14473 Zeroable, DAG))
14474 return V;
14475
14476 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14477
14478 // For single-input shuffles, there are some nicer lowering tricks we can use.
14479 if (NumV2Elements == 0) {
14480 // Check for being able to broadcast a single element.
14481 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14482 Mask, Subtarget, DAG))
14483 return Broadcast;
14484
14485 // Try to use bit rotation instructions.
14486 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14487 Subtarget, DAG))
14488 return Rotate;
14489
14490 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14491 return V;
14492
14493 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14494 // Notably, this handles splat and partial-splat shuffles more efficiently.
14495 // However, it only makes sense if the pre-duplication shuffle simplifies
14496 // things significantly. Currently, this means we need to be able to
14497 // express the pre-duplication shuffle as an i16 shuffle.
14498 //
14499 // FIXME: We should check for other patterns which can be widened into an
14500 // i16 shuffle as well.
14501 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14502 for (int i = 0; i < 16; i += 2)
14503 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14504 return false;
14505
14506 return true;
14507 };
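 // For illustration, a mask such as <3,3, 7,7, 0,u, 1,1, ...> passes this
 // check (each byte pair reads at most one distinct source byte), whereas
 // <0,1, ...> fails because its first pair needs two different bytes.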
14508 auto tryToWidenViaDuplication = [&]() -> SDValue {
14509 if (!canWidenViaDuplication(Mask))
14510 return SDValue();
14511 SmallVector<int, 4> LoInputs;
14512 copy_if(Mask, std::back_inserter(LoInputs),
14513 [](int M) { return M >= 0 && M < 8; });
14514 array_pod_sort(LoInputs.begin(), LoInputs.end());
14515 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14516 SmallVector<int, 4> HiInputs;
14517 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14518 array_pod_sort(HiInputs.begin(), HiInputs.end());
14519 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14520
14521 bool TargetLo = LoInputs.size() >= HiInputs.size();
14522 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14523 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14524
14525 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14526 SmallDenseMap<int, int, 8> LaneMap;
14527 for (int I : InPlaceInputs) {
14528 PreDupI16Shuffle[I/2] = I/2;
14529 LaneMap[I] = I;
14530 }
14531 int j = TargetLo ? 0 : 4, je = j + 4;
14532 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14533 // Check if j is already a shuffle of this input. This happens when
14534 // there are two adjacent bytes after we move the low one.
14535 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14536 // If we haven't yet mapped the input, search for a slot into which
14537 // we can map it.
14538 while (j < je && PreDupI16Shuffle[j] >= 0)
14539 ++j;
14540
14541 if (j == je)
14542 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14543 return SDValue();
14544
14545 // Map this input with the i16 shuffle.
14546 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14547 }
14548
14549 // Update the lane map based on the mapping we ended up with.
14550 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14551 }
14552 V1 = DAG.getBitcast(
14553 MVT::v16i8,
14554 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14555 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14556
14557 // Unpack the bytes to form the i16s that will be shuffled into place.
14558 bool EvenInUse = false, OddInUse = false;
14559 for (int i = 0; i < 16; i += 2) {
14560 EvenInUse |= (Mask[i + 0] >= 0);
14561 OddInUse |= (Mask[i + 1] >= 0);
14562 if (EvenInUse && OddInUse)
14563 break;
14564 }
14565 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14566 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14567 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14568
14569 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14570 for (int i = 0; i < 16; ++i)
14571 if (Mask[i] >= 0) {
14572 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14573 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14574 if (PostDupI16Shuffle[i / 2] < 0)
14575 PostDupI16Shuffle[i / 2] = MappedMask;
14576 else
14577 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14578 "Conflicting entries in the original shuffle!");
14579 }
14580 return DAG.getBitcast(
14581 MVT::v16i8,
14582 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14583 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14584 };
14585 if (SDValue V = tryToWidenViaDuplication())
14586 return V;
14587 }
14588
14589 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14590 Zeroable, Subtarget, DAG))
14591 return Masked;
14592
14593 // Use dedicated unpack instructions for masks that match their pattern.
14594 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14595 return V;
14596
14597 // Try to use byte shift instructions to mask.
14598 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14599 Zeroable, Subtarget, DAG))
14600 return V;
14601
14602 // Check for compaction patterns.
14603 bool IsSingleInput = V2.isUndef();
14604 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14605
14606 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14607 // with PSHUFB. It is important to do this before we attempt to generate any
14608 // blends but after all of the single-input lowerings. If the single input
14609 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14610 // want to preserve that and we can DAG combine any longer sequences into
14611 // a PSHUFB in the end. But once we start blending from multiple inputs,
14612 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14613 // and there are *very* few patterns that would actually be faster than the
14614 // PSHUFB approach because of its ability to zero lanes.
14615 //
14616 // If the mask is a binary compaction, we can more efficiently perform this
14617 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14618 //
14619 // FIXME: The only exceptions to the above are blends which are exact
14620 // interleavings with direct instructions supporting them. We currently don't
14621 // handle those well here.
14622 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14623 bool V1InUse = false;
14624 bool V2InUse = false;
14625
14626 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14627 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14628
14629 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14630 // do so. This avoids using them to handle blends-with-zero which is
14631 // important as a single pshufb is significantly faster for that.
14632 if (V1InUse && V2InUse) {
14633 if (Subtarget.hasSSE41())
14634 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14635 Zeroable, Subtarget, DAG))
14636 return Blend;
14637
14638 // We can use an unpack to do the blending rather than an or in some
14639 // cases. Even though the or may be (very slightly) more efficient, we
14640 // prefer this lowering because there are common cases where part of
14641 // the complexity of the shuffles goes away when we do the final blend as
14642 // an unpack.
14643 // FIXME: It might be worth trying to detect if the unpack-feeding
14644 // shuffles will both be pshufb, in which case we shouldn't bother with
14645 // this.
14646 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14647 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14648 return Unpack;
14649
14650 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14651 if (Subtarget.hasVBMI())
14652 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14653 DAG);
14654
14655 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14656 if (Subtarget.hasXOP()) {
14657 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14658 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14659 }
14660
14661 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14662 // PALIGNR will be cheaper than the second PSHUFB+OR.
14663 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14664 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14665 return V;
14666 }
14667
14668 return PSHUFB;
14669 }
14670
14671 // There are special ways we can lower some single-element blends.
14672 if (NumV2Elements == 1)
14673 if (SDValue V = lowerShuffleAsElementInsertion(
14674 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14675 return V;
14676
14677 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14678 return Blend;
14679
14680 // Check whether a compaction lowering can be done. This handles shuffles
14681 // which take every Nth element for some even N. See the helper function for
14682 // details.
14683 //
14684 // We special case these as they can be particularly efficiently handled with
14685 // the PACKUSWB instruction on x86 and they show up in common patterns of
14686 // rearranging bytes to truncate wide elements.
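 // For illustration, the v16i8 mask <0,2,4,...,30> (every even byte of both
 // inputs) has NumEvenDrops == 1: each 16-bit word is masked down to its low
 // byte and a single PACKUS produces the compacted result.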
14687 if (NumEvenDrops) {
14688 // NumEvenDrops is the power of two stride of the elements. Another way of
14689 // thinking about it is that we need to drop the even elements this many
14690 // times to get the original input.
14691
14692 // First we need to zero all the dropped bytes.
14693 assert(NumEvenDrops <= 3 &&
14694 "No support for dropping even elements more than 3 times.");
14695 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14696 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14697 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14698 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14699 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14700 WordClearMask);
14701 if (!IsSingleInput)
14702 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14703 WordClearMask);
14704
14705 // Now pack things back together.
14706 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14707 IsSingleInput ? V1 : V2);
14708 for (int i = 1; i < NumEvenDrops; ++i) {
14709 Result = DAG.getBitcast(MVT::v8i16, Result);
14710 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14711 }
14712 return Result;
14713 }
14714
14715 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
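 // For illustration, the v16i8 mask <1,3,5,...,31> (every odd byte) gives
 // NumOddDrops == 1: shifting each word right by 8 moves the odd bytes into
 // the low byte positions so the same PACKUS compaction applies.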
14716 if (NumOddDrops == 1) {
14717 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14718 DAG.getBitcast(MVT::v8i16, V1),
14719 DAG.getTargetConstant(8, DL, MVT::i8));
14720 if (!IsSingleInput)
14721 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14722 DAG.getBitcast(MVT::v8i16, V2),
14723 DAG.getTargetConstant(8, DL, MVT::i8));
14724 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14725 IsSingleInput ? V1 : V2);
14726 }
14727
14728 // Handle multi-input cases by blending/unpacking single-input shuffles.
14729 if (NumV2Elements > 0)
14730 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14731 Zeroable, Subtarget, DAG);
14732
14733 // The fallback path for single-input shuffles widens this into two v8i16
14734 // vectors with unpacks, shuffles those, and then pulls them back together
14735 // with a pack.
14736 SDValue V = V1;
14737
14738 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14739 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14740 for (int i = 0; i < 16; ++i)
14741 if (Mask[i] >= 0)
14742 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14743
14744 SDValue VLoHalf, VHiHalf;
14745 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14746 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14747 // i16s.
14748 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14749 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14750 // Use a mask to drop the high bytes.
14751 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14752 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14753 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14754
14755 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14756 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14757
14758 // Squash the masks to point directly into VLoHalf.
14759 for (int &M : LoBlendMask)
14760 if (M >= 0)
14761 M /= 2;
14762 for (int &M : HiBlendMask)
14763 if (M >= 0)
14764 M /= 2;
14765 } else {
14766 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14767 // VHiHalf so that we can blend them as i16s.
14768 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14769
14770 VLoHalf = DAG.getBitcast(
14771 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14772 VHiHalf = DAG.getBitcast(
14773 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14774 }
14775
14776 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14777 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14778
14779 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14780}
14781
14782/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14783///
14784/// This routine breaks down the specific type of 128-bit shuffle and
14785/// dispatches to the lowering routines accordingly.
14786static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14787 MVT VT, SDValue V1, SDValue V2,
14788 const APInt &Zeroable,
14789 const X86Subtarget &Subtarget,
14790 SelectionDAG &DAG) {
14791 if (VT == MVT::v8bf16) {
14792 V1 = DAG.getBitcast(MVT::v8i16, V1);
14793 V2 = DAG.getBitcast(MVT::v8i16, V2);
14794 return DAG.getBitcast(VT,
14795 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14796 }
14797
14798 switch (VT.SimpleTy) {
14799 case MVT::v2i64:
14800 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14801 case MVT::v2f64:
14802 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14803 case MVT::v4i32:
14804 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14805 case MVT::v4f32:
14806 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14807 case MVT::v8i16:
14808 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14809 case MVT::v8f16:
14810 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14811 case MVT::v16i8:
14812 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14813
14814 default:
14815 llvm_unreachable("Unimplemented!");
14816 }
14817}
14818
14819/// Generic routine to split vector shuffle into half-sized shuffles.
14820///
14821/// This routine just extracts two subvectors, shuffles them independently, and
14822/// then concatenates them back together. This should work effectively with all
14823/// AVX vector shuffle types.
14824static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14825 SDValue V2, ArrayRef<int> Mask,
14826 SelectionDAG &DAG, bool SimpleOnly) {
14827 assert(VT.getSizeInBits() >= 256 &&
14828 "Only for 256-bit or wider vector shuffles!");
14829 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14830 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14831
14832 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14833 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14834
14835 int NumElements = VT.getVectorNumElements();
14836 int SplitNumElements = NumElements / 2;
14837 MVT ScalarVT = VT.getVectorElementType();
14838 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14839
14840 // Use splitVector/extractSubVector so that split build-vectors just build two
14841 // narrower build vectors. This helps shuffling with splats and zeros.
14842 auto SplitVector = [&](SDValue V) {
14843 SDValue LoV, HiV;
14844 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14845 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14846 DAG.getBitcast(SplitVT, HiV));
14847 };
14848
14849 SDValue LoV1, HiV1, LoV2, HiV2;
14850 std::tie(LoV1, HiV1) = SplitVector(V1);
14851 std::tie(LoV2, HiV2) = SplitVector(V2);
14852
14853 // Now create two 4-way blends of these half-width vectors.
14854 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14855 bool &UseHiV1, bool &UseLoV2,
14856 bool &UseHiV2) {
14857 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14858 for (int i = 0; i < SplitNumElements; ++i) {
14859 int M = HalfMask[i];
14860 if (M >= NumElements) {
14861 if (M >= NumElements + SplitNumElements)
14862 UseHiV2 = true;
14863 else
14864 UseLoV2 = true;
14865 } else if (M >= 0) {
14866 if (M >= SplitNumElements)
14867 UseHiV1 = true;
14868 else
14869 UseLoV1 = true;
14870 }
14871 }
14872 };
14873
14874 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14875 if (!SimpleOnly)
14876 return true;
14877
14878 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14879 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14880
14881 return !(UseHiV1 || UseHiV2);
14882 };
14883
14884 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14885 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14886 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14887 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14888 for (int i = 0; i < SplitNumElements; ++i) {
14889 int M = HalfMask[i];
14890 if (M >= NumElements) {
14891 V2BlendMask[i] = M - NumElements;
14892 BlendMask[i] = SplitNumElements + i;
14893 } else if (M >= 0) {
14894 V1BlendMask[i] = M;
14895 BlendMask[i] = i;
14896 }
14897 }
14898
14899 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14900 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14901
14902 // Because the lowering happens after all combining takes place, we need to
14903 // manually combine these blend masks as much as possible so that we create
14904 // a minimal number of high-level vector shuffle nodes.
14905 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14906
14907 // First try just blending the halves of V1 or V2.
14908 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14909 return DAG.getUNDEF(SplitVT);
14910 if (!UseLoV2 && !UseHiV2)
14911 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14912 if (!UseLoV1 && !UseHiV1)
14913 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14914
14915 SDValue V1Blend, V2Blend;
14916 if (UseLoV1 && UseHiV1) {
14917 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14918 } else {
14919 // We only use half of V1 so map the usage down into the final blend mask.
14920 V1Blend = UseLoV1 ? LoV1 : HiV1;
14921 for (int i = 0; i < SplitNumElements; ++i)
14922 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14923 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14924 }
14925 if (UseLoV2 && UseHiV2) {
14926 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14927 } else {
14928 // We only use half of V2 so map the usage down into the final blend mask.
14929 V2Blend = UseLoV2 ? LoV2 : HiV2;
14930 for (int i = 0; i < SplitNumElements; ++i)
14931 if (BlendMask[i] >= SplitNumElements)
14932 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14933 }
14934 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14935 };
14936
14937 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14938 return SDValue();
14939
14940 SDValue Lo = HalfBlend(LoMask);
14941 SDValue Hi = HalfBlend(HiMask);
14942 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14943}
14944
14945/// Either split a vector into halves or decompose the shuffles and the
14946/// blend/unpack.
14947///
14948/// This is provided as a good fallback for many lowerings of non-single-input
14949/// shuffles with more than one 128-bit lane. In those cases, we want to select
14950/// between splitting the shuffle into 128-bit components and stitching those
14951/// back together vs. extracting the single-input shuffles and blending those
14952/// results.
14953static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14954 SDValue V2, ArrayRef<int> Mask,
14955 const APInt &Zeroable,
14956 const X86Subtarget &Subtarget,
14957 SelectionDAG &DAG) {
14958 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14959 "shuffles as it could then recurse on itself.");
14960 int Size = Mask.size();
14961
14962 // If this can be modeled as a broadcast of two elements followed by a blend,
14963 // prefer that lowering. This is especially important because broadcasts can
14964 // often fold with memory operands.
14965 auto DoBothBroadcast = [&] {
14966 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14967 for (int M : Mask)
14968 if (M >= Size) {
14969 if (V2BroadcastIdx < 0)
14970 V2BroadcastIdx = M - Size;
14971 else if (M - Size != V2BroadcastIdx)
14972 return false;
14973 } else if (M >= 0) {
14974 if (V1BroadcastIdx < 0)
14975 V1BroadcastIdx = M;
14976 else if (M != V1BroadcastIdx)
14977 return false;
14978 }
14979 return true;
14980 };
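 // For illustration, a v4 mask such as <0,0,5,5> broadcasts element 0 of V1
 // and element 1 of V2, so it is decomposed into two broadcasts plus a blend.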
14981 if (DoBothBroadcast())
14982 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
14983 Subtarget, DAG);
14984
14985 // If the inputs all stem from a single 128-bit lane of each input, then we
14986 // split them rather than blending because the split will decompose to
14987 // unusually few instructions.
14988 int LaneCount = VT.getSizeInBits() / 128;
14989 int LaneSize = Size / LaneCount;
14990 SmallBitVector LaneInputs[2];
14991 LaneInputs[0].resize(LaneCount, false);
14992 LaneInputs[1].resize(LaneCount, false);
14993 for (int i = 0; i < Size; ++i)
14994 if (Mask[i] >= 0)
14995 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14996 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14997 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14998 /*SimpleOnly*/ false);
14999
15000 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
15001 // requires that the decomposed single-input shuffles don't end up here.
15002 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
15003 Subtarget, DAG);
15004}
15005
15006// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15007// TODO: Extend to support v8f32 (+ 512-bit shuffles).
15008static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
15009 SDValue V1, SDValue V2,
15010 ArrayRef<int> Mask,
15011 SelectionDAG &DAG) {
15012 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15013
15014 int LHSMask[4] = {-1, -1, -1, -1};
15015 int RHSMask[4] = {-1, -1, -1, -1};
15016 int SHUFPDMask[4] = {-1, -1, -1, -1};
15017
15018 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15019 // perform the shuffle once the lanes have been shuffled in place.
15020 for (int i = 0; i != 4; ++i) {
15021 int M = Mask[i];
15022 if (M < 0)
15023 continue;
15024 int LaneBase = i & ~1;
15025 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15026 LaneMask[LaneBase + (M & 1)] = M;
15027 SHUFPDMask[i] = M & 1;
15028 }
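 // For illustration, the v4f64 mask <1,4,3,6> yields LHSMask <u,1,u,3>,
 // RHSMask <4,u,6,u> and SHUFPDMask <1,0,1,0>: once both lane shuffles are
 // done, a single SHUFPD takes element 1 of each LHS lane and element 0 of
 // each RHS lane.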
15029
15030 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15031 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15032 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15033 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15034}
15035
15036/// Lower a vector shuffle crossing multiple 128-bit lanes as
15037/// a lane permutation followed by a per-lane permutation.
15038///
15039/// This is mainly for cases where we can have non-repeating permutes
15040/// in each lane.
15041///
15042/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15043/// we should investigate merging them.
15044static SDValue lowerShuffleAsLanePermuteAndPermute(
15045 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15046 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15047 int NumElts = VT.getVectorNumElements();
15048 int NumLanes = VT.getSizeInBits() / 128;
15049 int NumEltsPerLane = NumElts / NumLanes;
15050 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15051
15052 /// Attempts to find a sublane permute with the given size
15053 /// that gets all elements into their target lanes.
15054 ///
15055 /// If successful, fills CrossLaneMask and InLaneMask and returns the shuffle.
15056 /// If unsuccessful, returns an empty SDValue and may overwrite InLaneMask.
15057 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15058 int NumSublanesPerLane = NumSublanes / NumLanes;
15059 int NumEltsPerSublane = NumElts / NumSublanes;
15060
15061 SmallVector<int, 16> CrossLaneMask;
15062 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15063 // CrossLaneMask but one entry == one sublane.
15064 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15065 APInt DemandedCrossLane = APInt::getZero(NumElts);
15066
15067 for (int i = 0; i != NumElts; ++i) {
15068 int M = Mask[i];
15069 if (M < 0)
15070 continue;
15071
15072 int SrcSublane = M / NumEltsPerSublane;
15073 int DstLane = i / NumEltsPerLane;
15074
15075 // We only need to get the elements into the right lane, not sublane.
15076 // So search all sublanes that make up the destination lane.
15077 bool Found = false;
15078 int DstSubStart = DstLane * NumSublanesPerLane;
15079 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15080 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15081 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15082 continue;
15083
15084 Found = true;
15085 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15086 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15087 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15088 DemandedCrossLane.setBit(InLaneMask[i]);
15089 break;
15090 }
15091 if (!Found)
15092 return SDValue();
15093 }
15094
15095 // Fill CrossLaneMask using CrossLaneMaskLarge.
15096 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15097
15098 if (!CanUseSublanes) {
15099 // If we're only shuffling a single lowest lane and the rest are identity
15100 // then don't bother.
15101 // TODO - isShuffleMaskInputInPlace could be extended to something like
15102 // this.
15103 int NumIdentityLanes = 0;
15104 bool OnlyShuffleLowestLane = true;
15105 for (int i = 0; i != NumLanes; ++i) {
15106 int LaneOffset = i * NumEltsPerLane;
15107 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15108 i * NumEltsPerLane))
15109 NumIdentityLanes++;
15110 else if (CrossLaneMask[LaneOffset] != 0)
15111 OnlyShuffleLowestLane = false;
15112 }
15113 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15114 return SDValue();
15115 }
15116
15117 // Avoid returning the same shuffle operation. For example,
15118 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15119 // undef:v16i16
15120 if (CrossLaneMask == Mask || InLaneMask == Mask)
15121 return SDValue();
15122
15123 // Simplify CrossLaneMask based on the actual demanded elements.
15124 if (V1.hasOneUse())
15125 for (int i = 0; i != NumElts; ++i)
15126 if (!DemandedCrossLane[i])
15127 CrossLaneMask[i] = SM_SentinelUndef;
15128
15129 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15130 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15131 InLaneMask);
15132 };
15133
15134 // First attempt a solution with full lanes.
15135 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15136 return V;
15137
15138 // The rest of the solutions use sublanes.
15139 if (!CanUseSublanes)
15140 return SDValue();
15141
15142 // Then attempt a solution with 64-bit sublanes (vpermq).
15143 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15144 return V;
15145
15146 // If that doesn't work and we have fast variable cross-lane shuffle,
15147 // attempt 32-bit sublanes (vpermd).
15148 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15149 return SDValue();
15150
15151 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15152}
15153
15154/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
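/// For illustration, with Size = 8 and LaneSize = 4 the mask <4,5,2,3,0,1,6,7>
/// becomes <8,9,2,3,12,13,6,7>: cross-lane entries are redirected to the
/// second shuffle operand (offset by Size), which the caller supplies as a
/// lane-swapped copy of the source.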
15155static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15156 SmallVector<int> &InLaneMask) {
15157 int Size = Mask.size();
15158 InLaneMask.assign(Mask.begin(), Mask.end());
15159 for (int i = 0; i < Size; ++i) {
15160 int &M = InLaneMask[i];
15161 if (M < 0)
15162 continue;
15163 if (((M % Size) / LaneSize) != (i / LaneSize))
15164 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15165 }
15166}
15167
15168/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15169/// source with a lane permutation.
15170///
15171/// This lowering strategy results in four instructions in the worst case for a
15172/// single-input cross lane shuffle which is lower than any other fully general
15173/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15174/// shuffle pattern should be handled prior to trying this lowering.
15175static SDValue lowerShuffleAsLanePermuteAndShuffle(
15176 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15177 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15178 // FIXME: This should probably be generalized for 512-bit vectors as well.
15179 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15180 int Size = Mask.size();
15181 int LaneSize = Size / 2;
15182
15183 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15184 // Only do this if the elements aren't all from the lower lane,
15185 // otherwise we're (probably) better off doing a split.
15186 if (VT == MVT::v4f64 &&
15187 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15188 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15189
15190 // If there are only inputs from one 128-bit lane, splitting will in fact be
15191 // less expensive. The flags track whether the given lane contains an element
15192 // that crosses to another lane.
15193 bool AllLanes;
15194 if (!Subtarget.hasAVX2()) {
15195 bool LaneCrossing[2] = {false, false};
15196 for (int i = 0; i < Size; ++i)
15197 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15198 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15199 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15200 } else {
15201 bool LaneUsed[2] = {false, false};
15202 for (int i = 0; i < Size; ++i)
15203 if (Mask[i] >= 0)
15204 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15205 AllLanes = LaneUsed[0] && LaneUsed[1];
15206 }
15207
15208 // TODO - we could support shuffling V2 in the Flipped input.
15209 assert(V2.isUndef() &&
15210 "This last part of this routine only works on single input shuffles");
15211
15212 SmallVector<int> InLaneMask;
15213 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15214
15215 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15216 "In-lane shuffle mask expected");
15217
15218 // If we're not using both lanes in each lane and the in-lane mask is not
15219 // repeating, then we're better off splitting.
15220 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15221 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15222 /*SimpleOnly*/ false);
15223
15224 // Flip the lanes, and shuffle the results which should now be in-lane.
15225 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15226 SDValue Flipped = DAG.getBitcast(PVT, V1);
15227 Flipped =
15228 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15229 Flipped = DAG.getBitcast(VT, Flipped);
15230 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15231}
15232
15233/// Handle lowering 2-lane 128-bit shuffles.
15234static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15235 SDValue V2, ArrayRef<int> Mask,
15236 const APInt &Zeroable,
15237 const X86Subtarget &Subtarget,
15238 SelectionDAG &DAG) {
15239 if (V2.isUndef()) {
15240 // Attempt to match VBROADCAST*128 subvector broadcast load.
15241 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15242 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15243 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15244 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15245 MVT MemVT = VT.getHalfNumVectorElementsVT();
15246 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15247 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15248 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15249 VT, MemVT, Ld, Ofs, DAG))
15250 return BcstLd;
15251 }
15252
15253 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15254 if (Subtarget.hasAVX2())
15255 return SDValue();
15256 }
15257
15258 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15259
15260 SmallVector<int, 4> WidenedMask;
15261 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15262 return SDValue();
15263
15264 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15265 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15266
15267 // Try to use an insert into a zero vector.
15268 if (WidenedMask[0] == 0 && IsHighZero) {
15269 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15270 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15271 DAG.getVectorIdxConstant(0, DL));
15272 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15273 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15274 DAG.getVectorIdxConstant(0, DL));
15275 }
15276
15277 // TODO: If minimizing size and one of the inputs is a zero vector and the
15278 // zero vector has only one use, we could use a VPERM2X128 to save the
15279 // instruction bytes needed to explicitly generate the zero vector.
15280
15281 // Blends are faster and handle all the non-lane-crossing cases.
15282 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15283 Subtarget, DAG))
15284 return Blend;
15285
15286 // If either input operand is a zero vector, use VPERM2X128 because its mask
15287 // allows us to replace the zero input with an implicit zero.
15288 if (!IsLowZero && !IsHighZero) {
15289 // Check for patterns which can be matched with a single insert of a 128-bit
15290 // subvector.
15291 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15292 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15293
15294 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15295 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15296 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15297 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15298 SDValue SubVec =
15299 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
15300 DAG.getVectorIdxConstant(0, DL));
15301 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15302 DAG.getVectorIdxConstant(2, DL));
15303 }
15304 }
15305
15306 // Try to use SHUF128 if possible.
15307 if (Subtarget.hasVLX()) {
15308 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15309 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15310 ((WidenedMask[1] % 2) << 1);
15311 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15312 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15313 }
15314 }
15315 }
15316
15317 // Otherwise form a 128-bit permutation. After accounting for undefs,
15318 // convert the 64-bit shuffle mask selection values into 128-bit
15319 // selection bits by dividing the indexes by 2 and shifting into positions
15320 // defined by a vperm2*128 instruction's immediate control byte.
15321
15322 // The immediate permute control byte looks like this:
15323 // [1:0] - select 128 bits from sources for low half of destination
15324 // [2] - ignore
15325 // [3] - zero low half of destination
15326 // [5:4] - select 128 bits from sources for high half of destination
15327 // [6] - ignore
15328 // [7] - zero high half of destination
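 // For illustration, a v4i64 mask <2,3,6,7> widens to <1,3>, giving
 // PermMask = (1 << 0) | (3 << 4) = 0x31, which selects the upper 128-bit
 // half of each source.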
15329
15330 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15331 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15332
15333 unsigned PermMask = 0;
15334 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15335 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15336
15337 // Check the immediate mask and replace unused sources with undef.
15338 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15339 V1 = DAG.getUNDEF(VT);
15340 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15341 V2 = DAG.getUNDEF(VT);
15342
15343 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15344 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15345}
15346
15347/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15348/// shuffling each lane.
15349///
15350/// This attempts to create a repeated lane shuffle where each lane uses one
15351/// or two of the lanes of the inputs. The lanes of the input vectors are
15352/// shuffled in one or two independent shuffles to get the lanes into the
15353/// position needed by the final shuffle.
15354static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15355 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15356 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15357 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15358
15359 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15360 return SDValue();
15361
15362 int NumElts = Mask.size();
15363 int NumLanes = VT.getSizeInBits() / 128;
15364 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15365 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15366 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15367
15368 // First pass will try to fill in the RepeatMask from lanes that need two
15369 // sources.
15370 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15371 int Srcs[2] = {-1, -1};
15372 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15373 for (int i = 0; i != NumLaneElts; ++i) {
15374 int M = Mask[(Lane * NumLaneElts) + i];
15375 if (M < 0)
15376 continue;
15377 // Determine which of the possible input lanes (NumLanes from each source)
15378 // this element comes from. Assign that as one of the sources for this
15379 // lane. We can assign up to 2 sources for this lane. If we run out of
15380 // sources we can't do anything.
15381 int LaneSrc = M / NumLaneElts;
15382 int Src;
15383 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15384 Src = 0;
15385 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15386 Src = 1;
15387 else
15388 return SDValue();
15389
15390 Srcs[Src] = LaneSrc;
15391 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15392 }
15393
15394 // If this lane has two sources, see if it fits with the repeat mask so far.
15395 if (Srcs[1] < 0)
15396 continue;
15397
15398 LaneSrcs[Lane][0] = Srcs[0];
15399 LaneSrcs[Lane][1] = Srcs[1];
15400
15401 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15402 assert(M1.size() == M2.size() && "Unexpected mask size");
15403 for (int i = 0, e = M1.size(); i != e; ++i)
15404 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15405 return false;
15406 return true;
15407 };
15408
15409 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15410 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15411 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15412 int M = Mask[i];
15413 if (M < 0)
15414 continue;
15415 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15416 "Unexpected mask element");
15417 MergedMask[i] = M;
15418 }
15419 };
15420
15421 if (MatchMasks(InLaneMask, RepeatMask)) {
15422 // Merge this lane mask into the final repeat mask.
15423 MergeMasks(InLaneMask, RepeatMask);
15424 continue;
15425 }
15426
15427 // Didn't find a match. Swap the operands and try again.
15428 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15429 ShuffleVectorSDNode::commuteMask(InLaneMask);
15430
15431 if (MatchMasks(InLaneMask, RepeatMask)) {
15432 // Merge this lane mask into the final repeat mask.
15433 MergeMasks(InLaneMask, RepeatMask);
15434 continue;
15435 }
15436
15437 // Couldn't find a match with the operands in either order.
15438 return SDValue();
15439 }
15440
15441 // Now handle any lanes with only one source.
15442 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15443 // If this lane has already been processed, skip it.
15444 if (LaneSrcs[Lane][0] >= 0)
15445 continue;
15446
15447 for (int i = 0; i != NumLaneElts; ++i) {
15448 int M = Mask[(Lane * NumLaneElts) + i];
15449 if (M < 0)
15450 continue;
15451
15452 // If RepeatMask isn't defined yet we can define it ourselves.
15453 if (RepeatMask[i] < 0)
15454 RepeatMask[i] = M % NumLaneElts;
15455
15456 if (RepeatMask[i] < NumElts) {
15457 if (RepeatMask[i] != M % NumLaneElts)
15458 return SDValue();
15459 LaneSrcs[Lane][0] = M / NumLaneElts;
15460 } else {
15461 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15462 return SDValue();
15463 LaneSrcs[Lane][1] = M / NumLaneElts;
15464 }
15465 }
15466
15467 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15468 return SDValue();
15469 }
15470
15471 SmallVector<int, 16> NewMask(NumElts, -1);
15472 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15473 int Src = LaneSrcs[Lane][0];
15474 for (int i = 0; i != NumLaneElts; ++i) {
15475 int M = -1;
15476 if (Src >= 0)
15477 M = Src * NumLaneElts + i;
15478 NewMask[Lane * NumLaneElts + i] = M;
15479 }
15480 }
15481 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15482 // Ensure we didn't get back the shuffle we started with.
15483 // FIXME: This is a hack to make up for some splat handling code in
15484 // getVectorShuffle.
15485 if (isa<ShuffleVectorSDNode>(NewV1) &&
15486 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15487 return SDValue();
15488
15489 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15490 int Src = LaneSrcs[Lane][1];
15491 for (int i = 0; i != NumLaneElts; ++i) {
15492 int M = -1;
15493 if (Src >= 0)
15494 M = Src * NumLaneElts + i;
15495 NewMask[Lane * NumLaneElts + i] = M;
15496 }
15497 }
15498 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15499 // Ensure we didn't get back the shuffle we started with.
15500 // FIXME: This is a hack to make up for some splat handling code in
15501 // getVectorShuffle.
15502 if (isa<ShuffleVectorSDNode>(NewV2) &&
15503 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15504 return SDValue();
15505
15506 for (int i = 0; i != NumElts; ++i) {
15507 if (Mask[i] < 0) {
15508 NewMask[i] = -1;
15509 continue;
15510 }
15511 NewMask[i] = RepeatMask[i % NumLaneElts];
15512 if (NewMask[i] < 0)
15513 continue;
15514
15515 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15516 }
15517 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15518}
15519
15520/// If the input shuffle mask results in a vector that is undefined in all upper
15521/// or lower half elements and that mask accesses only 2 halves of the
15522/// shuffle's operands, return true. A mask of half the width with mask indexes
15523/// adjusted to access the extracted halves of the original shuffle operands is
15524/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15525/// lower half of each input operand is accessed.
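/// For illustration, the v8 mask <2,12,0,15,u,u,u,u> (upper half undef)
/// produces HalfMask <2,4,0,7> with HalfIdx1 = 0 (lower half of V1) and
/// HalfIdx2 = 3 (upper half of V2).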
15526static bool
15527getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15528 int &HalfIdx1, int &HalfIdx2) {
15529 assert((Mask.size() == HalfMask.size() * 2) &&
15530 "Expected input mask to be twice as long as output");
15531
15532 // Exactly one half of the result must be undef to allow narrowing.
15533 bool UndefLower = isUndefLowerHalf(Mask);
15534 bool UndefUpper = isUndefUpperHalf(Mask);
15535 if (UndefLower == UndefUpper)
15536 return false;
15537
15538 unsigned HalfNumElts = HalfMask.size();
15539 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15540 HalfIdx1 = -1;
15541 HalfIdx2 = -1;
15542 for (unsigned i = 0; i != HalfNumElts; ++i) {
15543 int M = Mask[i + MaskIndexOffset];
15544 if (M < 0) {
15545 HalfMask[i] = M;
15546 continue;
15547 }
15548
15549 // Determine which of the 4 half vectors this element is from.
15550 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15551 int HalfIdx = M / HalfNumElts;
15552
15553 // Determine the element index into its half vector source.
15554 int HalfElt = M % HalfNumElts;
15555
15556 // We can shuffle with up to 2 half vectors, set the new 'half'
15557 // shuffle mask accordingly.
15558 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15559 HalfMask[i] = HalfElt;
15560 HalfIdx1 = HalfIdx;
15561 continue;
15562 }
15563 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15564 HalfMask[i] = HalfElt + HalfNumElts;
15565 HalfIdx2 = HalfIdx;
15566 continue;
15567 }
15568
15569 // Too many half vectors referenced.
15570 return false;
15571 }
15572
15573 return true;
15574}
15575
15576/// Given the output values from getHalfShuffleMask(), create a half width
15577/// shuffle of extracted vectors followed by an insert back to full width.
15578static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15579 ArrayRef<int> HalfMask, int HalfIdx1,
15580 int HalfIdx2, bool UndefLower,
15581 SelectionDAG &DAG, bool UseConcat = false) {
15582 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15583 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15584
15585 MVT VT = V1.getSimpleValueType();
15586 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15587 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15588
15589 auto getHalfVector = [&](int HalfIdx) {
15590 if (HalfIdx < 0)
15591 return DAG.getUNDEF(HalfVT);
15592 SDValue V = (HalfIdx < 2 ? V1 : V2);
15593 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15595 DAG.getVectorIdxConstant(HalfIdx, DL));
15596 };
15597
15598 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15599 SDValue Half1 = getHalfVector(HalfIdx1);
15600 SDValue Half2 = getHalfVector(HalfIdx2);
15601 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15602 if (UseConcat) {
15603 SDValue Op0 = V;
15604 SDValue Op1 = DAG.getUNDEF(HalfVT);
15605 if (UndefLower)
15606 std::swap(Op0, Op1);
15607 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15608 }
15609
15610 unsigned Offset = UndefLower ? HalfNumElts : 0;
15611 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15612 DAG.getVectorIdxConstant(Offset, DL));
15613}
15614
15615/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15616/// This allows for fast cases such as subvector extraction/insertion
15617/// or shuffling smaller vector types which can lower more efficiently.
15618static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15619 SDValue V2, ArrayRef<int> Mask,
15620 const X86Subtarget &Subtarget,
15621 SelectionDAG &DAG) {
15622 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15623 "Expected 256-bit or 512-bit vector");
15624
15625 bool UndefLower = isUndefLowerHalf(Mask);
15626 if (!UndefLower && !isUndefUpperHalf(Mask))
15627 return SDValue();
15628
15629 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15630 "Completely undef shuffle mask should have been simplified already");
15631
15632 // Upper half is undef and lower half is whole upper subvector.
15633 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15634 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15635 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15636 if (!UndefLower &&
15637 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15638 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15639 DAG.getVectorIdxConstant(HalfNumElts, DL));
15640 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15641 DAG.getVectorIdxConstant(0, DL));
15642 }
15643
15644 // Lower half is undef and upper half is whole lower subvector.
15645 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15646 if (UndefLower &&
15647 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15648 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15649 DAG.getVectorIdxConstant(0, DL));
15650 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15651 DAG.getVectorIdxConstant(HalfNumElts, DL));
15652 }
15653
15654 int HalfIdx1, HalfIdx2;
15655 SmallVector<int, 8> HalfMask(HalfNumElts);
15656 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15657 return SDValue();
15658
15659 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15660
15661 // Only shuffle the halves of the inputs when useful.
15662 unsigned NumLowerHalves =
15663 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15664 unsigned NumUpperHalves =
15665 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15666 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15667
15668 // Determine the larger pattern of undef/halves, then decide if it's worth
15669 // splitting the shuffle based on subtarget capabilities and types.
15670 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15671 if (!UndefLower) {
15672 // XXXXuuuu: no insert is needed.
15673 // Always extract lowers when setting lower - these are all free subreg ops.
15674 if (NumUpperHalves == 0)
15675 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15676 UndefLower, DAG);
15677
15678 if (NumUpperHalves == 1) {
15679 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15680 if (Subtarget.hasAVX2()) {
15681 // extract128 + vunpckhps/vshufps is better than vblend + vpermps.
15682 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15683 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15684 (!isSingleSHUFPSMask(HalfMask) ||
15685 Subtarget.hasFastVariableCrossLaneShuffle()))
15686 return SDValue();
15687 // If this is a unary shuffle (assume that the 2nd operand is
15688 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15689 // are better off extracting the upper half of 1 operand and using a
15690 // narrow shuffle.
15691 if (EltWidth == 64 && V2.isUndef())
15692 return SDValue();
15693 // If this is a unary vXi8 shuffle with in-place halves, then perform as
15694 // full width pshufb, and then merge.
15695 if (EltWidth == 8 && HalfIdx1 == 0 && HalfIdx2 == 1)
15696 return SDValue();
15697 }
15698 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15699 if (Subtarget.hasAVX512() && VT.is512BitVector())
15700 return SDValue();
15701 // Extract + narrow shuffle is better than the wide alternative.
15702 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15703 UndefLower, DAG);
15704 }
15705
15706 // Don't extract both uppers, instead shuffle and then extract.
15707 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15708 return SDValue();
15709 }
15710
15711 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15712 if (NumUpperHalves == 0) {
15713 // AVX2 has efficient 64-bit element cross-lane shuffles.
15714 // TODO: Refine to account for unary shuffle, splat, and other masks?
15715 if (Subtarget.hasAVX2() && EltWidth == 64)
15716 return SDValue();
15717 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15718 if (Subtarget.hasAVX512() && VT.is512BitVector())
15719 return SDValue();
15720 // Narrow shuffle + insert is better than the wide alternative.
15721 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15722 UndefLower, DAG);
15723 }
15724
15725 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15726 return SDValue();
15727}
15728
15729/// Handle case where shuffle sources are coming from the same 128-bit lane and
15730/// every lane can be represented as the same repeating mask - allowing us to
15731/// shuffle the sources with the repeating shuffle and then permute the result
15732/// to the destination lanes.
15733static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15734 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15735 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15736 int NumElts = VT.getVectorNumElements();
15737 int NumLanes = VT.getSizeInBits() / 128;
15738 int NumLaneElts = NumElts / NumLanes;
15739
15740 // On AVX2 we may be able to just shuffle the lowest elements and then
15741 // broadcast the result.
15742 if (Subtarget.hasAVX2()) {
15743 for (unsigned BroadcastSize : {16, 32, 64}) {
15744 if (BroadcastSize <= VT.getScalarSizeInBits())
15745 continue;
15746 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15747
15748 // Attempt to match a repeating pattern every NumBroadcastElts,
15749 // accounting for UNDEFs, but only referencing the lowest 128-bit
15750 // lane of the inputs.
15751 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15752 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15753 for (int j = 0; j != NumBroadcastElts; ++j) {
15754 int M = Mask[i + j];
15755 if (M < 0)
15756 continue;
15757 int &R = RepeatMask[j];
15758 if (0 != ((M % NumElts) / NumLaneElts))
15759 return false;
15760 if (0 <= R && R != M)
15761 return false;
15762 R = M;
15763 }
15764 return true;
15765 };
15766
15767 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15768 if (!FindRepeatingBroadcastMask(RepeatMask))
15769 continue;
15770
15771 // Shuffle the (lowest) repeated elements in place for broadcast.
15772 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15773
15774 // Shuffle the actual broadcast.
15775 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15776 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15777 for (int j = 0; j != NumBroadcastElts; ++j)
15778 BroadcastMask[i + j] = j;
15779
15780 // Avoid returning the same shuffle operation. For example,
15781 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15782 if (BroadcastMask == Mask)
15783 return SDValue();
15784
15785 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15786 BroadcastMask);
15787 }
15788 }
15789
15790 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15791 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15792 return SDValue();
15793
15794 // Bail if we already have a repeated lane shuffle mask.
15795 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15796 return SDValue();
15797
15798 // Helper to look for repeated mask in each split sublane, and that those
15799 // sublanes can then be permuted into place.
15800 auto ShuffleSubLanes = [&](int SubLaneScale) {
15801 int NumSubLanes = NumLanes * SubLaneScale;
15802 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15803
15804 // Check that all the sources are coming from the same lane and see if we
15805 // can form a repeating shuffle mask (local to each sub-lane). At the same
15806 // time, determine the source sub-lane for each destination sub-lane.
15807 int TopSrcSubLane = -1;
15808 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15809 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15810 SubLaneScale,
15811 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15812
15813 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15814 // Extract the sub-lane mask, check that it all comes from the same lane
15815 // and normalize the mask entries to come from the first lane.
15816 int SrcLane = -1;
15817 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15818 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15819 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15820 if (M < 0)
15821 continue;
15822 int Lane = (M % NumElts) / NumLaneElts;
15823 if ((0 <= SrcLane) && (SrcLane != Lane))
15824 return SDValue();
15825 SrcLane = Lane;
15826 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15827 SubLaneMask[Elt] = LocalM;
15828 }
15829
15830 // Whole sub-lane is UNDEF.
15831 if (SrcLane < 0)
15832 continue;
15833
15834 // Attempt to match against the candidate repeated sub-lane masks.
15835 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15836 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15837 for (int i = 0; i != NumSubLaneElts; ++i) {
15838 if (M1[i] < 0 || M2[i] < 0)
15839 continue;
15840 if (M1[i] != M2[i])
15841 return false;
15842 }
15843 return true;
15844 };
15845
15846 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15847 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15848 continue;
15849
15850 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15851 for (int i = 0; i != NumSubLaneElts; ++i) {
15852 int M = SubLaneMask[i];
15853 if (M < 0)
15854 continue;
15855 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15856 "Unexpected mask element");
15857 RepeatedSubLaneMask[i] = M;
15858 }
15859
15860 // Track the topmost source sub-lane - by setting the remaining to
15861 // UNDEF we can greatly simplify shuffle matching.
15862 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15863 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15864 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15865 break;
15866 }
15867
15868 // Bail if we failed to find a matching repeated sub-lane mask.
15869 if (Dst2SrcSubLanes[DstSubLane] < 0)
15870 return SDValue();
15871 }
15872 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15873 "Unexpected source lane");
15874
15875 // Create a repeating shuffle mask for the entire vector.
15876 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15877 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15878 int Lane = SubLane / SubLaneScale;
15879 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15880 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15881 int M = RepeatedSubLaneMask[Elt];
15882 if (M < 0)
15883 continue;
15884 int Idx = (SubLane * NumSubLaneElts) + Elt;
15885 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15886 }
15887 }
15888
15889 // Shuffle each source sub-lane to its destination.
15890 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15891 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15892 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15893 if (SrcSubLane < 0)
15894 continue;
15895 for (int j = 0; j != NumSubLaneElts; ++j)
15896 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15897 }
15898
15899 // Avoid returning the same shuffle operation.
15900 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15901 if (RepeatedMask == Mask || SubLaneMask == Mask)
15902 return SDValue();
15903
15904 SDValue RepeatedShuffle =
15905 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15906
15907 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15908 SubLaneMask);
15909 };
15910
15911 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15912 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15913 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15914 // Otherwise we can only permute whole 128-bit lanes.
15915 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15916 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15917 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15918 MinSubLaneScale = 2;
15919 MaxSubLaneScale =
15920 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15921 }
15922 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15923 MinSubLaneScale = MaxSubLaneScale = 4;
15924
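// For example, a single-input v32i8 shuffle on AVX2 tries Scale = 2 (64-bit
// sub-lanes, i.e. PERMQ/PERMPD granularity) and then Scale = 4 (32-bit
// sub-lanes), while v64i8 with AVX512BW only tries Scale = 4.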
15925 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15926 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15927 return Shuffle;
15928
15929 return SDValue();
15930}
15931
15932 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15933 bool &ForceV1Zero, bool &ForceV2Zero,
15934 unsigned &ShuffleImm, ArrayRef<int> Mask,
15935 const APInt &Zeroable) {
15936 int NumElts = VT.getVectorNumElements();
15937 assert(VT.getScalarSizeInBits() == 64 &&
15938 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15939 "Unexpected data type for VSHUFPD");
15940 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15941 "Illegal shuffle mask");
15942
15943 bool ZeroLane[2] = { true, true };
15944 for (int i = 0; i < NumElts; ++i)
15945 ZeroLane[i & 1] &= Zeroable[i];
15946
15947 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
15948 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
15949 bool IsSHUFPD = true;
15950 bool IsCommutable = true;
15951 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15952 for (int i = 0; i < NumElts; ++i) {
15953 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15954 continue;
15955 if (Mask[i] < 0)
15956 return false;
15957 int Val = (i & 6) + NumElts * (i & 1);
15958 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15959 if (Mask[i] < Val || Mask[i] > Val + 1)
15960 IsSHUFPD = false;
15961 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15962 IsCommutable = false;
15963 SHUFPDMask[i] = Mask[i] % 2;
15964 }
15965
15966 if (!IsSHUFPD && !IsCommutable)
15967 return false;
15968
15969 if (!IsSHUFPD && IsCommutable)
15970 std::swap(V1, V2);
15971
15972 ForceV1Zero = ZeroLane[0];
15973 ForceV2Zero = ZeroLane[1];
15974 ShuffleImm = getSHUFPDImm(SHUFPDMask);
15975 return true;
15976}
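// For example, for MVT::v4f64 with Mask = {1, 5, 2, 7} and nothing zeroable,
// every element stays within its 128-bit lane and alternates V1/V2 as SHUFPD
// expects, so the match succeeds with SHUFPDMask = {1, 1, 0, 1}; assuming
// getSHUFPDImm() packs element i into immediate bit i, ShuffleImm is 0xB.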
15977
15978 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15979 SDValue V2, ArrayRef<int> Mask,
15980 const APInt &Zeroable,
15981 const X86Subtarget &Subtarget,
15982 SelectionDAG &DAG) {
15983 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15984 "Unexpected data type for VSHUFPD");
15985
15986 unsigned Immediate = 0;
15987 bool ForceV1Zero = false, ForceV2Zero = false;
15988 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15989 Mask, Zeroable))
15990 return SDValue();
15991
15992 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15993 if (ForceV1Zero)
15994 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15995 if (ForceV2Zero)
15996 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15997
15998 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15999 DAG.getTargetConstant(Immediate, DL, MVT::i8));
16000}
16001
16002 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16003 // by zeroable elements in the remaining 24 elements. Turn this into two
16004// vmovqb instructions shuffled together.
16005 static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
16006 SDValue V1, SDValue V2,
16007 ArrayRef<int> Mask,
16008 const APInt &Zeroable,
16009 SelectionDAG &DAG) {
16010 assert(VT == MVT::v32i8 && "Unexpected type!");
16011
16012 // The first 8 indices should be every 8th element.
16013 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
16014 return SDValue();
16015
16016 // Remaining elements need to be zeroable.
16017 if (Zeroable.countl_one() < (Mask.size() - 8))
16018 return SDValue();
16019
16020 V1 = DAG.getBitcast(MVT::v4i64, V1);
16021 V2 = DAG.getBitcast(MVT::v4i64, V2);
16022
16023 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16024 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16025
16026 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16027 // the upper bits of the result using an unpckldq.
16028 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16029 { 0, 1, 2, 3, 16, 17, 18, 19,
16030 4, 5, 6, 7, 20, 21, 22, 23 });
16031 // Insert the unpckldq into a zero vector to widen to v32i8.
16032 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16033 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16034 DAG.getVectorIdxConstant(0, DL));
16035}
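// For example, for a v32i8 mask beginning {0, 8, 16, 24, 32, 40, 48, 56} with
// the rest zeroable, each VTRUNC keeps byte 0 of every 64-bit element (4 bytes
// from V1, 4 from V2), the v16i8 shuffle interleaves those two 4-byte groups as
// an unpckldq, and the INSERT_SUBVECTOR into a zero vector provides the zeroed
// upper half of the v32i8 result.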
16036
16037// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16038// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16039// =>
16040// ul = unpckl v1, v2
16041// uh = unpckh v1, v2
16042// a = vperm ul, uh
16043// b = vperm ul, uh
16044//
16045// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16046// and permute. We cannot directly match v3 because it is split into two
16047// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16048// pair of 256-bit shuffles and makes sure the masks are consecutive.
16049//
16050// Once unpck and permute nodes are created, the permute corresponding to this
16051// shuffle is returned, while the other permute replaces the other half of the
16052// shuffle in the selection dag.
16053 static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16054 SDValue V1, SDValue V2,
16055 ArrayRef<int> Mask,
16056 SelectionDAG &DAG) {
16057 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16058 VT != MVT::v32i8)
16059 return SDValue();
16060 // <B0, B1, B0+1, B1+1, ..., >
16061 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16062 unsigned Begin1) {
16063 size_t Size = Mask.size();
16064 assert(Size % 2 == 0 && "Expected even mask size");
16065 for (unsigned I = 0; I < Size; I += 2) {
16066 if (Mask[I] != (int)(Begin0 + I / 2) ||
16067 Mask[I + 1] != (int)(Begin1 + I / 2))
16068 return false;
16069 }
16070 return true;
16071 };
16072 // Check which half of the interleave this shuffle node is.
16073 int NumElts = VT.getVectorNumElements();
16074 size_t FirstQtr = NumElts / 2;
16075 size_t ThirdQtr = NumElts + NumElts / 2;
16076 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16077 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16078 if (!IsFirstHalf && !IsSecondHalf)
16079 return SDValue();
16080
16081 // Find the intersection between shuffle users of V1 and V2.
16082 SmallVector<SDNode *, 2> Shuffles;
16083 for (SDNode *User : V1->users())
16084 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16085 User->getOperand(1) == V2)
16086 Shuffles.push_back(User);
16087 // Limit user size to two for now.
16088 if (Shuffles.size() != 2)
16089 return SDValue();
16090 // Find out which half of the 512-bit shuffle each smaller shuffle is.
16091 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16092 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16093 SDNode *FirstHalf;
16094 SDNode *SecondHalf;
16095 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16096 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16097 FirstHalf = Shuffles[0];
16098 SecondHalf = Shuffles[1];
16099 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16100 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16101 FirstHalf = Shuffles[1];
16102 SecondHalf = Shuffles[0];
16103 } else {
16104 return SDValue();
16105 }
16106 // Lower into unpck and perm. Return the perm of this shuffle and replace
16107 // the other.
16108 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16109 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16110 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16111 DAG.getTargetConstant(0x20, DL, MVT::i8));
16112 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16113 DAG.getTargetConstant(0x31, DL, MVT::i8));
16114 if (IsFirstHalf) {
16115 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16116 return Perm1;
16117 }
16118 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16119 return Perm2;
16120}
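// For example, for v8f32 the first-half pattern is mask <0,8,1,9,2,10,3,11>
// and the second-half pattern is <4,12,5,13,6,14,7,15>; the pair is rewritten
// as UNPCKL/UNPCKH of V1 and V2 followed by VPERM2X128 with immediates 0x20
// and 0x31 respectively.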
16121
16122/// Handle lowering of 4-lane 64-bit floating point shuffles.
16123///
16124/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16125/// isn't available.
16126 static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16127 const APInt &Zeroable, SDValue V1, SDValue V2,
16128 const X86Subtarget &Subtarget,
16129 SelectionDAG &DAG) {
16130 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16131 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16132 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16133
16134 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16135 Subtarget, DAG))
16136 return V;
16137
16138 if (V2.isUndef()) {
16139 // Check for being able to broadcast a single element.
16140 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16141 Mask, Subtarget, DAG))
16142 return Broadcast;
16143
16144 // Use low duplicate instructions for masks that match their pattern.
16145 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16146 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16147
16148 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16149 // Non-half-crossing single input shuffles can be lowered with an
16150 // interleaved permutation.
16151 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16152 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16153 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16154 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16155 }
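// For example, the in-lane mask <1,0,3,2> sets bits 0 and 2 of VPERMILPMask
// above, giving immediate 0x5, which swaps the two doubles within each
// 128-bit lane.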
16156
16157 // With AVX2 we have direct support for this permutation.
16158 if (Subtarget.hasAVX2())
16159 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16160 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16161
16162 // Try to create an in-lane repeating shuffle mask and then shuffle the
16163 // results into the target lanes.
16164 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16165 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16166 return V;
16167
16168 // Try to permute the lanes and then use a per-lane permute.
16169 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16170 Mask, DAG, Subtarget))
16171 return V;
16172
16173 // Otherwise, fall back.
16174 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16175 DAG, Subtarget);
16176 }
16177
16178 // Use dedicated unpack instructions for masks that match their pattern.
16179 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16180 return V;
16181
16182 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16183 Zeroable, Subtarget, DAG))
16184 return Blend;
16185
16186 // Check if the blend happens to exactly fit that of SHUFPD.
16187 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16188 Zeroable, Subtarget, DAG))
16189 return Op;
16190
16191 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16192 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16193
16194 // If we have lane crossing shuffles AND they don't all come from the lower
16195 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16196 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16197 // canonicalize to a blend of splat which isn't necessary for this combine.
16198 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16199 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16200 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16201 (V2.getOpcode() != ISD::BUILD_VECTOR))
16202 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16203
16204 // If we have one input in place, then we can permute the other input and
16205 // blend the result.
16206 if (V1IsInPlace || V2IsInPlace)
16207 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16208 Zeroable, Subtarget, DAG);
16209
16210 // Try to create an in-lane repeating shuffle mask and then shuffle the
16211 // results into the target lanes.
16212 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16213 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16214 return V;
16215
16216 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16217 // shuffle. However, if we have AVX2 and either input is already in place,
16218 // we will be able to shuffle the other input even across lanes in a single
16219 // instruction, so skip this pattern.
16220 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16221 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16222 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16223 return V;
16224
16225 // If we have VLX support, we can use VEXPAND.
16226 if (Subtarget.hasVLX())
16227 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16228 Zeroable, Subtarget, DAG))
16229 return V;
16230
16231 // If we have AVX2 then we always want to lower with a blend because at v4 we
16232 // can fully permute the elements.
16233 if (Subtarget.hasAVX2())
16234 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16235 Zeroable, Subtarget, DAG);
16236
16237 // Otherwise fall back on generic lowering.
16238 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16239 Subtarget, DAG);
16240}
16241
16242/// Handle lowering of 4-lane 64-bit integer shuffles.
16243///
16244/// This routine is only called when we have AVX2 and thus a reasonable
16245 /// instruction set for v4i64 shuffling.
16246 static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16247 const APInt &Zeroable, SDValue V1, SDValue V2,
16248 const X86Subtarget &Subtarget,
16249 SelectionDAG &DAG) {
16250 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16251 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16252 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16253 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16254
16255 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG))
16257 return V;
16258
16259 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16260 Zeroable, Subtarget, DAG))
16261 return Blend;
16262
16263 // Check for being able to broadcast a single element.
16264 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16265 Subtarget, DAG))
16266 return Broadcast;
16267
16268 // Try to use shift instructions if fast.
16269 if (Subtarget.preferLowerShuffleAsShift())
16270 if (SDValue Shift =
16271 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16272 Subtarget, DAG, /*BitwiseOnly*/ true))
16273 return Shift;
16274
16275 if (V2.isUndef()) {
16276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16277 // can use lower latency instructions that will operate on both lanes.
16278 SmallVector<int, 2> RepeatedMask;
16279 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16280 SmallVector<int, 4> PSHUFDMask;
16281 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16282 return DAG.getBitcast(
16283 MVT::v4i64,
16284 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16285 DAG.getBitcast(MVT::v8i32, V1),
16286 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16287 }
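// For example, the lane-repeated v4i64 mask <1,0,3,2> has RepeatedMask =
// {1,0}, which widens to the v8i32 PSHUFD mask {2,3,0,1} (the classic
// PSHUFD 0x4E swap), exchanging the two 64-bit elements within each lane.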
16288
16289 // AVX2 provides a direct instruction for permuting a single input across
16290 // lanes.
16291 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16292 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16293 }
16294
16295 // Try to use shift instructions.
16296 if (SDValue Shift =
16297 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16298 DAG, /*BitwiseOnly*/ false))
16299 return Shift;
16300
16301 // If we have VLX support, we can use VALIGN or VEXPAND.
16302 if (Subtarget.hasVLX()) {
16303 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16304 Zeroable, Subtarget, DAG))
16305 return Rotate;
16306
16307 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16308 Zeroable, Subtarget, DAG))
16309 return V;
16310 }
16311
16312 // Try to use PALIGNR.
16313 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16314 Subtarget, DAG))
16315 return Rotate;
16316
16317 // Use dedicated unpack instructions for masks that match their pattern.
16318 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16319 return V;
16320
16321 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16322 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16323
16324 // If we have one input in place, then we can permute the other input and
16325 // blend the result.
16326 if (V1IsInPlace || V2IsInPlace)
16327 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16328 Zeroable, Subtarget, DAG);
16329
16330 // Try to create an in-lane repeating shuffle mask and then shuffle the
16331 // results into the target lanes.
16332 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16333 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16334 return V;
16335
16336 // Try to lower to PERMQ(BLENDD(V1,V2)).
16337 if (SDValue V =
16338 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16339 return V;
16340
16341 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16342 // shuffle. However, if we have AVX2 and either input is already in place,
16343 // we will be able to shuffle the other input even across lanes in a single
16344 // instruction, so skip this pattern.
16345 if (!V1IsInPlace && !V2IsInPlace)
16346 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16347 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16348 return Result;
16349
16350 // Otherwise fall back on generic blend lowering.
16351 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG);
16353}
16354
16355/// Handle lowering of 8-lane 32-bit floating point shuffles.
16356///
16357/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16358/// isn't available.
16359 static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16360 const APInt &Zeroable, SDValue V1, SDValue V2,
16361 const X86Subtarget &Subtarget,
16362 SelectionDAG &DAG) {
16363 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16364 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16365 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16366
16367 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16368 Zeroable, Subtarget, DAG))
16369 return Blend;
16370
16371 // Check for being able to broadcast a single element.
16372 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16373 Subtarget, DAG))
16374 return Broadcast;
16375
16376 if (!Subtarget.hasAVX2()) {
16377 SmallVector<int> InLaneMask;
16378 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16379
16380 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16381 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16382 /*SimpleOnly*/ true))
16383 return R;
16384 }
16385 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16386 Zeroable, Subtarget, DAG))
16387 return DAG.getBitcast(MVT::v8f32, ZExt);
16388
16389 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16390 // options to efficiently lower the shuffle.
16391 SmallVector<int, 4> RepeatedMask;
16392 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16393 assert(RepeatedMask.size() == 4 &&
16394 "Repeated masks must be half the mask width!");
16395
16396 // Use even/odd duplicate instructions for masks that match their pattern.
16397 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16398 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16399 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16400 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
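// For example, the full v8f32 mask <0,0,2,2,4,4,6,6> repeats as <0,0,2,2> in
// each 128-bit lane and becomes a single MOVSLDUP; <1,1,3,3,5,5,7,7> likewise
// becomes MOVSHDUP.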
16401
16402 if (V2.isUndef())
16403 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16404 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16405
16406 // Use dedicated unpack instructions for masks that match their pattern.
16407 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16408 return V;
16409
16410 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16411 // have already handled any direct blends.
16412 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16413 }
16414
16415 // Try to create an in-lane repeating shuffle mask and then shuffle the
16416 // results into the target lanes.
16417 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16418 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16419 return V;
16420
16421 // If we have a single input shuffle with different shuffle patterns in the
16422 // two 128-bit lanes use the variable mask to VPERMILPS.
16423 if (V2.isUndef()) {
16424 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16425 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16426 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16427 }
16428 if (Subtarget.hasAVX2()) {
16429 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16430 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16431 }
16432 // Otherwise, fall back.
16433 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16434 DAG, Subtarget);
16435 }
16436
16437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16438 // shuffle.
16439 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16440 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16441 return Result;
16442
16443 // If we have VLX support, we can use VEXPAND.
16444 if (Subtarget.hasVLX())
16445 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16446 Zeroable, Subtarget, DAG))
16447 return V;
16448
16449 // Try to match an interleave of two v8f32s and lower them as unpck and
16450 // permutes using ymms. This needs to go before we try to split the vectors.
16451 //
16452 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
16453 // this path inadvertently.
16454 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16455 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16456 Mask, DAG))
16457 return V;
16458
16459 // For non-AVX512, if the mask is of 16-bit elements in lane, then try to split,
16460 // since after the split we get more efficient code using vpunpcklwd and
16461 // vpunpckhwd instructions than with vblend.
16462 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16463 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16464 Subtarget, DAG);
16465
16466 // If we have AVX2 then we always want to lower with a blend because at v8 we
16467 // can fully permute the elements.
16468 if (Subtarget.hasAVX2())
16469 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16470 Zeroable, Subtarget, DAG);
16471
16472 // Otherwise fall back on generic lowering.
16473 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16474 Subtarget, DAG);
16475}
16476
16477/// Handle lowering of 8-lane 32-bit integer shuffles.
16478///
16479/// This routine is only called when we have AVX2 and thus a reasonable
16480 /// instruction set for v8i32 shuffling.
16481 static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16482 const APInt &Zeroable, SDValue V1, SDValue V2,
16483 const X86Subtarget &Subtarget,
16484 SelectionDAG &DAG) {
16485 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16486 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16487 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16488 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16489
16490 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16491
16492 // Whenever we can lower this as a zext, that instruction is strictly faster
16493 // than any alternative. It also allows us to fold memory operands into the
16494 // shuffle in many cases.
16495 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16496 Zeroable, Subtarget, DAG))
16497 return ZExt;
16498
16499 // Try to match an interleave of two v8i32s and lower them as unpck and
16500 // permutes using ymms. This needs to go before we try to split the vectors.
16501 if (!Subtarget.hasAVX512())
16502 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16503 Mask, DAG))
16504 return V;
16505
16506 // For non-AVX512, if the mask is of 16-bit elements in lane, then try to split,
16507 // since after the split we get more efficient code than with vblend by using
16508 // vpunpcklwd and vpunpckhwd instructions.
16509 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16510 !Subtarget.hasAVX512())
16511 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16512 Subtarget, DAG);
16513
16514 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16515 Zeroable, Subtarget, DAG))
16516 return Blend;
16517
16518 // Check for being able to broadcast a single element.
16519 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16520 Subtarget, DAG))
16521 return Broadcast;
16522
16523 // Try to use shift instructions if fast.
16524 if (Subtarget.preferLowerShuffleAsShift()) {
16525 if (SDValue Shift =
16526 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16527 Subtarget, DAG, /*BitwiseOnly*/ true))
16528 return Shift;
16529 if (NumV2Elements == 0)
16530 if (SDValue Rotate =
16531 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16532 return Rotate;
16533 }
16534
16535 // If the shuffle mask is repeated in each 128-bit lane we can use more
16536 // efficient instructions that mirror the shuffles across the two 128-bit
16537 // lanes.
16538 SmallVector<int, 4> RepeatedMask;
16539 bool Is128BitLaneRepeatedShuffle =
16540 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16541 if (Is128BitLaneRepeatedShuffle) {
16542 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16543 if (V2.isUndef())
16544 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16545 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16546
16547 // Use dedicated unpack instructions for masks that match their pattern.
16548 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16549 return V;
16550 }
16551
16552 // Try to use shift instructions.
16553 if (SDValue Shift =
16554 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16555 DAG, /*BitwiseOnly*/ false))
16556 return Shift;
16557
16558 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16559 if (SDValue Rotate =
16560 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16561 return Rotate;
16562
16563 // If we have VLX support, we can use VALIGN or EXPAND.
16564 if (Subtarget.hasVLX()) {
16565 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16566 Zeroable, Subtarget, DAG))
16567 return Rotate;
16568
16569 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16570 Zeroable, Subtarget, DAG))
16571 return V;
16572 }
16573
16574 // Try to use byte rotation instructions.
16575 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16576 Subtarget, DAG))
16577 return Rotate;
16578
16579 // Try to create an in-lane repeating shuffle mask and then shuffle the
16580 // results into the target lanes.
16581 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16582 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16583 return V;
16584
16585 if (V2.isUndef()) {
16586 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16587 // because that should be faster than the variable permute alternatives.
16588 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16589 return V;
16590
16591 // If the shuffle patterns aren't repeated but it's a single input, directly
16592 // generate a cross-lane VPERMD instruction.
16593 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16594 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16595 }
16596
16597 // Assume that a single SHUFPS is faster than an alternative sequence of
16598 // multiple instructions (even if the CPU has a domain penalty).
16599 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16600 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16601 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16602 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16603 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16604 CastV1, CastV2, DAG);
16605 return DAG.getBitcast(MVT::v8i32, ShufPS);
16606 }
16607
16608 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16609 // shuffle.
16610 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16611 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16612 return Result;
16613
16614 // Otherwise fall back on generic blend lowering.
16615 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16616 Zeroable, Subtarget, DAG);
16617}
16618
16619/// Handle lowering of 16-lane 16-bit integer shuffles.
16620///
16621/// This routine is only called when we have AVX2 and thus a reasonable
16622 /// instruction set for v16i16 shuffling.
16623 static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16624 const APInt &Zeroable, SDValue V1, SDValue V2,
16625 const X86Subtarget &Subtarget,
16626 SelectionDAG &DAG) {
16627 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16628 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16629 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16630 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16631
16632 // Whenever we can lower this as a zext, that instruction is strictly faster
16633 // than any alternative. It also allows us to fold memory operands into the
16634 // shuffle in many cases.
16635 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16636 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16637 return ZExt;
16638
16639 // Check for being able to broadcast a single element.
16640 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16641 Subtarget, DAG))
16642 return Broadcast;
16643
16644 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16645 Zeroable, Subtarget, DAG))
16646 return Blend;
16647
16648 // Use dedicated unpack instructions for masks that match their pattern.
16649 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16650 return V;
16651
16652 // Use dedicated pack instructions for masks that match their pattern.
16653 if (SDValue V =
16654 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16655 return V;
16656
16657 // Try to lower using a truncation.
16658 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16659 Subtarget, DAG))
16660 return V;
16661
16662 // Try to use shift instructions.
16663 if (SDValue Shift =
16664 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16665 Subtarget, DAG, /*BitwiseOnly*/ false))
16666 return Shift;
16667
16668 // Try to use byte rotation instructions.
16669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16670 Subtarget, DAG))
16671 return Rotate;
16672
16673 // Try to create an in-lane repeating shuffle mask and then shuffle the
16674 // results into the target lanes.
16675 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16676 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16677 return V;
16678
16679 if (V2.isUndef()) {
16680 // Try to use bit rotation instructions.
16681 if (SDValue Rotate =
16682 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16683 return Rotate;
16684
16685 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16686 // because that should be faster than the variable permute alternatives.
16687 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16688 return V;
16689
16690 // There are no generalized cross-lane shuffle operations available on i16
16691 // element types.
16692 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16693 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16694 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16695 return V;
16696
16697 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16698 DAG, Subtarget);
16699 }
16700
16701 SmallVector<int, 8> RepeatedMask;
16702 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16703 // As this is a single-input shuffle, the repeated mask should be
16704 // a strictly valid v8i16 mask that we can pass through to the v8i16
16705 // lowering to handle even the v16 case.
16706 return lowerV8I16GeneralSingleInputShuffle(
16707 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16708 }
16709 }
16710
16711 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16712 Zeroable, Subtarget, DAG))
16713 return PSHUFB;
16714
16715 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16716 if (Subtarget.hasBWI())
16717 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16718
16719 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16720 // shuffle.
16721 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16722 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16723 return Result;
16724
16725 // Try to permute the lanes and then use a per-lane permute.
16726 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16727 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16728 return V;
16729
16730 // Try to match an interleave of two v16i16s and lower them as unpck and
16731 // permutes using ymms.
16732 if (!Subtarget.hasAVX512())
16733 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16734 Mask, DAG))
16735 return V;
16736
16737 // Otherwise fall back on generic lowering.
16738 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16739 Subtarget, DAG);
16740}
16741
16742/// Handle lowering of 32-lane 8-bit integer shuffles.
16743///
16744/// This routine is only called when we have AVX2 and thus a reasonable
16745 /// instruction set for v32i8 shuffling.
16746 static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16747 const APInt &Zeroable, SDValue V1, SDValue V2,
16748 const X86Subtarget &Subtarget,
16749 SelectionDAG &DAG) {
16750 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16751 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16752 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16753 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16754
16755 // Whenever we can lower this as a zext, that instruction is strictly faster
16756 // than any alternative. It also allows us to fold memory operands into the
16757 // shuffle in many cases.
16758 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16759 Zeroable, Subtarget, DAG))
16760 return ZExt;
16761
16762 // Check for being able to broadcast a single element.
16763 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16764 Subtarget, DAG))
16765 return Broadcast;
16766
16767 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16768 Zeroable, Subtarget, DAG))
16769 return Blend;
16770
16771 // Use dedicated unpack instructions for masks that match their pattern.
16772 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
16773 return V;
16774
16775 // Use dedicated pack instructions for masks that match their pattern.
16776 if (SDValue V =
16777 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779
16780 // Try to lower using a truncation.
16781 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16782 Subtarget, DAG))
16783 return V;
16784
16785 // Try to use shift instructions.
16786 if (SDValue Shift =
16787 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16788 DAG, /*BitwiseOnly*/ false))
16789 return Shift;
16790
16791 // Try to use byte rotation instructions.
16792 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16793 Subtarget, DAG))
16794 return Rotate;
16795
16796 // Try to use bit rotation instructions.
16797 if (V2.isUndef())
16798 if (SDValue Rotate =
16799 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16800 return Rotate;
16801
16802 // Try to create an in-lane repeating shuffle mask and then shuffle the
16803 // results into the target lanes.
16804 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16805 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16806 return V;
16807
16808 // There are no generalized cross-lane shuffle operations available on i8
16809 // element types.
16810 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16811 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16812 // because that should be faster than the variable permute alternatives.
16813 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
16814 return V;
16815
16816 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16817 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16818 return V;
16819
16820 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16821 DAG, Subtarget);
16822 }
16823
16824 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16825 Zeroable, Subtarget, DAG))
16826 return PSHUFB;
16827
16828 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16829 if (Subtarget.hasVBMI())
16830 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16831
16832 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16833 // shuffle.
16834 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16835 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16836 return Result;
16837
16838 // Try to permute the lanes and then use a per-lane permute.
16839 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16840 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16841 return V;
16842
16843 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
16844 // by zeroable elements in the remaining 24 elements. Turn this into two
16845 // vmovqb instructions shuffled together.
16846 if (Subtarget.hasVLX())
16847 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16848 Mask, Zeroable, DAG))
16849 return V;
16850
16851 // Try to match an interleave of two v32i8s and lower them as unpck and
16852 // permutes using ymms.
16853 if (!Subtarget.hasAVX512())
16854 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16855 Mask, DAG))
16856 return V;
16857
16858 // Otherwise fall back on generic lowering.
16859 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16860 Subtarget, DAG);
16861}
16862
16863/// High-level routine to lower various 256-bit x86 vector shuffles.
16864///
16865/// This routine either breaks down the specific type of a 256-bit x86 vector
16866/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16867/// together based on the available instructions.
16868 static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16869 SDValue V1, SDValue V2, const APInt &Zeroable,
16870 const X86Subtarget &Subtarget,
16871 SelectionDAG &DAG) {
16872 // If we have a single input to the zero element, insert that into V1 if we
16873 // can do so cheaply.
16874 int NumElts = VT.getVectorNumElements();
16875 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16876
16877 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16878 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16879 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16880 return Insertion;
16881
16882 // Handle special cases where the lower or upper half is UNDEF.
16883 if (SDValue V =
16884 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16885 return V;
16886
16887 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16888 // can check for those subtargets here and avoid much of the subtarget
16889 // querying in the per-vector-type lowering routines. With AVX1 we have
16890 // essentially *zero* ability to manipulate a 256-bit vector with integer
16891 // types. Since we'll use floating point types there eventually, just
16892 // immediately cast everything to a float and operate entirely in that domain.
16893 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16894 int ElementBits = VT.getScalarSizeInBits();
16895 if (ElementBits < 32) {
16896 // No floating point type available, if we can't use the bit operations
16897 // for masking/blending then decompose into 128-bit vectors.
16898 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16899 Subtarget, DAG))
16900 return V;
16901 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16902 return V;
16903 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16904 }
16905
16906 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16907 VT.getVectorNumElements());
16908 V1 = DAG.getBitcast(FpVT, V1);
16909 V2 = DAG.getBitcast(FpVT, V2);
16910 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16911 }
16912
16913 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16914 V1 = DAG.getBitcast(MVT::v16i16, V1);
16915 V2 = DAG.getBitcast(MVT::v16i16, V2);
16916 return DAG.getBitcast(VT,
16917 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16918 }
16919
16920 switch (VT.SimpleTy) {
16921 case MVT::v4f64:
16922 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16923 case MVT::v4i64:
16924 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16925 case MVT::v8f32:
16926 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16927 case MVT::v8i32:
16928 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16929 case MVT::v16i16:
16930 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16931 case MVT::v32i8:
16932 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16933
16934 default:
16935 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16936 }
16937}
16938
16939 /// Try to lower a vector shuffle as 128-bit shuffles.
16940 static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16941 const APInt &Zeroable, SDValue V1, SDValue V2,
16942 const X86Subtarget &Subtarget,
16943 SelectionDAG &DAG) {
16944 assert(VT.getScalarSizeInBits() == 64 &&
16945 "Unexpected element type size for 128bit shuffle.");
16946
16947 // Handling a 256-bit vector would require VLX, and lowerV2X128Shuffle() is
16948 // most probably the better solution for that case.
16949 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16950
16951 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16952 SmallVector<int, 4> Widened128Mask;
16953 if (!canWidenShuffleElements(Mask, Widened128Mask))
16954 return SDValue();
16955 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16956
16957 // Try to use an insert into a zero vector.
16958 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16959 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16960 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16961 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16962 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16963 DAG.getVectorIdxConstant(0, DL));
16964 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16965 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16966 DAG.getVectorIdxConstant(0, DL));
16967 }
16968
16969 // Check for patterns which can be matched with a single insert of a 256-bit
16970 // subvector.
16971 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16972 if (OnlyUsesV1 ||
16973 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16974 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16975 SDValue SubVec =
16976 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16977 DAG.getVectorIdxConstant(0, DL));
16978 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16979 DAG.getVectorIdxConstant(4, DL));
16980 }
16981
16982 // See if this is an insertion of the lower 128-bits of V2 into V1.
16983 bool IsInsert = true;
16984 int V2Index = -1;
16985 for (int i = 0; i < 4; ++i) {
16986 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16987 if (Widened128Mask[i] < 0)
16988 continue;
16989
16990 // Make sure all V1 subvectors are in place.
16991 if (Widened128Mask[i] < 4) {
16992 if (Widened128Mask[i] != i) {
16993 IsInsert = false;
16994 break;
16995 }
16996 } else {
16997 // Make sure we only have a single V2 index and it's the lowest 128 bits.
16998 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16999 IsInsert = false;
17000 break;
17001 }
17002 V2Index = i;
17003 }
17004 }
17005 if (IsInsert && V2Index >= 0) {
17006 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17007 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
17008 DAG.getVectorIdxConstant(0, DL));
17009 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
17010 }
17011
17012 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit
17013 // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
17014 // possible we at least ensure the lanes stay sequential to help later
17015 // combines.
17016 SmallVector<int, 2> Widened256Mask;
17017 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17018 Widened128Mask.clear();
17019 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17020 }
17021
17022 // Try to lower to vshuf64x2/vshuf32x4.
17023 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17024 int PermMask[4] = {-1, -1, -1, -1};
17025 // Ensure elements came from the same Op.
17026 for (int i = 0; i < 4; ++i) {
17027 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17028 if (Widened128Mask[i] < 0)
17029 continue;
17030
17031 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17032 unsigned OpIndex = i / 2;
17033 if (Ops[OpIndex].isUndef())
17034 Ops[OpIndex] = Op;
17035 else if (Ops[OpIndex] != Op)
17036 return SDValue();
17037
17038 PermMask[i] = Widened128Mask[i] % 4;
17039 }
17040
17041 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17042 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17043}
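// For example, a v8i64 shuffle <2,3,0,1,10,11,8,9> widens to Widened128Mask =
// {1,0,5,4}; the low 256-bit half reads only V1 and the high half only V2, so
// PermMask = {1,0,1,0} and, with the usual 2-bits-per-element immediate
// encoding, the result is SHUF128(V1, V2, 0x11).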
17044
17045/// Handle lowering of 8-lane 64-bit floating point shuffles.
17046 static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17047 const APInt &Zeroable, SDValue V1, SDValue V2,
17048 const X86Subtarget &Subtarget,
17049 SelectionDAG &DAG) {
17050 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17051 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17052 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17053
17054 if (V2.isUndef()) {
17055 // Use low duplicate instructions for masks that match their pattern.
17056 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17057 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17058
17059 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17060 // Non-half-crossing single input shuffles can be lowered with an
17061 // interleaved permutation.
17062 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17063 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17064 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17065 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17066 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17067 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17068 }
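// For example, the in-lane v8f64 mask <1,0,3,2,5,4,7,6> sets bits 0, 2, 4 and
// 6 of VPERMILPMask above, giving immediate 0x55, which swaps the two doubles
// in every 128-bit lane.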
17069
17070 SmallVector<int, 4> RepeatedMask;
17071 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17072 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17073 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17074 }
17075
17076 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17077 V2, Subtarget, DAG))
17078 return Shuf128;
17079
17080 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17081 return Unpck;
17082
17083 // Check if the blend happens to exactly fit that of SHUFPD.
17084 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17085 Zeroable, Subtarget, DAG))
17086 return Op;
17087
17088 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17089 Subtarget, DAG))
17090 return V;
17091
17092 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17093 Zeroable, Subtarget, DAG))
17094 return Blend;
17095
17096 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17097}
17098
17099/// Handle lowering of 16-lane 32-bit floating point shuffles.
17100 static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17101 const APInt &Zeroable, SDValue V1, SDValue V2,
17102 const X86Subtarget &Subtarget,
17103 SelectionDAG &DAG) {
17104 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17105 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17106 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17107
17108 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17109 // options to efficiently lower the shuffle.
17110 SmallVector<int, 4> RepeatedMask;
17111 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17112 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17113
17114 // Use even/odd duplicate instructions for masks that match their pattern.
17115 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17116 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17117 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17118 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17119
17120 if (V2.isUndef())
17121 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17122 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17123
17124 // Use dedicated unpack instructions for masks that match their pattern.
17125 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17126 return V;
17127
17128 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17129 Zeroable, Subtarget, DAG))
17130 return Blend;
17131
17132 // Otherwise, fall back to a SHUFPS sequence.
17133 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17134 }
17135
17136 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17137 Zeroable, Subtarget, DAG))
17138 return Blend;
17139
17140 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17141 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17142 return DAG.getBitcast(MVT::v16f32, ZExt);
17143
17144 // Try to create an in-lane repeating shuffle mask and then shuffle the
17145 // results into the target lanes.
17146 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17147 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17148 return V;
17149
17150 // If we have a single input shuffle with different shuffle patterns in the
17151 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17152 if (V2.isUndef() &&
17153 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17154 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17155 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17156 }
17157
17158 // If we have AVX512F support, we can use VEXPAND.
17159 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17160 Zeroable, Subtarget, DAG))
17161 return V;
17162
17163 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17164}
17165
17166/// Handle lowering of 8-lane 64-bit integer shuffles.
17167 static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17168 const APInt &Zeroable, SDValue V1, SDValue V2,
17169 const X86Subtarget &Subtarget,
17170 SelectionDAG &DAG) {
17171 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17172 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17173 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17174
17175 // Try to use shift instructions if fast.
17176 if (Subtarget.preferLowerShuffleAsShift())
17177 if (SDValue Shift =
17178 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17179 Subtarget, DAG, /*BitwiseOnly*/ true))
17180 return Shift;
17181
17182 if (V2.isUndef()) {
17183 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17184 // can use lower latency instructions that will operate on all four
17185 // 128-bit lanes.
17186 SmallVector<int, 2> Repeated128Mask;
17187 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17188 SmallVector<int, 4> PSHUFDMask;
17189 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17190 return DAG.getBitcast(
17191 MVT::v8i64,
17192 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17193 DAG.getBitcast(MVT::v16i32, V1),
17194 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17195 }
17196
17197 SmallVector<int, 4> Repeated256Mask;
17198 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17199 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17200 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17201 }
17202
17203 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17204 V2, Subtarget, DAG))
17205 return Shuf128;
17206
17207 // Try to use shift instructions.
17208 if (SDValue Shift =
17209 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17210 DAG, /*BitwiseOnly*/ false))
17211 return Shift;
17212
17213 // Try to use VALIGN.
17214 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17215 Zeroable, Subtarget, DAG))
17216 return Rotate;
17217
17218 // Try to use PALIGNR.
17219 if (Subtarget.hasBWI())
17220 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17221 Subtarget, DAG))
17222 return Rotate;
17223
17224 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17225 return Unpck;
17226
17227 // If we have AVX512F support, we can use VEXPAND.
17228 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17229 Subtarget, DAG))
17230 return V;
17231
17232 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17233 Zeroable, Subtarget, DAG))
17234 return Blend;
17235
17236 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17237}
17238
17239/// Handle lowering of 16-lane 32-bit integer shuffles.
17240 static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17241 const APInt &Zeroable, SDValue V1, SDValue V2,
17242 const X86Subtarget &Subtarget,
17243 SelectionDAG &DAG) {
17244 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17245 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17246 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17247
17248 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17249
17250 // Whenever we can lower this as a zext, that instruction is strictly faster
17251 // than any alternative. It also allows us to fold memory operands into the
17252 // shuffle in many cases.
17253 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17254  DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17255 return ZExt;
17256
17257 // Try to use shift instructions if fast.
17258 if (Subtarget.preferLowerShuffleAsShift()) {
17259 if (SDValue Shift =
17260 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17261 Subtarget, DAG, /*BitwiseOnly*/ true))
17262 return Shift;
17263 if (NumV2Elements == 0)
17264 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17265 Subtarget, DAG))
17266 return Rotate;
17267 }
17268
17269 // If the shuffle mask is repeated in each 128-bit lane we can use more
17270 // efficient instructions that mirror the shuffles across the four 128-bit
17271 // lanes.
17272 SmallVector<int, 4> RepeatedMask;
17273 bool Is128BitLaneRepeatedShuffle =
17274 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17275 if (Is128BitLaneRepeatedShuffle) {
17276 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17277 if (V2.isUndef())
17278 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17279 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17280
17281 // Use dedicated unpack instructions for masks that match their pattern.
17282 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17283 return V;
17284 }
17285
17286 // Try to use shift instructions.
17287 if (SDValue Shift =
17288 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17289 Subtarget, DAG, /*BitwiseOnly*/ false))
17290 return Shift;
17291
17292 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17293 if (SDValue Rotate =
17294 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17295 return Rotate;
17296
17297 // Try to use VALIGN.
17298 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17299 Zeroable, Subtarget, DAG))
17300 return Rotate;
17301
17302 // Try to use byte rotation instructions.
17303 if (Subtarget.hasBWI())
17304 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17305 Subtarget, DAG))
17306 return Rotate;
17307
17308 // Assume that a single SHUFPS is faster than using a permv shuffle.
17309 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17310 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17311 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17312 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17313 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17314 CastV1, CastV2, DAG);
17315 return DAG.getBitcast(MVT::v16i32, ShufPS);
17316 }
17317
17318 // Try to create an in-lane repeating shuffle mask and then shuffle the
17319 // results into the target lanes.
17320 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17321  DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17322 return V;
17323
17324 // If we have AVX512F support, we can use VEXPAND.
17325 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17326 Zeroable, Subtarget, DAG))
17327 return V;
17328
17329 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17330 Zeroable, Subtarget, DAG))
17331 return Blend;
17332
17333 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17334}
17335
17336/// Handle lowering of 32-lane 16-bit integer shuffles.
17337 static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17338  const APInt &Zeroable, SDValue V1, SDValue V2,
17339 const X86Subtarget &Subtarget,
17340 SelectionDAG &DAG) {
17341 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17342 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17343 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17344 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17345
17346 // Whenever we can lower this as a zext, that instruction is strictly faster
17347 // than any alternative. It also allows us to fold memory operands into the
17348 // shuffle in many cases.
17349 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17350  DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17351 return ZExt;
17352
17353 // Use dedicated unpack instructions for masks that match their pattern.
17354 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17355 return V;
17356
17357 // Use dedicated pack instructions for masks that match their pattern.
17358 if (SDValue V =
17359 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17360 return V;
17361
17362 // Try to use shift instructions.
17363 if (SDValue Shift =
17364 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17365 Subtarget, DAG, /*BitwiseOnly*/ false))
17366 return Shift;
17367
17368 // Try to use byte rotation instructions.
17369 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17370 Subtarget, DAG))
17371 return Rotate;
17372
17373 if (V2.isUndef()) {
17374 // Try to use bit rotation instructions.
17375 if (SDValue Rotate =
17376 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17377 return Rotate;
17378
17379 SmallVector<int, 8> RepeatedMask;
17380 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17381 // As this is a single-input shuffle, the repeated mask should be
17382 // a strictly valid v8i16 mask that we can pass through to the v8i16
17383 // lowering to handle even the v32 case.
17384 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17385 RepeatedMask, Subtarget, DAG);
17386 }
17387 }
17388
17389 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17390 Zeroable, Subtarget, DAG))
17391 return Blend;
17392
17393 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17394 Zeroable, Subtarget, DAG))
17395 return PSHUFB;
17396
17397 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17398 // shuffle.
17399 if (!V2.isUndef())
17400 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17401  DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17402 return Result;
17403
17404 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17405}
17406
17407/// Handle lowering of 64-lane 8-bit integer shuffles.
17408 static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17409  const APInt &Zeroable, SDValue V1, SDValue V2,
17410 const X86Subtarget &Subtarget,
17411 SelectionDAG &DAG) {
17412 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17413 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17414 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17415 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17416
17417 // Whenever we can lower this as a zext, that instruction is strictly faster
17418 // than any alternative. It also allows us to fold memory operands into the
17419 // shuffle in many cases.
17420 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17421  DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17422 return ZExt;
17423
17424 // Use dedicated unpack instructions for masks that match their pattern.
17425 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17426 return V;
17427
17428 // Use dedicated pack instructions for masks that match their pattern.
17429 if (SDValue V =
17430 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17431 return V;
17432
17433 // Try to use shift instructions.
17434 if (SDValue Shift =
17435 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17436 DAG, /*BitwiseOnly*/ false))
17437 return Shift;
17438
17439 // Try to use byte rotation instructions.
17440 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17441 Subtarget, DAG))
17442 return Rotate;
17443
17444 // Try to use bit rotation instructions.
17445 if (V2.isUndef())
17446 if (SDValue Rotate =
17447 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17448 return Rotate;
17449
17450 // Lower as AND if possible.
17451 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17452 Zeroable, Subtarget, DAG))
17453 return Masked;
17454
17455 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17456 Zeroable, Subtarget, DAG))
17457 return PSHUFB;
17458
17459 // Try to create an in-lane repeating shuffle mask and then shuffle the
17460 // results into the target lanes.
17461 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17462  DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17463 return V;
17464
17465 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17466  DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17467 return Result;
17468
17469 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17470 Zeroable, Subtarget, DAG))
17471 return Blend;
17472
17473 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17474 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17475 // PALIGNR will be cheaper than the second PSHUFB+OR.
17476 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17477 Mask, Subtarget, DAG))
17478 return V;
17479
17480 // If we can't directly blend but can use PSHUFB, that will be better as it
17481 // can both shuffle and set up the inefficient blend.
17482 bool V1InUse, V2InUse;
17483 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17484 DAG, V1InUse, V2InUse);
17485 }
17486
17487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17488 // shuffle.
17489 if (!V2.isUndef())
17490 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17491  DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17492 return Result;
17493
17494 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17495 if (Subtarget.hasVBMI())
17496 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17497
17498 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17499}
17500
17501/// High-level routine to lower various 512-bit x86 vector shuffles.
17502///
17503/// This routine either breaks down the specific type of a 512-bit x86 vector
17504/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17505/// together based on the available instructions.
17506 static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17507  MVT VT, SDValue V1, SDValue V2,
17508 const APInt &Zeroable,
17509 const X86Subtarget &Subtarget,
17510 SelectionDAG &DAG) {
17511 assert(Subtarget.hasAVX512() &&
17512 "Cannot lower 512-bit vectors w/ basic ISA!");
17513
17514 // If we have a single input to the zero element, insert that into V1 if we
17515 // can do so cheaply.
17516 int NumElts = Mask.size();
17517 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17518
17519 if (NumV2Elements == 1 && Mask[0] >= NumElts)
17520 if (SDValue Insertion = lowerShuffleAsElementInsertion(
17521  DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17522 return Insertion;
17523
17524 // Handle special cases where the lower or upper half is UNDEF.
17525 if (SDValue V =
17526 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17527 return V;
17528
17529 // Check for being able to broadcast a single element.
17530 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17531 Subtarget, DAG))
17532 return Broadcast;
17533
17534 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17535 // Try using bit ops for masking and blending before falling back to
17536 // splitting.
17537 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17538 Subtarget, DAG))
17539 return V;
17540 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17541 return V;
17542
17543 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17544 }
17545
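  // Without BWI, split v32f16/v32bf16 shuffles; with BWI, lower them by
  // bitcasting to v32i16 and shuffling in the integer domain.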
17546 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17547 if (!Subtarget.hasBWI())
17548 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17549 /*SimpleOnly*/ false);
17550
17551 V1 = DAG.getBitcast(MVT::v32i16, V1);
17552 V2 = DAG.getBitcast(MVT::v32i16, V2);
17553 return DAG.getBitcast(VT,
17554 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17555 }
17556
17557 // Dispatch to each element type for lowering. If we don't have support for
17558 // specific element type shuffles at 512 bits, immediately split them and
17559 // lower them. Each lowering routine of a given type is allowed to assume that
17560 // the requisite ISA extensions for that element type are available.
17561 switch (VT.SimpleTy) {
17562 case MVT::v8f64:
17563 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17564 case MVT::v16f32:
17565 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17566 case MVT::v8i64:
17567 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17568 case MVT::v16i32:
17569 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17570 case MVT::v32i16:
17571 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17572 case MVT::v64i8:
17573 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17574
17575 default:
17576 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17577 }
17578}
17579
17580 static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17581  MVT VT, SDValue V1, SDValue V2,
17582 const X86Subtarget &Subtarget,
17583 SelectionDAG &DAG) {
17584 // Shuffle should be unary.
17585 if (!V2.isUndef())
17586 return SDValue();
17587
17588 int ShiftAmt = -1;
17589 int NumElts = Mask.size();
17590 for (int i = 0; i != NumElts; ++i) {
17591 int M = Mask[i];
17592 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17593 "Unexpected mask index.");
17594 if (M < 0)
17595 continue;
17596
17597 // The first non-undef element determines our shift amount.
17598 if (ShiftAmt < 0) {
17599 ShiftAmt = M - i;
17600 // Need to be shifting right.
17601 if (ShiftAmt <= 0)
17602 return SDValue();
17603 }
17604 // All non-undef elements must shift by the same amount.
17605 if (ShiftAmt != M - i)
17606 return SDValue();
17607 }
17608 assert(ShiftAmt >= 0 && "All undef?");
17609
17610 // Great we found a shift right.
17611 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17612 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17613 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17614 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17615 DAG.getVectorIdxConstant(0, DL));
17616}
17617
17618// Determine if this shuffle can be implemented with a KSHIFT instruction.
17619// Returns the shift amount if possible or -1 if not. This is a simplified
17620// version of matchShuffleAsShift.
17621static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17622 int MaskOffset, const APInt &Zeroable) {
17623 int Size = Mask.size();
17624
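  // CheckZeros verifies that every element shifted in (the low elements for a
  // left shift, the high elements for a right shift) is allowed to be zero.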
17625 auto CheckZeros = [&](int Shift, bool Left) {
17626 for (int j = 0; j < Shift; ++j)
17627 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17628 return false;
17629
17630 return true;
17631 };
17632
17633 auto MatchShift = [&](int Shift, bool Left) {
17634 unsigned Pos = Left ? Shift : 0;
17635 unsigned Low = Left ? 0 : Shift;
17636 unsigned Len = Size - Shift;
17637 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17638 };
17639
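  // Try every shift amount in both directions; the first one whose surviving
  // elements form a sequential run from the selected source wins.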
17640 for (int Shift = 1; Shift != Size; ++Shift)
17641 for (bool Left : {true, false})
17642 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17643 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17644  return Shift;
17645 }
17646
17647 return -1;
17648}
17649
17650
17651// Lower vXi1 vector shuffles.
17652 // There is no dedicated instruction on AVX-512 that shuffles the masks.
17653 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17654 // vector, shuffle, and then truncate it back.
17655 static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17656  MVT VT, SDValue V1, SDValue V2,
17657 const APInt &Zeroable,
17658 const X86Subtarget &Subtarget,
17659 SelectionDAG &DAG) {
17660 assert(Subtarget.hasAVX512() &&
17661 "Cannot lower 512-bit vectors w/o basic ISA!");
17662
17663 int NumElts = Mask.size();
17664 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17665
17666 // Try to recognize shuffles that are just padding a subvector with zeros.
17667 int SubvecElts = 0;
17668 int Src = -1;
17669 for (int i = 0; i != NumElts; ++i) {
17670 if (Mask[i] >= 0) {
17671 // Grab the source from the first valid mask. All subsequent elements need
17672 // to use this same source.
17673 if (Src < 0)
17674 Src = Mask[i] / NumElts;
17675 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17676 break;
17677 }
17678
17679 ++SubvecElts;
17680 }
17681 assert(SubvecElts != NumElts && "Identity shuffle?");
17682
17683 // Clip to a power of 2.
17684 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17685
17686 // Make sure the number of zeroable bits in the top at least covers the bits
17687 // not covered by the subvector.
17688 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17689 assert(Src >= 0 && "Expected a source!");
17690 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17691 SDValue Extract =
17692 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Src == 0 ? V1 : V2,
17693 DAG.getVectorIdxConstant(0, DL));
17694 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17695 DAG.getConstant(0, DL, VT), Extract,
17696 DAG.getVectorIdxConstant(0, DL));
17697 }
17698
17699 // Try a simple shift right with undef elements. Later we'll try with zeros.
17700 if (SDValue Shift =
17701 lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, DAG))
17702 return Shift;
17703
17704 // Try to match KSHIFTs.
17705 unsigned Offset = 0;
17706 for (SDValue V : {V1, V2}) {
17707 unsigned Opcode;
17708 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17709 if (ShiftAmt >= 0) {
17710 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17711 MVT WideVT = Res.getSimpleValueType();
17712 // Widened right shifts need two shifts to ensure we shift in zeroes.
17713 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17714 int WideElts = WideVT.getVectorNumElements();
17715 // Shift left to put the original vector in the MSBs of the new size.
17716 Res =
17717 DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17718 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17719 // Increase the shift amount to account for the left shift.
17720 ShiftAmt += WideElts - NumElts;
17721 }
17722
17723 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17724 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17726 DAG.getVectorIdxConstant(0, DL));
17727 }
17728 Offset += NumElts; // Increment for next iteration.
17729 }
17730
17731 // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17732 // ops instead.
17733 // TODO: What other unary shuffles would benefit from this?
17734 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17735 SDValue Op0 = V1.getOperand(0);
17736 SDValue Op1 = V1.getOperand(1);
17737 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17738 EVT OpVT = Op0.getValueType();
17739 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17740 return DAG.getSetCC(
17741 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17742 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17743 }
17744
17745 MVT ExtVT;
17746 switch (VT.SimpleTy) {
17747 default:
17748 llvm_unreachable("Expected a vector of i1 elements");
17749 case MVT::v2i1:
17750 ExtVT = MVT::v2i64;
17751 break;
17752 case MVT::v4i1:
17753 ExtVT = MVT::v4i32;
17754 break;
17755 case MVT::v8i1:
17756 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17757 // shuffle.
17758 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17759 break;
17760 case MVT::v16i1:
17761 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17762 // 256-bit operation available.
17763 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17764 break;
17765 case MVT::v32i1:
17766 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17767 // 256-bit operation available.
17768 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17769 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17770 break;
17771 case MVT::v64i1:
17772 // Fall back to scalarization. FIXME: We can do better if the shuffle
17773 // can be partitioned cleanly.
17774 if (!Subtarget.useBWIRegs())
17775 return SDValue();
17776 ExtVT = MVT::v64i8;
17777 break;
17778 }
17779
17780 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17781 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17782
17783 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17784 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
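  // With BWI (>= 32 elements) or DQI (< 32 elements) a single compare against
  // zero turns the sign bits back into a k-mask; otherwise fall back to a
  // truncate.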
17785 int NumElems = VT.getVectorNumElements();
17786 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17787 (Subtarget.hasDQI() && (NumElems < 32)))
17788 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17789 Shuffle, ISD::SETGT);
17790
17791 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17792}
17793
17794/// Helper function that returns true if the shuffle mask should be
17795/// commuted to improve canonicalization.
17796 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17797  int NumElements = Mask.size();
17798
17799 int NumV1Elements = 0, NumV2Elements = 0;
17800 for (int M : Mask)
17801 if (M < 0)
17802 continue;
17803 else if (M < NumElements)
17804 ++NumV1Elements;
17805 else
17806 ++NumV2Elements;
17807
17808 // Commute the shuffle as needed such that more elements come from V1 than
17809 // V2. This allows us to match the shuffle pattern strictly on how many
17810 // elements come from V1 without handling the symmetric cases.
17811 if (NumV2Elements > NumV1Elements)
17812 return true;
17813
17814 assert(NumV1Elements > 0 && "No V1 indices");
17815
17816 if (NumV2Elements == 0)
17817 return false;
17818
17819 // When the number of V1 and V2 elements are the same, try to minimize the
17820 // number of uses of V2 in the low half of the vector. When that is tied,
17821 // ensure that the sum of indices for V1 is equal to or lower than the sum
17822 // of indices for V2. When those are equal, try to ensure that the number of odd
17823 // indices for V1 is lower than the number of odd indices for V2.
17824 if (NumV1Elements == NumV2Elements) {
17825 int LowV1Elements = 0, LowV2Elements = 0;
17826 for (int M : Mask.slice(0, NumElements / 2))
17827 if (M >= NumElements)
17828 ++LowV2Elements;
17829 else if (M >= 0)
17830 ++LowV1Elements;
17831 if (LowV2Elements > LowV1Elements)
17832 return true;
17833 if (LowV2Elements == LowV1Elements) {
17834 int SumV1Indices = 0, SumV2Indices = 0;
17835 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17836 if (Mask[i] >= NumElements)
17837 SumV2Indices += i;
17838 else if (Mask[i] >= 0)
17839 SumV1Indices += i;
17840 if (SumV2Indices < SumV1Indices)
17841 return true;
17842 if (SumV2Indices == SumV1Indices) {
17843 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17844 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17845 if (Mask[i] >= NumElements)
17846 NumV2OddIndices += i % 2;
17847 else if (Mask[i] >= 0)
17848 NumV1OddIndices += i % 2;
17849 if (NumV2OddIndices < NumV1OddIndices)
17850 return true;
17851 }
17852 }
17853 }
17854
17855 return false;
17856}
17857
17858 static bool canCombineAsMaskOperation(SDValue V,
17859  const X86Subtarget &Subtarget) {
17860 if (!Subtarget.hasAVX512())
17861 return false;
17862
17863 if (!V.getValueType().isSimple())
17864 return false;
17865
17866 MVT VT = V.getSimpleValueType().getScalarType();
17867 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17868 return false;
17869
17870 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17871 // are preferable to blendw/blendvb/masked-mov.
17872 if ((VT == MVT::i16 || VT == MVT::i8) &&
17873 V.getSimpleValueType().getSizeInBits() < 512)
17874 return false;
17875
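  // HasMaskOperation returns true when V is a single-use arithmetic/logic op
  // that AVX512 can execute under a mask, i.e. the shuffle may be combined
  // into a masked instruction rather than widened.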
17876 auto HasMaskOperation = [&](SDValue V) {
17877 // TODO: Currently we only check a limited set of opcodes. We could probably
17878 // extend it to all binary operations by checking TLI.isBinOp().
17879 switch (V->getOpcode()) {
17880 default:
17881 return false;
17882 case ISD::ADD:
17883 case ISD::SUB:
17884 case ISD::AND:
17885 case ISD::XOR:
17886 case ISD::OR:
17887 case ISD::SMAX:
17888 case ISD::SMIN:
17889 case ISD::UMAX:
17890 case ISD::UMIN:
17891 case ISD::ABS:
17892 case ISD::SHL:
17893 case ISD::SRL:
17894 case ISD::SRA:
17895 case ISD::MUL:
17896 break;
17897 }
17898 if (!V->hasOneUse())
17899 return false;
17900
17901 return true;
17902 };
17903
17904 if (HasMaskOperation(V))
17905 return true;
17906
17907 return false;
17908}
17909
17910// Forward declaration.
17911 static SDValue canonicalizeShuffleMaskWithHorizOp(
17912  MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17913  unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17914 const X86Subtarget &Subtarget);
17915
17916 /// Top-level lowering for x86 vector shuffles.
17917///
17918/// This handles decomposition, canonicalization, and lowering of all x86
17919/// vector shuffles. Most of the specific lowering strategies are encapsulated
17920/// above in helper routines. The canonicalization attempts to widen shuffles
17921/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17922/// s.t. only one of the two inputs needs to be tested, etc.
17923 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17924  SelectionDAG &DAG) {
17925 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17926 ArrayRef<int> OrigMask = SVOp->getMask();
17927 SDValue V1 = Op.getOperand(0);
17928 SDValue V2 = Op.getOperand(1);
17929 MVT VT = Op.getSimpleValueType();
17930 int NumElements = VT.getVectorNumElements();
17931 SDLoc DL(Op);
17932 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17933
17934 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17935 "Can't lower MMX shuffles");
17936
17937 bool V1IsUndef = V1.isUndef();
17938 bool V2IsUndef = V2.isUndef();
17939 if (V1IsUndef && V2IsUndef)
17940 return DAG.getUNDEF(VT);
17941
17942 // When we create a shuffle node we put the UNDEF node as the second operand,
17943 // but in some cases the first operand may be transformed to UNDEF.
17944 // In this case we should just commute the node.
17945 if (V1IsUndef)
17946 return DAG.getCommutedVectorShuffle(*SVOp);
17947
17948 // Check for non-undef masks pointing at an undef vector and make the masks
17949 // undef as well. This makes it easier to match the shuffle based solely on
17950 // the mask.
17951 if (V2IsUndef &&
17952 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17953 SmallVector<int, 8> NewMask(OrigMask);
17954 for (int &M : NewMask)
17955 if (M >= NumElements)
17956 M = -1;
17957 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17958 }
17959
17960 // Check for illegal shuffle mask element index values.
17961 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17962 (void)MaskUpperLimit;
17963 assert(llvm::all_of(OrigMask,
17964 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17965 "Out of bounds shuffle index");
17966
17967 // We actually see shuffles that are entirely re-arrangements of a set of
17968 // zero inputs. This mostly happens while decomposing complex shuffles into
17969 // simple ones. Directly lower these as a buildvector of zeros.
17970 APInt KnownUndef, KnownZero;
17971 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17972
17973 APInt Zeroable = KnownUndef | KnownZero;
17974 if (Zeroable.isAllOnes())
17975 return getZeroVector(VT, Subtarget, DAG, DL);
17976
17977 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17978
17979 // Try to collapse shuffles into using a vector type with fewer elements but
17980 // wider element types. We cap this to not form integers or floating point
17981 // elements wider than 64 bits. It does not seem beneficial to form i128
17982 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17983 SmallVector<int, 16> WidenedMask;
17984 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17985 !canCombineAsMaskOperation(V1, Subtarget) &&
17986 !canCombineAsMaskOperation(V2, Subtarget) &&
17987 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17988 // Shuffle mask widening should not interfere with a broadcast opportunity
17989 // by obfuscating the operands with bitcasts.
17990 // TODO: Avoid lowering directly from this top-level function: make this
17991 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17992 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17993 Subtarget, DAG))
17994 return Broadcast;
17995
17996 MVT NewEltVT = VT.isFloatingPoint()
17997  ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17998  : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17999  int NewNumElts = NumElements / 2;
18000 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
18001 // Make sure that the new vector type is legal. For example, v2f64 isn't
18002 // legal on SSE1.
18003 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
18004 if (V2IsZero) {
18005 // Modify the new Mask to take all zeros from the all-zero vector.
18006 // Choose indices that are blend-friendly.
18007 bool UsedZeroVector = false;
18008 assert(is_contained(WidenedMask, SM_SentinelZero) &&
18009 "V2's non-undef elements are used?!");
18010 for (int i = 0; i != NewNumElts; ++i)
18011 if (WidenedMask[i] == SM_SentinelZero) {
18012 WidenedMask[i] = i + NewNumElts;
18013 UsedZeroVector = true;
18014 }
18015 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
18016 // some elements to be undef.
18017 if (UsedZeroVector)
18018 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18019 }
18020 V1 = DAG.getBitcast(NewVT, V1);
18021 V2 = DAG.getBitcast(NewVT, V2);
18022 return DAG.getBitcast(
18023 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18024 }
18025 }
18026
18027 SmallVector<SDValue> Ops = {V1, V2};
18028 SmallVector<int> Mask(OrigMask);
18029
18030 // Canonicalize the shuffle with any horizontal ops inputs.
18031 // NOTE: This may update Ops and Mask.
18032 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18033  Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18034 return DAG.getBitcast(VT, HOp);
18035
18036 V1 = DAG.getBitcast(VT, Ops[0]);
18037 V2 = DAG.getBitcast(VT, Ops[1]);
18038 assert(NumElements == (int)Mask.size() &&
18039 "canonicalizeShuffleMaskWithHorizOp "
18040 "shouldn't alter the shuffle mask size");
18041
18042 // Commute the shuffle if it will improve canonicalization.
18043 if (canonicalizeShuffleMaskWithCommute(Mask)) {
18044  ShuffleVectorSDNode::commuteMask(Mask);
18045  std::swap(V1, V2);
18046 }
18047
18048 // For each vector width, delegate to a specialized lowering routine.
18049 if (VT.is128BitVector())
18050 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18051
18052 if (VT.is256BitVector())
18053 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18054
18055 if (VT.is512BitVector())
18056 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18057
18058 if (Is1BitVector)
18059 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18060
18061 llvm_unreachable("Unimplemented!");
18062}
18063
18064// As legal vpcompress instructions depend on various AVX512 extensions, try to
18065// convert illegal vector sizes to legal ones to avoid expansion.
18066 static SDValue lowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18067  SelectionDAG &DAG) {
18068 assert(Subtarget.hasAVX512() &&
18069 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18070
18071 SDLoc DL(Op);
18072 SDValue Vec = Op.getOperand(0);
18073 SDValue Mask = Op.getOperand(1);
18074 SDValue Passthru = Op.getOperand(2);
18075
18076 EVT VecVT = Vec.getValueType();
18077 EVT ElementVT = VecVT.getVectorElementType();
18078 unsigned NumElements = VecVT.getVectorNumElements();
18079 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18080 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18081
18082 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18083 // compressed as 512-bit vectors in AVX512F.
18084 if (NumVecBits != 128 && NumVecBits != 256)
18085 return SDValue();
18086
18087 if (NumElementBits == 32 || NumElementBits == 64) {
18088 unsigned NumLargeElements = 512 / NumElementBits;
18089 MVT LargeVecVT =
18090 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18091 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18092
18093 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18094 DAG, DL);
18095 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18096 Subtarget, DAG, DL);
18097 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18098 : widenSubVector(LargeVecVT, Passthru,
18099 /*ZeroNewElements=*/false,
18100 Subtarget, DAG, DL);
18101
18102 SDValue Compressed =
18103 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18104 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18105 DAG.getConstant(0, DL, MVT::i64));
18106 }
18107
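  // For 8/16-bit elements, any-extend each lane so the whole vector becomes a
  // legal 512-bit compress, then truncate the compressed result back down.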
18108 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18109 VecVT == MVT::v16i16) {
18110 MVT LargeElementVT = MVT::getIntegerVT(512 / NumElements);
18111 EVT LargeVecVT = MVT::getVectorVT(LargeElementVT, NumElements);
18112
18113 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18114 Passthru = Passthru.isUndef()
18115 ? DAG.getUNDEF(LargeVecVT)
18116 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18117
18118 SDValue Compressed =
18119 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18120 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18121 }
18122
18123 return SDValue();
18124}
18125
18126/// Try to lower a VSELECT instruction to a vector shuffle.
18127 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18128  const X86Subtarget &Subtarget,
18129 SelectionDAG &DAG) {
18130 SDValue Cond = Op.getOperand(0);
18131 SDValue LHS = Op.getOperand(1);
18132 SDValue RHS = Op.getOperand(2);
18133 MVT VT = Op.getSimpleValueType();
18134
18135 // Only non-legal VSELECTs reach this lowering, convert those into generic
18136 // shuffles and re-use the shuffle lowering path for blends.
18137 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18138  SmallVector<int, 32> Mask;
18139  if (createShuffleMaskFromVSELECT(Mask, Cond))
18140  return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18141 }
18142
18143 return SDValue();
18144}
18145
18146SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18147 SDValue Cond = Op.getOperand(0);
18148 SDValue LHS = Op.getOperand(1);
18149 SDValue RHS = Op.getOperand(2);
18150
18151 SDLoc dl(Op);
18152 MVT VT = Op.getSimpleValueType();
18153 if (isSoftF16(VT, Subtarget)) {
18154 MVT NVT = VT.changeVectorElementTypeToInteger();
18155  return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18156 DAG.getBitcast(NVT, LHS),
18157 DAG.getBitcast(NVT, RHS)));
18158 }
18159
18160 // A vselect where all conditions and data are constants can be optimized into
18161 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18162 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18163  ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18164  ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18165  return SDValue();
18166
18167 // Try to lower this to a blend-style vector shuffle. This can handle all
18168 // constant condition cases.
18169 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18170 return BlendOp;
18171
18172 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18173 // with patterns on the mask registers on AVX-512.
18174 MVT CondVT = Cond.getSimpleValueType();
18175 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18176 if (CondEltSize == 1)
18177 return Op;
18178
18179 // Variable blends are only legal from SSE4.1 onward.
18180 if (!Subtarget.hasSSE41())
18181 return SDValue();
18182
18183 unsigned EltSize = VT.getScalarSizeInBits();
18184 unsigned NumElts = VT.getVectorNumElements();
18185
18186 // Expand v32i16/v64i8 without BWI.
18187 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18188 return SDValue();
18189
18190 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18191 // into an i1 condition so that we can use the mask-based 512-bit blend
18192 // instructions.
18193 if (VT.getSizeInBits() == 512) {
18194 // Build a mask by testing the condition against zero.
18195 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18196 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18197 DAG.getConstant(0, dl, CondVT),
18198 ISD::SETNE);
18199 // Now return a new VSELECT using the mask.
18200 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18201 }
18202
18203 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18204 if (CondEltSize != EltSize) {
18205 // If we don't have a sign splat, rely on the expansion.
18206 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18207 return SDValue();
18208
18209 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18210 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18211 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18212 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18213 }
18214
18215 // v16i16/v32i8 selects without AVX2, if the condition and another operand
18216 // are free to split, then better to split before expanding the
18217 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18218 // TODO: This is very similar to narrowVectorSelect.
18219 // TODO: Add Load splitting to isFreeToSplitVector ?
18220 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18221 !Subtarget.hasXOP()) {
18222 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
18223 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
18224 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18225 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
18226 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18227 if (FreeCond && (FreeLHS || FreeRHS))
18228 return splitVectorOp(Op, DAG, dl);
18229 }
18230
18231 // Only some types will be legal on some subtargets. If we can emit a legal
18232 // VSELECT-matching blend, return Op; but if we need to expand, return
18233 // a null value.
18234 switch (VT.SimpleTy) {
18235 default:
18236 // Most of the vector types have blends past SSE4.1.
18237 return Op;
18238
18239 case MVT::v32i8:
18240 // The byte blends for AVX vectors were introduced only in AVX2.
18241 if (Subtarget.hasAVX2())
18242 return Op;
18243
18244 return SDValue();
18245
18246 case MVT::v8i16:
18247 case MVT::v16i16: {
18248 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18249 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18250 Cond = DAG.getBitcast(CastVT, Cond);
18251 LHS = DAG.getBitcast(CastVT, LHS);
18252 RHS = DAG.getBitcast(CastVT, RHS);
18253 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18254 return DAG.getBitcast(VT, Select);
18255 }
18256 }
18257}
18258
18259 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18260  MVT VT = Op.getSimpleValueType();
18261 SDValue Vec = Op.getOperand(0);
18262 SDValue Idx = Op.getOperand(1);
18263 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18264 SDLoc dl(Op);
18265
18266 if (!isa<ConstantSDNode>(Idx))
18267  return SDValue();
18268
18269 if (VT.getSizeInBits() == 8) {
18270 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18271 // we're going to zero extend the register or fold the store.
18272 if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18273  !X86::mayFoldIntoStore(Op))
18274  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18275 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18276 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18277
18278 unsigned IdxVal = Idx->getAsZExtVal();
18279 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18280 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18281 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18282 }
18283
18284 if (VT == MVT::f32) {
18285 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18286 // the result back to FR32 register. It's only worth matching if the
18287 // result has a single use which is a store or a bitcast to i32. And in
18288 // the case of a store, it's not worth it if the index is a constant 0,
18289 // because a MOVSSmr can be used instead, which is smaller and faster.
18290 if (!Op.hasOneUse())
18291 return SDValue();
18292 SDNode *User = *Op.getNode()->user_begin();
18293 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18294 (User->getOpcode() != ISD::BITCAST ||
18295 User->getValueType(0) != MVT::i32))
18296 return SDValue();
18297 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18298 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18299 return DAG.getBitcast(MVT::f32, Extract);
18300 }
18301
18302 if (VT == MVT::i32 || VT == MVT::i64)
18303 return Op;
18304
18305 return SDValue();
18306}
18307
18308/// Extract one bit from mask vector, like v16i1 or v8i1.
18309/// AVX-512 feature.
18310 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18311  const X86Subtarget &Subtarget) {
18312 SDValue Vec = Op.getOperand(0);
18313 SDLoc dl(Vec);
18314 MVT VecVT = Vec.getSimpleValueType();
18315 SDValue Idx = Op.getOperand(1);
18316 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18317 MVT EltVT = Op.getSimpleValueType();
18318
18319 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18320 "Unexpected vector type in ExtractBitFromMaskVector");
18321
18322 // variable index can't be handled in mask registers,
18323 // extend vector to VR512/128
18324 if (!IdxC) {
18325 unsigned NumElts = VecVT.getVectorNumElements();
18326 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18327 // than extending to 128/256-bit.
18328 if (NumElts == 1) {
18329 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18330 MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18331  return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18332 }
18333 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18334 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18335 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18336 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18337 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18338 }
18339
18340 unsigned IdxVal = IdxC->getZExtValue();
18341 if (IdxVal == 0) // the operation is legal
18342 return Op;
18343
18344 // Extend to natively supported kshift.
18345 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18346
18347 // Use kshiftr instruction to move to the lower element.
18348 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18349 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18350
18351 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18352 DAG.getVectorIdxConstant(0, dl));
18353}
18354
18355// Helper to find all the extracted elements from a vector.
18356 static APInt getExtractedDemandedElts(SDNode *N) {
18357  MVT VT = N->getSimpleValueType(0);
18358 unsigned NumElts = VT.getVectorNumElements();
18359 APInt DemandedElts = APInt::getZero(NumElts);
18360 for (SDNode *User : N->users()) {
18361 switch (User->getOpcode()) {
18362 case X86ISD::PEXTRB:
18363 case X86ISD::PEXTRW:
18364 case ISD::EXTRACT_VECTOR_ELT:
18365  if (!isa<ConstantSDNode>(User->getOperand(1))) {
18366 DemandedElts.setAllBits();
18367 return DemandedElts;
18368 }
18369 DemandedElts.setBit(User->getConstantOperandVal(1));
18370 break;
18371 case ISD::BITCAST: {
18372 if (!User->getValueType(0).isSimple() ||
18373 !User->getValueType(0).isVector()) {
18374 DemandedElts.setAllBits();
18375 return DemandedElts;
18376 }
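  // Recurse through the bitcast's users and rescale the demanded elements
  // back to this node's element count.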
18377 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18378 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18379 break;
18380 }
18381 default:
18382 DemandedElts.setAllBits();
18383 return DemandedElts;
18384 }
18385 }
18386 return DemandedElts;
18387}
18388
18389SDValue
18390X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18391 SelectionDAG &DAG) const {
18392 SDLoc dl(Op);
18393 SDValue Vec = Op.getOperand(0);
18394 MVT VecVT = Vec.getSimpleValueType();
18395 SDValue Idx = Op.getOperand(1);
18396 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18397
18398 if (VecVT.getVectorElementType() == MVT::i1)
18399 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18400
18401 if (!IdxC) {
18402 // It's more profitable to go through memory (1 cycle throughput)
18403 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18404 // IACA tool was used to get performance estimation
18405 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18406 //
18407 // example : extractelement <16 x i8> %a, i32 %i
18408 //
18409 // Block Throughput: 3.00 Cycles
18410 // Throughput Bottleneck: Port5
18411 //
18412 // | Num Of | Ports pressure in cycles | |
18413 // | Uops | 0 - DV | 5 | 6 | 7 | |
18414 // ---------------------------------------------
18415 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18416 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18417 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18418 // Total Num Of Uops: 4
18419 //
18420 //
18421 // Block Throughput: 1.00 Cycles
18422 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18423 //
18424 // | | Ports pressure in cycles | |
18425 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18426 // ---------------------------------------------------------
18427 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18428 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18429 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18430 // Total Num Of Uops: 4
18431
18432 return SDValue();
18433 }
18434
18435 unsigned IdxVal = IdxC->getZExtValue();
18436
18437 // If this is a 256-bit vector result, first extract the 128-bit vector and
18438 // then extract the element from the 128-bit vector.
18439 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18440 // Get the 128-bit vector.
18441 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18442 MVT EltVT = VecVT.getVectorElementType();
18443
18444 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18445 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18446
18447 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18448 // this can be done with a mask.
18449 IdxVal &= ElemsPerChunk - 1;
18450 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18451 DAG.getVectorIdxConstant(IdxVal, dl));
18452 }
18453
18454 assert(VecVT.is128BitVector() && "Unexpected vector length");
18455
18456 MVT VT = Op.getSimpleValueType();
18457
18458 if (VT == MVT::i16) {
18459 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18460 // we're going to zero extend the register or fold the store (SSE41 only).
18461 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18462 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18463 if (Subtarget.hasFP16())
18464 return Op;
18465
18466 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18467 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18468 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18469 }
18470
18471 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18472 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18473 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18474 }
18475
18476 if (Subtarget.hasSSE41())
18477 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18478 return Res;
18479
18480 // Only extract a single element from a v16i8 source - determine the common
18481 // DWORD/WORD that all extractions share, and extract the sub-byte.
18482 // TODO: Add QWORD MOVQ extraction?
18483 if (VT == MVT::i8) {
18484 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18485 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18486
18487 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18488 int DWordIdx = IdxVal / 4;
18489 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18490 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18491 DAG.getBitcast(MVT::v4i32, Vec),
18492 DAG.getVectorIdxConstant(DWordIdx, dl));
18493 int ShiftVal = (IdxVal % 4) * 8;
18494 if (ShiftVal != 0)
18495 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18496 DAG.getConstant(ShiftVal, dl, MVT::i8));
18497 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18498 }
18499
18500 int WordIdx = IdxVal / 2;
18501 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18502 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18503 DAG.getBitcast(MVT::v8i16, Vec),
18504 DAG.getVectorIdxConstant(WordIdx, dl));
18505 int ShiftVal = (IdxVal % 2) * 8;
18506 if (ShiftVal != 0)
18507 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18508 DAG.getConstant(ShiftVal, dl, MVT::i8));
18509 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18510 }
18511 }
18512
18513 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18514 if (IdxVal == 0)
18515 return Op;
18516
18517 // Shuffle the element to the lowest element, then movss or movsh.
18518 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18519  Mask[0] = static_cast<int>(IdxVal);
18520 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18522 DAG.getVectorIdxConstant(0, dl));
18523 }
18524
18525 if (VT.getSizeInBits() == 64) {
18526 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18527 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18528 // to match extract_elt for f64.
18529 if (IdxVal == 0)
18530 return Op;
18531
18532 // UNPCKHPD the element to the lowest double word, then movsd.
18533 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18534 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18535 int Mask[2] = { 1, -1 };
18536 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18537 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18538 DAG.getVectorIdxConstant(0, dl));
18539 }
18540
18541 return SDValue();
18542}
18543
18544/// Insert one bit to mask vector, like v16i1 or v8i1.
18545/// AVX-512 feature.
18546 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18547  const X86Subtarget &Subtarget) {
18548 SDLoc dl(Op);
18549 SDValue Vec = Op.getOperand(0);
18550 SDValue Elt = Op.getOperand(1);
18551 SDValue Idx = Op.getOperand(2);
18552 MVT VecVT = Vec.getSimpleValueType();
18553
18554 if (!isa<ConstantSDNode>(Idx)) {
18555 // Non constant index. Extend source and destination,
18556 // insert element and then truncate the result.
18557 unsigned NumElts = VecVT.getVectorNumElements();
18558 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18559 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18560 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18561 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18562 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18563 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18564 }
18565
18566 // Copy into a k-register, extract to v1i1 and insert_subvector.
18567 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18568 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18569}
18570
18571SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18572 SelectionDAG &DAG) const {
18573 MVT VT = Op.getSimpleValueType();
18574 MVT EltVT = VT.getVectorElementType();
18575 unsigned NumElts = VT.getVectorNumElements();
18576 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18577
18578 if (EltVT == MVT::i1)
18579 return InsertBitToMaskVector(Op, DAG, Subtarget);
18580
18581 SDLoc dl(Op);
18582 SDValue N0 = Op.getOperand(0);
18583 SDValue N1 = Op.getOperand(1);
18584 SDValue N2 = Op.getOperand(2);
18585 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18586
18587 if (EltVT == MVT::bf16) {
18588 MVT IVT = VT.changeVectorElementTypeToInteger();
18589  SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18590 DAG.getBitcast(IVT, N0),
18591 DAG.getBitcast(MVT::i16, N1), N2);
18592 return DAG.getBitcast(VT, Res);
18593 }
18594
18595 if (!N2C) {
18596 // Variable insertion indices, usually we're better off spilling to stack,
18597 // but AVX512 can use a variable compare+select by comparing against all
18598 // possible vector indices, and FP insertion has less gpr->simd traffic.
18599 if (!(Subtarget.hasBWI() ||
18600 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18601 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18602 return SDValue();
18603
18604 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18605 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18606 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18607 return SDValue();
18608
18609 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18610 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18611 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18612
18613 SmallVector<SDValue, 16> RawIndices;
18614 for (unsigned I = 0; I != NumElts; ++I)
18615 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18616 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18617
18618 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18619 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18620 ISD::CondCode::SETEQ);
18621  }
18622
18623 if (N2C->getAPIntValue().uge(NumElts))
18624 return SDValue();
18625 uint64_t IdxVal = N2C->getZExtValue();
18626
18627 bool IsZeroElt = X86::isZeroNode(N1);
18628 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18629
18630 if (IsZeroElt || IsAllOnesElt) {
18631 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
18632 // We don't deal with i8 0 since it appears to be handled elsewhere.
18633 if (IsAllOnesElt &&
18634 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18635 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18636 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18637 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18638 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18639 CstVectorElts[IdxVal] = OnesCst;
18640 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18641 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18642 }
18643 // See if we can do this more efficiently with a blend shuffle with a
18644 // rematerializable vector.
18645 if (Subtarget.hasSSE41() &&
18646 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18647 SmallVector<int, 8> BlendMask;
18648 for (unsigned i = 0; i != NumElts; ++i)
18649 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18650 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18651 : getOnesVector(VT, DAG, dl);
18652 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18653 }
18654 }
18655
18656 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18657 // into that, and then insert the subvector back into the result.
18658 if (VT.is256BitVector() || VT.is512BitVector()) {
18659 // With a 256-bit vector, we can insert into the zero element efficiently
18660 // using a blend if we have AVX or AVX2 and the right data type.
18661 if (VT.is256BitVector() && IdxVal == 0) {
18662 // TODO: It is worthwhile to cast integer to floating point and back
18663 // and incur a domain crossing penalty if that's what we'll end up
18664 // doing anyway after extracting to a 128-bit vector.
18665 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18666 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18667 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18668 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18669 DAG.getTargetConstant(1, dl, MVT::i8));
18670 }
18671 }
18672
18673 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18674 assert(isPowerOf2_32(NumEltsIn128) &&
18675 "Vectors will always have power-of-two number of elements.");
18676
18677 // If we are not inserting into the low 128-bit vector chunk,
18678 // then prefer the broadcast+blend sequence.
18679 // FIXME: relax the profitability check iff all N1 uses are insertions.
18680 if (IdxVal >= NumEltsIn128 &&
18681 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18682 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18683 X86::mayFoldLoad(N1, Subtarget)))) {
18684 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18685 SmallVector<int, 8> BlendMask;
18686 for (unsigned i = 0; i != NumElts; ++i)
18687 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18688 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18689 }
18690
18691 // Get the desired 128-bit vector chunk.
18692 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18693
18694 // Insert the element into the desired chunk.
18695 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18696 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18697
18698 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18699 DAG.getVectorIdxConstant(IdxIn128, dl));
18700
18701 // Insert the changed part back into the bigger vector
18702 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18703 }
18704 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18705
18706 // This will be just movw/movd/movq/movsh/movss/movsd.
18707 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18708 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18709 EltVT == MVT::f16 || EltVT == MVT::i64) {
18710 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18711 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18712 }
18713
18714 // We can't directly insert an i8 or i16 into a vector, so zero extend
18715 // it to i32 first.
18716 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18717 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18718 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18719 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18720 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18721 return DAG.getBitcast(VT, N1);
18722 }
18723 }
18724
18725 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
18726 // argument. SSE41 required for pinsrb.
18727 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18728 unsigned Opc;
18729 if (VT == MVT::v8i16) {
18730 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18731 Opc = X86ISD::PINSRW;
18732 } else {
18733 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18734 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18735 Opc = X86ISD::PINSRB;
18736 }
18737
18738 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18739 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18740 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18741 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18742 }
18743
18744 if (Subtarget.hasSSE41()) {
18745 if (EltVT == MVT::f32) {
18746 // Bits [7:6] of the constant are the source select. This will always be
18747 // zero here. The DAG Combiner may combine an extract_elt index into
18748 // these bits. For example (insert (extract, 3), 2) could be matched by
18749 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18750 // Bits [5:4] of the constant are the destination select. This is the
18751 // value of the incoming immediate.
18752 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18753 // combine either bitwise AND or insert of float 0.0 to set these bits.
18754
18755 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18756 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18757 // If this is an insertion of 32-bits into the low 32-bits of
18758 // a vector, we prefer to generate a blend with immediate rather
18759 // than an insertps. Blends are simpler operations in hardware and so
18760 // will always have equal or better performance than insertps.
18761 // But if optimizing for size and there's a load folding opportunity,
18762 // generate insertps because blendps does not have a 32-bit memory
18763 // operand form.
18764 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18765 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18766 DAG.getTargetConstant(1, dl, MVT::i8));
18767 }
18768 // Create this as a scalar-to-vector.
18769 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18770 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18771 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18772 }
18773
18774 // PINSR* works with constant index.
18775 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18776 return Op;
18777 }
18778
18779 return SDValue();
18780}
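// A worked example of the INSERTPS immediate built above (an illustrative
// sketch, not additional lowering code): inserting a scalar into lane 2 of a
// v4f32 uses source select 0 in bits [7:6], destination select 2 in bits
// [5:4], and an empty zero mask in bits [3:0], i.e. IdxVal << 4 == 0x20.
// For IdxVal == 0 the code above instead prefers BLENDI with immediate 1,
// which takes lane 0 from the inserted value and lanes 1-3 from N0.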
18781
18782 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18783 SelectionDAG &DAG) {
18784 SDLoc dl(Op);
18785 MVT OpVT = Op.getSimpleValueType();
18786
18787 // It's always cheaper to replace an xor+movd with xorps, and doing so
18788 // simplifies further combines.
18789 if (X86::isZeroNode(Op.getOperand(0)))
18790 return getZeroVector(OpVT, Subtarget, DAG, dl);
18791
18792 // If this is a 256-bit vector result, first insert into a 128-bit
18793 // vector and then insert into the 256-bit vector.
18794 if (!OpVT.is128BitVector()) {
18795 // Insert into a 128-bit vector.
18796 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18797 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18798 OpVT.getVectorNumElements() / SizeFactor);
18799
18800 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18801
18802 // Insert the 128-bit vector.
18803 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18804 }
18805 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18806 "Expected an SSE type!");
18807
18808 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
18809 // tblgen.
18810 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18811 return Op;
18812
18813 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18814 return DAG.getBitcast(
18815 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18816}
18817
18818// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18819// simple superregister reference or explicit instructions to insert
18820// the upper bits of a vector.
18821 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18822 SelectionDAG &DAG) {
18823 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18824
18825 return insert1BitVector(Op, DAG, Subtarget);
18826}
18827
18828 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18829 SelectionDAG &DAG) {
18830 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18831 "Only vXi1 extract_subvectors need custom lowering");
18832
18833 SDLoc dl(Op);
18834 SDValue Vec = Op.getOperand(0);
18835 uint64_t IdxVal = Op.getConstantOperandVal(1);
18836
18837 if (IdxVal == 0) // the operation is legal
18838 return Op;
18839
18840 // Extend to natively supported kshift.
18841 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18842
18843 // Shift to the LSB.
18844 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18845 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18846
18847 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18848 DAG.getVectorIdxConstant(0, dl));
18849}
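// Illustrative sketch of the kshift lowering above, assuming a target with an
// 8-bit kshift (AVX512DQ) and a v8i1 source mask: extracting a v2i1 subvector
// at index 2 becomes
//
//   kshiftrb $2, %k0, %k0    ; move bits [3:2] down to bits [1:0]
//
// followed by an EXTRACT_SUBVECTOR at index 0, which is legal. Narrower masks
// are first widened by widenMaskVector so the kshift has a native width.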
18850
18851// Returns the appropriate wrapper opcode for a global reference.
18852unsigned X86TargetLowering::getGlobalWrapperKind(
18853 const GlobalValue *GV, const unsigned char OpFlags) const {
18854 // References to absolute symbols are never PC-relative.
18855 if (GV && GV->isAbsoluteSymbolRef())
18856 return X86ISD::Wrapper;
18857
18858 // The following OpFlags under RIP-rel PIC use RIP.
18859 if (Subtarget.isPICStyleRIPRel() &&
18860 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18861 OpFlags == X86II::MO_DLLIMPORT))
18862 return X86ISD::WrapperRIP;
18863
18864 // GOTPCREL references must always use RIP.
18865 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18866 return X86ISD::WrapperRIP;
18867
18868 return X86ISD::Wrapper;
18869}
18870
18871// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18872// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
18873// one of the above mentioned nodes. It has to be wrapped because otherwise
18874// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18875 // be used to form addressing modes. These wrapped nodes will be selected
18876// into MOV32ri.
18877SDValue
18878X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18879 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18880
18881 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18882 // global base reg.
18883 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18884
18885 auto PtrVT = getPointerTy(DAG.getDataLayout());
18886 SDValue Result = DAG.getTargetConstantPool(
18887 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18888 SDLoc DL(CP);
18889 Result =
18890 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18891 // With PIC, the address is actually $g + Offset.
18892 if (OpFlag) {
18893 Result =
18894 DAG.getNode(ISD::ADD, DL, PtrVT,
18895 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18896 }
18897
18898 return Result;
18899}
18900
18901SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18902 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18903
18904 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18905 // global base reg.
18906 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18907
18908 auto PtrVT = getPointerTy(DAG.getDataLayout());
18909 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18910 SDLoc DL(JT);
18911 Result =
18912 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18913
18914 // With PIC, the address is actually $g + Offset.
18915 if (OpFlag)
18916 Result =
18917 DAG.getNode(ISD::ADD, DL, PtrVT,
18918 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18919
18920 return Result;
18921}
18922
18923SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18924 SelectionDAG &DAG) const {
18925 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18926}
18927
18928SDValue
18929X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18930 // Create the TargetBlockAddressAddress node.
18931 unsigned char OpFlags =
18932 Subtarget.classifyBlockAddressReference();
18933 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18934 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18935 SDLoc dl(Op);
18936 auto PtrVT = getPointerTy(DAG.getDataLayout());
18937 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18938 Result =
18939 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18940
18941 // With PIC, the address is actually $g + Offset.
18942 if (isGlobalRelativeToPICBase(OpFlags)) {
18943 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18944 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18945 }
18946
18947 return Result;
18948}
18949
18950/// Creates target global address or external symbol nodes for calls or
18951/// other uses.
18952SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18953 bool ForCall) const {
18954 // Unpack the global address or external symbol.
18955 SDLoc dl(Op);
18956 const GlobalValue *GV = nullptr;
18957 int64_t Offset = 0;
18958 const char *ExternalSym = nullptr;
18959 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18960 GV = G->getGlobal();
18961 Offset = G->getOffset();
18962 } else {
18963 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18964 ExternalSym = ES->getSymbol();
18965 }
18966
18967 // Calculate some flags for address lowering.
18968 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18969 unsigned char OpFlags;
18970 if (ForCall)
18971 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18972 else
18973 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18974 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18975 bool NeedsLoad = isGlobalStubReference(OpFlags);
18976
18977 CodeModel::Model M = DAG.getTarget().getCodeModel();
18978 auto PtrVT = getPointerTy(DAG.getDataLayout());
18979 SDValue Result;
18980
18981 if (GV) {
18982 // Create a target global address if this is a global. If possible, fold the
18983 // offset into the global address reference. Otherwise, ADD it on later.
18984 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18985 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18986 // relocation will compute to a negative value, which is invalid.
18987 int64_t GlobalOffset = 0;
18988 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18989 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
18990 std::swap(GlobalOffset, Offset);
18991 }
18992 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18993 } else {
18994 // If this is not a global address, this must be an external symbol.
18995 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18996 }
18997
18998 // If this is a direct call, avoid the wrapper if we don't need to do any
18999 // loads or adds. This allows SDAG ISel to match direct calls.
19000 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
19001 return Result;
19002
19003 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
19004
19005 // With PIC, the address is actually $g + Offset.
19006 if (HasPICReg) {
19007 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
19008 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
19009 }
19010
19011 // For globals that require a load from a stub to get the address, emit the
19012 // load.
19013 if (NeedsLoad)
19014 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
19015 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19016
19017 // If there was a non-zero offset that we didn't fold, create an explicit
19018 // addition for it.
19019 if (Offset != 0)
19020 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19021 DAG.getSignedConstant(Offset, dl, PtrVT));
19022
19023 return Result;
19024}
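// Rough shapes of the addresses produced above (a sketch; the exact operand
// flags come from classifyGlobalReference / classifyGlobalFunctionReference):
//
//   no PIC, no stub:      Wrapper(TargetGlobalAddress g+off)
//   32-bit PIC via GOT:   load(GlobalBaseReg + Wrapper(g, GOT flag)), with any
//                         unfolded offset added afterwards
//   RIP-relative PIC:     WrapperRIP(g), i.e. g(%rip)
//
// Direct calls that need no load, no PIC base and have a zero offset return
// the raw target node so instruction selection can fold it into the call.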
19025
19026SDValue
19027X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19028 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19029}
19030
19031 static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19032 const EVT PtrVT, unsigned ReturnReg,
19033 unsigned char OperandFlags,
19034 bool LoadGlobalBaseReg = false,
19035 bool LocalDynamic = false) {
19036 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19037 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19038 SDLoc dl(GA);
19039 SDValue TGA;
19040 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19041 SDValue Chain = DAG.getEntryNode();
19042 SDValue Ret;
19043 if (LocalDynamic && UseTLSDESC) {
19044 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19045 // Reuse existing GetTLSADDR node if we can find it.
19046 if (TGA->hasOneUse()) {
19047 // TLSDESC uses TGA.
19048 SDNode *TLSDescOp = *TGA->user_begin();
19049 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19050 "Unexpected TLSDESC DAG");
19051 // CALLSEQ_END uses TGA via a chain and glue.
19052 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19053 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19054 "Unexpected TLSDESC DAG");
19055 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19056 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19057 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19058 "Unexpected TLSDESC DAG");
19059 Ret = SDValue(CopyFromRegOp, 0);
19060 }
19061 } else {
19062 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19063 GA->getOffset(), OperandFlags);
19064 }
19065
19066 if (!Ret) {
19067 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19068 : LocalDynamic ? X86ISD::TLSBASEADDR
19069 : X86ISD::TLSADDR;
19070
19071 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19072 if (LoadGlobalBaseReg) {
19073 SDValue InGlue;
19074 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19075 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19076 InGlue);
19077 InGlue = Chain.getValue(1);
19078 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19079 } else {
19080 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19081 }
19082 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19083
19084 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
19085 MFI.setHasCalls(true);
19086
19087 SDValue Glue = Chain.getValue(1);
19088 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19089 }
19090
19091 if (!UseTLSDESC)
19092 return Ret;
19093
19094 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19095 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19096
19097 Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19098 SDValue Offset =
19099 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19100 MachinePointerInfo(Ptr));
19101 return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19102}
19103
19104// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19105static SDValue
19106 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19107 const EVT PtrVT) {
19108 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19109 /*LoadGlobalBaseReg=*/true);
19110}
19111
19112// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19113static SDValue
19114 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19115 const EVT PtrVT) {
19116 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19117}
19118
19119// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19120static SDValue
19121 LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19122 const EVT PtrVT) {
19123 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19124}
19125
19126 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19127 SelectionDAG &DAG, const EVT PtrVT,
19128 bool Is64Bit, bool Is64BitLP64) {
19129 SDLoc dl(GA);
19130
19131 // Get the start address of the TLS block for this module.
19132 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
19133 .getInfo<X86MachineFunctionInfo>();
19134 MFI->incNumLocalDynamicTLSAccesses();
19135
19136 SDValue Base;
19137 if (Is64Bit) {
19138 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19139 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19140 /*LoadGlobalBaseReg=*/false,
19141 /*LocalDynamic=*/true);
19142 } else {
19143 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19144 /*LoadGlobalBaseReg=*/true,
19145 /*LocalDynamic=*/true);
19146 }
19147
19148 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19149 // of Base.
19150
19151 // Build x@dtpoff.
19152 unsigned char OperandFlags = X86II::MO_DTPOFF;
19153 unsigned WrapperKind = X86ISD::Wrapper;
19154 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19155 GA->getValueType(0),
19156 GA->getOffset(), OperandFlags);
19157 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19158
19159 // Add x@dtpoff with the base.
19160 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19161}
19162
19163// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19164 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19165 const EVT PtrVT, TLSModel::Model model,
19166 bool is64Bit, bool isPIC) {
19167 SDLoc dl(GA);
19168
19169 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19170 Value *Ptr = Constant::getNullValue(
19171 PointerType::get(*DAG.getContext(), is64Bit ? X86AS::FS : X86AS::GS));
19172
19173 SDValue ThreadPointer =
19174 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19175 MachinePointerInfo(Ptr));
19176
19177 unsigned char OperandFlags = 0;
19178 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19179 // initialexec.
19180 unsigned WrapperKind = X86ISD::Wrapper;
19181 if (model == TLSModel::LocalExec) {
19182 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19183 } else if (model == TLSModel::InitialExec) {
19184 if (is64Bit) {
19185 OperandFlags = X86II::MO_GOTTPOFF;
19186 WrapperKind = X86ISD::WrapperRIP;
19187 } else {
19188 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19189 }
19190 } else {
19191 llvm_unreachable("Unexpected model");
19192 }
19193
19194 // emit "addl x@ntpoff,%eax" (local exec)
19195 // or "addl x@indntpoff,%eax" (initial exec)
19196 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
19197 SDValue TGA =
19198 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19199 GA->getOffset(), OperandFlags);
19200 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19201
19202 if (model == TLSModel::InitialExec) {
19203 if (isPIC && !is64Bit) {
19204 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19205 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19206 Offset);
19207 }
19208
19209 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19210 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
19211 }
19212
19213 // The address of the thread local variable is the add of the thread
19214 // pointer with the offset of the variable.
19215 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19216}
19217
19218SDValue
19219X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19220
19221 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19222
19223 if (DAG.getTarget().useEmulatedTLS())
19224 return LowerToTLSEmulatedModel(GA, DAG);
19225
19226 const GlobalValue *GV = GA->getGlobal();
19227 auto PtrVT = getPointerTy(DAG.getDataLayout());
19228 bool PositionIndependent = isPositionIndependent();
19229
19230 if (Subtarget.isTargetELF()) {
19231 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19232 switch (model) {
19233 case TLSModel::GeneralDynamic:
19234 if (Subtarget.is64Bit()) {
19235 if (Subtarget.isTarget64BitLP64())
19236 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19237 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19238 }
19239 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19240 case TLSModel::LocalDynamic:
19241 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19242 Subtarget.isTarget64BitLP64());
19243 case TLSModel::InitialExec:
19244 case TLSModel::LocalExec:
19245 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19246 PositionIndependent);
19247 }
19248 llvm_unreachable("Unknown TLS model.");
19249 }
19250
19251 if (Subtarget.isTargetDarwin()) {
19252 // Darwin only has one model of TLS. Lower to that.
19253 unsigned char OpFlag = 0;
19254 unsigned WrapperKind = 0;
19255
19256 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19257 // global base reg.
19258 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19259 if (PIC32) {
19260 OpFlag = X86II::MO_TLVP_PIC_BASE;
19261 WrapperKind = X86ISD::Wrapper;
19262 } else {
19263 OpFlag = X86II::MO_TLVP;
19264 WrapperKind = X86ISD::WrapperRIP;
19265 }
19266 SDLoc DL(Op);
19267 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19268 GA->getValueType(0),
19269 GA->getOffset(), OpFlag);
19270 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19271
19272 // With PIC32, the address is actually $g + Offset.
19273 if (PIC32)
19274 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19275 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19276 Offset);
19277
19278 // Lowering the machine ISD node will make sure everything is in the
19279 // right location.
19280 SDValue Chain = DAG.getEntryNode();
19281 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19282 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19283 SDValue Args[] = { Chain, Offset };
19284 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19285 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19286
19287 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
19288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19289 MFI.setAdjustsStack(true);
19290
19291 // And our return value (tls address) is in the standard call return value
19292 // location.
19293 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19294 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19295 }
19296
19297 if (Subtarget.isOSWindows()) {
19298 // Just use the implicit TLS architecture
19299 // Need to generate something similar to:
19300 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19301 // ; from TEB
19302 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
19303 // mov rcx, qword [rdx+rcx*8]
19304 // mov eax, .tls$:tlsvar
19305 // [rax+rcx] contains the address
19306 // Windows 64bit: gs:0x58
19307 // Windows 32bit: fs:__tls_array
19308
19309 SDLoc dl(GA);
19310 SDValue Chain = DAG.getEntryNode();
19311
19312 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19313 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19314 // use its literal value of 0x2C.
19315 Value *Ptr = Constant::getNullValue(
19316 Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19317 : PointerType::get(*DAG.getContext(), X86AS::FS));
19318
19319 SDValue TlsArray = Subtarget.is64Bit()
19320 ? DAG.getIntPtrConstant(0x58, dl)
19321 : (Subtarget.isTargetWindowsGNU()
19322 ? DAG.getIntPtrConstant(0x2C, dl)
19323 : DAG.getExternalSymbol("_tls_array", PtrVT));
19324
19325 SDValue ThreadPointer =
19326 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19327
19328 SDValue res;
19329 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19330 res = ThreadPointer;
19331 } else {
19332 // Load the _tls_index variable
19333 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19334 if (Subtarget.is64Bit())
19335 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19336 MachinePointerInfo(), MVT::i32);
19337 else
19338 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19339
19340 const DataLayout &DL = DAG.getDataLayout();
19341 SDValue Scale =
19342 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19343 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19344
19345 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19346 }
19347
19348 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19349
19350 // Get the offset of start of .tls section
19351 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19352 GA->getValueType(0),
19353 X86II::MO_SECREL);
19354 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19355
19356 // The address of the thread local variable is the add of the thread
19357 // pointer with the offset of the variable.
19358 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19359 }
19360
19361 llvm_unreachable("TLS not implemented for this target.");
19362}
19363
19364 bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19365 if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19366 const TargetMachine &TM = getTargetMachine();
19367 TLSModel::Model Model = TM.getTLSModel(&GV);
19368 switch (Model) {
19369 case TLSModel::LocalExec:
19370 case TLSModel::InitialExec:
19371 // We can include the %fs segment register in addressing modes.
19372 return true;
19373 case TLSModel::GeneralDynamic:
19374 case TLSModel::LocalDynamic:
19375 // These models do not result in %fs relative addresses unless
19376 // TLS descriptors are used.
19377 //
19378 // Even in the case of TLS descriptors we currently have no way to model
19379 // the difference between %fs access and the computations needed for the
19380 // offset, and returning `true` for TLS-desc currently duplicates both,
19381 // which is detrimental :-/
19382 return false;
19383 }
19384 }
19385 return false;
19386}
19387
19388/// Lower SRA_PARTS and friends, which return two i32 values
19389/// and take a 2 x i32 value to shift plus a shift amount.
19390/// TODO: Can this be moved to general expansion code?
19391 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19392 SDValue Lo, Hi;
19393 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19394 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19395}
19396
19397// Try to use a packed vector operation to handle i64 on 32-bit targets when
19398// AVX512DQ is enabled.
19399 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19400 SelectionDAG &DAG,
19401 const X86Subtarget &Subtarget) {
19402 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19403 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19404 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19405 Op.getOpcode() == ISD::UINT_TO_FP) &&
19406 "Unexpected opcode!");
19407 bool IsStrict = Op->isStrictFPOpcode();
19408 unsigned OpNo = IsStrict ? 1 : 0;
19409 SDValue Src = Op.getOperand(OpNo);
19410 MVT SrcVT = Src.getSimpleValueType();
19411 MVT VT = Op.getSimpleValueType();
19412
19413 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19414 (VT != MVT::f32 && VT != MVT::f64))
19415 return SDValue();
19416
19417 // Pack the i64 into a vector, do the operation and extract.
19418
19419 // Using 256-bit to ensure the result is 128 bits for the f32 case.
19420 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19421 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19422 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19423
19424 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19425 if (IsStrict) {
19426 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19427 {Op.getOperand(0), InVec});
19428 SDValue Chain = CvtVec.getValue(1);
19429 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19430 DAG.getVectorIdxConstant(0, dl));
19431 return DAG.getMergeValues({Value, Chain}, dl);
19432 }
19433
19434 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19435
19436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19437 DAG.getVectorIdxConstant(0, dl));
19438}
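// Sketch of the widening above for a 32-bit target with AVX512DQ but no VLX,
// converting a scalar i64 to f64 (illustrative only):
//
//   v8i64 v = scalar_to_vector(x)          ; only lane 0 is defined
//   v8f64 c = sint_to_fp(v)                ; vcvtqq2pd on a ZMM register
//   f64   r = extract_vector_elt(c, 0)
//
// With VLX, NumElts is 4 so the conversion stays in a YMM register, keeping
// the f32 result in a 128-bit vector.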
19439
19440// Try to use a packed vector operation to handle i64 on 32-bit targets.
19441 static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19442 const X86Subtarget &Subtarget) {
19443 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19444 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19445 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19446 Op.getOpcode() == ISD::UINT_TO_FP) &&
19447 "Unexpected opcode!");
19448 bool IsStrict = Op->isStrictFPOpcode();
19449 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19450 MVT SrcVT = Src.getSimpleValueType();
19451 MVT VT = Op.getSimpleValueType();
19452
19453 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19454 return SDValue();
19455
19456 // Pack the i64 into a vector, do the operation and extract.
19457
19458 assert(Subtarget.hasFP16() && "Expected FP16");
19459
19460 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19461 if (IsStrict) {
19462 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19463 {Op.getOperand(0), InVec});
19464 SDValue Chain = CvtVec.getValue(1);
19465 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19466 DAG.getVectorIdxConstant(0, dl));
19467 return DAG.getMergeValues({Value, Chain}, dl);
19468 }
19469
19470 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19471
19472 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19473 DAG.getVectorIdxConstant(0, dl));
19474}
19475
19476static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19477 const X86Subtarget &Subtarget) {
19478 switch (Opcode) {
19479 case ISD::SINT_TO_FP:
19480 // TODO: Handle wider types with AVX/AVX512.
19481 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19482 return false;
19483 // CVTDQ2PS or (V)CVTDQ2PD
19484 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19485
19486 case ISD::UINT_TO_FP:
19487 // TODO: Handle wider types and i64 elements.
19488 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19489 return false;
19490 // VCVTUDQ2PS or VCVTUDQ2PD
19491 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19492
19493 default:
19494 return false;
19495 }
19496}
19497
19498/// Given a scalar cast operation that is extracted from a vector, try to
19499/// vectorize the cast op followed by extraction. This will avoid an expensive
19500/// round-trip between XMM and GPR.
19501 static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19502 SelectionDAG &DAG,
19503 const X86Subtarget &Subtarget) {
19504 // TODO: This could be enhanced to handle smaller integer types by peeking
19505 // through an extend.
19506 SDValue Extract = Cast.getOperand(0);
19507 MVT DestVT = Cast.getSimpleValueType();
19508 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19509 !isa<ConstantSDNode>(Extract.getOperand(1)))
19510 return SDValue();
19511
19512 // See if we have a 128-bit vector cast op for this type of cast.
19513 SDValue VecOp = Extract.getOperand(0);
19514 MVT FromVT = VecOp.getSimpleValueType();
19515 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19516 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19517 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19518 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19519 return SDValue();
19520
19521 // If we are extracting from a non-zero element, first shuffle the source
19522 // vector to allow extracting from element zero.
19523 if (!isNullConstant(Extract.getOperand(1))) {
19524 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19525 Mask[0] = Extract.getConstantOperandVal(1);
19526 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19527 }
19528 // If the source vector is wider than 128-bits, extract the low part. Do not
19529 // create an unnecessarily wide vector cast op.
19530 if (FromVT != Vec128VT)
19531 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19532
19533 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19534 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19535 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19536 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19537 DAG.getVectorIdxConstant(0, DL));
19538}
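// Example of the rewrite above in IR terms (illustrative only):
//
//   %e = extractelement <4 x i32> %v, i64 2
//   %f = sitofp i32 %e to float
//
// becomes, conceptually,
//
//   %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, ...>
//   %c = sitofp <4 x i32> %s to <4 x float>      ; cvtdq2ps, stays in XMM
//   %f = extractelement <4 x float> %c, i64 0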
19539
19540/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19541/// try to vectorize the cast ops. This will avoid an expensive round-trip
19542/// between XMM and GPR.
19543static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19544 SelectionDAG &DAG,
19545 const X86Subtarget &Subtarget) {
19546 // TODO: Allow FP_TO_UINT.
19547 SDValue CastToInt = CastToFP.getOperand(0);
19548 MVT VT = CastToFP.getSimpleValueType();
19549 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19550 return SDValue();
19551
19552 MVT IntVT = CastToInt.getSimpleValueType();
19553 SDValue X = CastToInt.getOperand(0);
19554 MVT SrcVT = X.getSimpleValueType();
19555 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19556 return SDValue();
19557
19558 // See if we have 128-bit vector cast instructions for this type of cast.
19559 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19560 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19561 IntVT != MVT::i32)
19562 return SDValue();
19563
19564 unsigned SrcSize = SrcVT.getSizeInBits();
19565 unsigned IntSize = IntVT.getSizeInBits();
19566 unsigned VTSize = VT.getSizeInBits();
19567 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19568 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19569 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19570
19571 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19572 unsigned ToIntOpcode =
19573 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19574 unsigned ToFPOpcode =
19575 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19576
19577 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19578 //
19579 // We are not defining the high elements (for example, zeroing them) because
19580 // that could nullify any performance advantage that we hoped to gain from
19581 // this vector op hack. We do not expect any adverse effects (like denorm
19582 // penalties) with cast ops.
19583 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19584 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19585 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19586 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19587 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19588}
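// Example of the pattern handled above (illustrative): for scalar code like
//
//   double d = (double)(int)x;    // x is a double
//
// the fptosi+sitofp round-trip is kept in XMM registers as roughly
// cvttpd2dq followed by cvtdq2pd on lane 0, instead of bouncing the integer
// through a GPR.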
19589
19590 static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19591 SelectionDAG &DAG,
19592 const X86Subtarget &Subtarget) {
19593 bool IsStrict = Op->isStrictFPOpcode();
19594 MVT VT = Op->getSimpleValueType(0);
19595 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19596
19597 if (Subtarget.hasDQI()) {
19598 assert(!Subtarget.hasVLX() && "Unexpected features");
19599
19600 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19601 Src.getSimpleValueType() == MVT::v4i64) &&
19602 "Unsupported custom type");
19603
19604 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19605 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19606 "Unexpected VT!");
19607 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19608
19609 // Need to concat with zero vector for strict fp to avoid spurious
19610 // exceptions.
19611 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19612 : DAG.getUNDEF(MVT::v8i64);
19613 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19614 DAG.getVectorIdxConstant(0, DL));
19615 SDValue Res, Chain;
19616 if (IsStrict) {
19617 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19618 {Op->getOperand(0), Src});
19619 Chain = Res.getValue(1);
19620 } else {
19621 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19622 }
19623
19624 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19625 DAG.getVectorIdxConstant(0, DL));
19626
19627 if (IsStrict)
19628 return DAG.getMergeValues({Res, Chain}, DL);
19629 return Res;
19630 }
19631
19632 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19633 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19634 if (VT != MVT::v4f32 || IsSigned)
19635 return SDValue();
19636
19637 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19638 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19639 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19640 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19641 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19642 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19643 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19644 SmallVector<SDValue, 4> SignCvts(4);
19645 SmallVector<SDValue, 4> Chains(4);
19646 for (int i = 0; i != 4; ++i) {
19647 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19648 DAG.getVectorIdxConstant(i, DL));
19649 if (IsStrict) {
19650 SignCvts[i] =
19651 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19652 {Op.getOperand(0), Elt});
19653 Chains[i] = SignCvts[i].getValue(1);
19654 } else {
19655 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19656 }
19657 }
19658 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19659
19660 SDValue Slow, Chain;
19661 if (IsStrict) {
19662 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19663 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19664 {Chain, SignCvt, SignCvt});
19665 Chain = Slow.getValue(1);
19666 } else {
19667 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19668 }
19669
19670 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19671 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19672
19673 if (IsStrict)
19674 return DAG.getMergeValues({Cvt, Chain}, DL);
19675
19676 return Cvt;
19677}
19678
19679 static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19680 SelectionDAG &DAG) {
19681 bool IsStrict = Op->isStrictFPOpcode();
19682 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19683 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19684 MVT VT = Op.getSimpleValueType();
19685 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19686
19687 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
19688 if (IsStrict)
19689 return DAG.getNode(
19690 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19691 {Chain,
19692 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19693 Rnd});
19694 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19695 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19696}
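// Sketch of the promotion above for soft-f16 targets: a conversion such as
// "uitofp i32 %x to half" is emitted as the same conversion to f32 followed
// by an FP_ROUND back to f16, roughly h = (half)(float)uitofp(x); the strict
// variants thread the chain through both nodes to preserve exception order.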
19697
19698static bool isLegalConversion(MVT VT, bool IsSigned,
19699 const X86Subtarget &Subtarget) {
19700 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19701 return true;
19702 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19703 return true;
19704 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19705 return true;
19706 if (Subtarget.useAVX512Regs()) {
19707 if (VT == MVT::v16i32)
19708 return true;
19709 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19710 return true;
19711 }
19712 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19713 (VT == MVT::v2i64 || VT == MVT::v4i64))
19714 return true;
19715 return false;
19716}
19717
19718SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19719 SelectionDAG &DAG) const {
19720 bool IsStrict = Op->isStrictFPOpcode();
19721 unsigned OpNo = IsStrict ? 1 : 0;
19722 SDValue Src = Op.getOperand(OpNo);
19723 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19724 MVT SrcVT = Src.getSimpleValueType();
19725 MVT VT = Op.getSimpleValueType();
19726 SDLoc dl(Op);
19727
19728 if (isSoftF16(VT, Subtarget))
19729 return promoteXINT_TO_FP(Op, dl, DAG);
19730 else if (isLegalConversion(SrcVT, true, Subtarget))
19731 return Op;
19732
19733 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19734 return LowerWin64_INT128_TO_FP(Op, DAG);
19735
19736 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19737 return Extract;
19738
19739 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19740 return R;
19741
19742 if (SrcVT.isVector()) {
19743 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19744 // Note: Since v2f64 is a legal type, we don't need to zero extend the
19745 // source for strict FP.
19746 if (IsStrict)
19747 return DAG.getNode(
19748 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19749 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19750 DAG.getUNDEF(SrcVT))});
19751 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19752 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19753 DAG.getUNDEF(SrcVT)));
19754 }
19755 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19756 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19757
19758 return SDValue();
19759 }
19760
19761 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19762 "Unknown SINT_TO_FP to lower!");
19763
19764 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19765
19766 // These are really Legal; return the operand so the caller accepts it as
19767 // Legal.
19768 if (SrcVT == MVT::i32 && UseSSEReg)
19769 return Op;
19770 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19771 return Op;
19772
19773 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19774 return V;
19775 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19776 return V;
19777
19778 // SSE doesn't have an i16 conversion so we need to promote.
19779 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19780 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19781 if (IsStrict)
19782 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19783 {Chain, Ext});
19784
19785 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19786 }
19787
19788 if (VT == MVT::f128 || !Subtarget.hasX87())
19789 return SDValue();
19790
19791 SDValue ValueToStore = Src;
19792 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19793 // Bitcasting to f64 here allows us to do a single 64-bit store from
19794 // an SSE register, avoiding the store forwarding penalty that would come
19795 // with two 32-bit stores.
19796 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19797
19798 unsigned Size = SrcVT.getStoreSize();
19799 Align Alignment(Size);
19800 MachineFunction &MF = DAG.getMachineFunction();
19801 auto PtrVT = getPointerTy(MF.getDataLayout());
19802 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19803 MachinePointerInfo MPI =
19804 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19805 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19806 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19807 std::pair<SDValue, SDValue> Tmp =
19808 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19809
19810 if (IsStrict)
19811 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19812
19813 return Tmp.first;
19814}
19815
19816std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19817 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19818 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19819 // Build the FILD
19820 SDVTList Tys;
19821 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19822 if (useSSE)
19823 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19824 else
19825 Tys = DAG.getVTList(DstVT, MVT::Other);
19826
19827 SDValue FILDOps[] = {Chain, Pointer};
19828 SDValue Result =
19829 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19830 Alignment, MachineMemOperand::MOLoad);
19831 Chain = Result.getValue(1);
19832
19833 if (useSSE) {
19834 MachineFunction &MF = DAG.getMachineFunction();
19835 unsigned SSFISize = DstVT.getStoreSize();
19836 int SSFI =
19837 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19838 auto PtrVT = getPointerTy(MF.getDataLayout());
19839 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19840 Tys = DAG.getVTList(MVT::Other);
19841 SDValue FSTOps[] = {Chain, Result, StackSlot};
19842 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
19843 MachinePointerInfo::getFixedStack(MF, SSFI),
19844 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19845
19846 Chain =
19847 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19848 Result = DAG.getLoad(
19849 DstVT, DL, Chain, StackSlot,
19850 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19851 Chain = Result.getValue(1);
19852 }
19853
19854 return { Result, Chain };
19855}
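// Shape of the code BuildFILD produces for an i64 source (a sketch; the slot
// names below are illustrative frame indices, not real symbols):
//
//   fild  qword ptr [slot]      ; X86ISD::FILD: integer load onto the x87 stack
//   ; only when the destination type normally lives in SSE registers:
//   fstp  qword ptr [tmp]       ; X86ISD::FST narrows the f80 result to DstVT
//   movsd xmm0, qword ptr [tmp] ; reload into an SSE register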
19856
19857/// Horizontal vector math instructions may be slower than normal math with
19858/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19859/// implementation, and likely shuffle complexity of the alternate sequence.
19860static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19861 const X86Subtarget &Subtarget) {
19862 bool IsOptimizingSize = DAG.shouldOptForSize();
19863 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19864 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19865}
19866
19867/// 64-bit unsigned integer to double expansion.
19868 static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19869 SelectionDAG &DAG,
19870 const X86Subtarget &Subtarget) {
19871 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19872 // when converting 0 while rounding toward negative infinity. The caller will
19873 // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19874 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19875 // This algorithm is not obvious. Here it is what we're trying to output:
19876 /*
19877 movq %rax, %xmm0
19878 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19879 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19880 #ifdef __SSE3__
19881 haddpd %xmm0, %xmm0
19882 #else
19883 pshufd $0x4e, %xmm0, %xmm1
19884 addpd %xmm1, %xmm0
19885 #endif
19886 */
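  // For reference, the same trick as scalar C (a sketch, not part of the
  // lowering; bit_cast stands in for std::bit_cast): the magic exponents pin
  // the two 32-bit halves of the input into the mantissas of 2^52 and 2^84,
  // so subtracting those constants and adding the halves reconstructs the
  // value with a single final rounding:
  //
  //   double lo = bit_cast<double>(0x4330000000000000ULL | (x & 0xffffffff));
  //   double hi = bit_cast<double>(0x4530000000000000ULL | (x >> 32));
  //   return (hi - 0x1.0p84) + (lo - 0x1.0p52);   // == (double)x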
19887
19888 LLVMContext *Context = DAG.getContext();
19889
19890 // Build some magic constants.
19891 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19892 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19893 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19894 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19895
19896 SmallVector<Constant*,2> CV1;
19897 CV1.push_back(
19898 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19899 APInt(64, 0x4330000000000000ULL))));
19900 CV1.push_back(
19901 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19902 APInt(64, 0x4530000000000000ULL))));
19903 Constant *C1 = ConstantVector::get(CV1);
19904 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19905
19906 // Load the 64-bit value into an XMM register.
19907 SDValue XR1 =
19908 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19909 SDValue CLod0 = DAG.getLoad(
19910 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19911 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19912 SDValue Unpck1 =
19913 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19914
19915 SDValue CLod1 = DAG.getLoad(
19916 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19917 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19918 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19919 // TODO: Are there any fast-math-flags to propagate here?
19920 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19921 SDValue Result;
19922
19923 if (Subtarget.hasSSE3() &&
19924 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19925 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19926 } else {
19927 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19928 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19929 }
19930 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19931 DAG.getVectorIdxConstant(0, dl));
19932 return Result;
19933}
19934
19935/// 32-bit unsigned integer to float expansion.
19936 static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19937 SelectionDAG &DAG,
19938 const X86Subtarget &Subtarget) {
19939 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19940 // FP constant to bias correct the final result.
19941 SDValue Bias = DAG.getConstantFP(
19942 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19943
19944 // Load the 32-bit value into an XMM register.
19945 SDValue Load =
19946 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19947
19948 // Zero out the upper parts of the register.
19949 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19950
19951 // Or the load with the bias.
19952 SDValue Or = DAG.getNode(
19953 ISD::OR, dl, MVT::v2i64,
19954 DAG.getBitcast(MVT::v2i64, Load),
19955 DAG.getBitcast(MVT::v2i64,
19956 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19957 Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19958 DAG.getBitcast(MVT::v2f64, Or),
19959 DAG.getVectorIdxConstant(0, dl));
19960
19961 if (Op.getNode()->isStrictFPOpcode()) {
19962 // Subtract the bias.
19963 // TODO: Are there any fast-math-flags to propagate here?
19964 SDValue Chain = Op.getOperand(0);
19965 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19966 {Chain, Or, Bias});
19967
19968 if (Op.getValueType() == Sub.getValueType())
19969 return Sub;
19970
19971 // Handle final rounding.
19972 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19973 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19974
19975 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19976 }
19977
19978 // Subtract the bias.
19979 // TODO: Are there any fast-math-flags to propagate here?
19980 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19981
19982 // Handle final rounding.
19983 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19984}
19985
19986 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19987 SelectionDAG &DAG,
19988 const X86Subtarget &Subtarget) {
19989 if (Op.getSimpleValueType() != MVT::v2f64)
19990 return SDValue();
19991
19992 bool IsStrict = Op->isStrictFPOpcode();
19993
19994 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19995 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19996
19997 if (Subtarget.hasAVX512()) {
19998 if (!Subtarget.hasVLX()) {
19999 // Let generic type legalization widen this.
20000 if (!IsStrict)
20001 return SDValue();
20002 // Otherwise pad the integer input with 0s and widen the operation.
20003 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20004 DAG.getConstant(0, DL, MVT::v2i32));
20005 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
20006 {Op.getOperand(0), N0});
20007 SDValue Chain = Res.getValue(1);
20008 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
20009 DAG.getVectorIdxConstant(0, DL));
20010 return DAG.getMergeValues({Res, Chain}, DL);
20011 }
20012
20013 // Legalize to v4i32 type.
20014 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
20015 DAG.getUNDEF(MVT::v2i32));
20016 if (IsStrict)
20017 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20018 {Op.getOperand(0), N0});
20019 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20020 }
20021
20022 // Zero extend to v2i64, then OR with the floating point representation of
20023 // 2^52. This gives us the floating point equivalent of 2^52 + the i32 integer,
20024 // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20025 // point, leaving just our i32 integers in double format.
20026 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20027 SDValue VBias = DAG.getConstantFP(
20028 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20029 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20030 DAG.getBitcast(MVT::v2i64, VBias));
20031 Or = DAG.getBitcast(MVT::v2f64, Or);
20032
20033 if (IsStrict)
20034 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20035 {Op.getOperand(0), Or, VBias});
20036 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20037}
20038
20039 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20040 SelectionDAG &DAG,
20041 const X86Subtarget &Subtarget) {
20042 bool IsStrict = Op->isStrictFPOpcode();
20043 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20044 MVT VecIntVT = V.getSimpleValueType();
20045 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20046 "Unsupported custom type");
20047
20048 if (Subtarget.hasAVX512()) {
20049 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20050 assert(!Subtarget.hasVLX() && "Unexpected features");
20051 MVT VT = Op->getSimpleValueType(0);
20052
20053 // v8i32->v8f64 is legal with AVX512 so just return it.
20054 if (VT == MVT::v8f64)
20055 return Op;
20056
20057 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20058 "Unexpected VT!");
20059 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20060 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20061 // Need to concat with zero vector for strict fp to avoid spurious
20062 // exceptions.
20063 SDValue Tmp =
20064 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20065 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20066 DAG.getVectorIdxConstant(0, DL));
20067 SDValue Res, Chain;
20068 if (IsStrict) {
20069 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20070 {Op->getOperand(0), V});
20071 Chain = Res.getValue(1);
20072 } else {
20073 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20074 }
20075
20076 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20077 DAG.getVectorIdxConstant(0, DL));
20078
20079 if (IsStrict)
20080 return DAG.getMergeValues({Res, Chain}, DL);
20081 return Res;
20082 }
20083
20084 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20085 Op->getSimpleValueType(0) == MVT::v4f64) {
20086 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20087 Constant *Bias = ConstantFP::get(
20088 *DAG.getContext(),
20089 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20090 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20091 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20092 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20093 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20094 SDValue VBias = DAG.getMemIntrinsicNode(
20095 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20096 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
20097 MachineMemOperand::MOLoad);
20098
20099 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20100 DAG.getBitcast(MVT::v4i64, VBias));
20101 Or = DAG.getBitcast(MVT::v4f64, Or);
20102
20103 if (IsStrict)
20104 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20105 {Op.getOperand(0), Or, VBias});
20106 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20107 }
20108
20109 // The algorithm is the following:
20110 // #ifdef __SSE4_1__
20111 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20112 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20113 // (uint4) 0x53000000, 0xaa);
20114 // #else
20115 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20116 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20117 // #endif
20118 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20119 // return (float4) lo + fhi;
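  // Worked example for one lane (illustrative): take v = 0x00012345, so
  // m = v & 0xffff = 0x2345 and h = v >> 16 = 0x1. Then
  //   lo  has float bits 0x4b002345  -> 2^23 + 0x2345   (exact)
  //   hi  has float bits 0x53000001  -> 2^39 + 0x10000  (exact)
  //   fhi = hi - (2^39 + 2^23)       =  0x10000 - 2^23
  //   lo + fhi = 0x2345 + 0x10000    =  0x12345 = v, rounded only in the add.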
20120
20121 bool Is128 = VecIntVT == MVT::v4i32;
20122 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20123 // If we convert to something other than the supported type, e.g., to v4f64,
20124 // abort early.
20125 if (VecFloatVT != Op->getSimpleValueType(0))
20126 return SDValue();
20127
20128 // In the #ifdef/#else code, we have in common:
20129 // - The vector of constants:
20130 // -- 0x4b000000
20131 // -- 0x53000000
20132 // - A shift:
20133 // -- v >> 16
20134
20135 // Create the splat vector for 0x4b000000.
20136 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20137 // Create the splat vector for 0x53000000.
20138 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20139
20140 // Create the right shift.
20141 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20142 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20143
20144 SDValue Low, High;
20145 if (Subtarget.hasSSE41()) {
20146 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20147 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20148 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20149 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20150 // Low will be bitcasted right away, so do not bother bitcasting back to its
20151 // original type.
20152 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20153 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20154 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20155 // (uint4) 0x53000000, 0xaa);
20156 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20157 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20158 // High will be bitcasted right away, so do not bother bitcasting back to
20159 // its original type.
20160 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20161 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20162 } else {
20163 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20164 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20165 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20166 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20167
20168 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20169 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20170 }
20171
20172 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20173 SDValue VecCstFSub = DAG.getConstantFP(
20174 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20175
20176 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20177 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20178 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20179 // enabled. See PR24512.
20180 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20181 // TODO: Are there any fast-math-flags to propagate here?
20182 // (float4) lo;
20183 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20184 // return (float4) lo + fhi;
20185 if (IsStrict) {
20186 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20187 {Op.getOperand(0), HighBitcast, VecCstFSub});
20188 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20189 {FHigh.getValue(1), LowBitcast, FHigh});
20190 }
20191
20192 SDValue FHigh =
20193 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20194 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20195}
20196
20197 static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20198 const X86Subtarget &Subtarget) {
20199 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20200 SDValue N0 = Op.getOperand(OpNo);
20201 MVT SrcVT = N0.getSimpleValueType();
20202
20203 switch (SrcVT.SimpleTy) {
20204 default:
20205 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20206 case MVT::v2i32:
20207 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20208 case MVT::v4i32:
20209 case MVT::v8i32:
20210 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20211 case MVT::v2i64:
20212 case MVT::v4i64:
20213 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20214 }
20215}
20216
20217SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20218 SelectionDAG &DAG) const {
20219 bool IsStrict = Op->isStrictFPOpcode();
20220 unsigned OpNo = IsStrict ? 1 : 0;
20221 SDValue Src = Op.getOperand(OpNo);
20222 SDLoc dl(Op);
20223 auto PtrVT = getPointerTy(DAG.getDataLayout());
20224 MVT SrcVT = Src.getSimpleValueType();
20225 MVT DstVT = Op->getSimpleValueType(0);
20226 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20227
20228 // Bail out when we don't have native conversion instructions.
20229 if (DstVT == MVT::f128)
20230 return SDValue();
20231
20232 if (isSoftF16(DstVT, Subtarget))
20233 return promoteXINT_TO_FP(Op, dl, DAG);
20234 else if (isLegalConversion(SrcVT, false, Subtarget))
20235 return Op;
20236
20237 if (DstVT.isVector())
20238 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20239
20240 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20241 return LowerWin64_INT128_TO_FP(Op, DAG);
20242
20243 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20244 return Extract;
20245
20246 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20247 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20248 // Conversions from unsigned i32 to f32/f64 are legal,
20249 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20250 return Op;
20251 }
20252
20253 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20254 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20255 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20256 if (IsStrict)
20257 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20258 {Chain, Src});
20259 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20260 }
20261
20262 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20263 return V;
20264 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20265 return V;
20266
20267 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20268 // infinity. It produces -0.0, so disable under strictfp.
20269 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20270 !IsStrict)
20271 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20272 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20273 // negative infinity. So disable under strictfp. Using FILD instead.
20274 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20275 !IsStrict)
20276 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20277 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20278 (DstVT == MVT::f32 || DstVT == MVT::f64))
20279 return SDValue();
20280
20281 // Make a 64-bit buffer, and use it to build an FILD.
20282 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20283 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20284 Align SlotAlign(8);
20285 MachinePointerInfo MPI =
20286 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20287 if (SrcVT == MVT::i32) {
20288 SDValue OffsetSlot =
20289 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20290 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20291 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20292 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20293 std::pair<SDValue, SDValue> Tmp =
20294 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20295 if (IsStrict)
20296 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20297
20298 return Tmp.first;
20299 }
20300
20301 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20302 SDValue ValueToStore = Src;
20303 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20304 // Bitcasting to f64 here allows us to do a single 64-bit store from
20305 // an SSE register, avoiding the store forwarding penalty that would come
20306 // with two 32-bit stores.
20307 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20308 }
20309 SDValue Store =
20310 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20311 // For i64 source, we need to add the appropriate power of 2 if the input
20312 // was negative. We must be careful to do the computation in x87 extended
20313 // precision, not in SSE.
20314 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20315 SDValue Ops[] = {Store, StackSlot};
20316 SDValue Fild =
20317 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20318 SlotAlign, MachineMemOperand::MOLoad);
20319 Chain = Fild.getValue(1);
20320
20321 // Check whether the sign bit is set.
20322 SDValue SignSet = DAG.getSetCC(
20323 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20324 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20325
20326 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20327 APInt FF(64, 0x5F80000000000000ULL);
20328 SDValue FudgePtr =
20329 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
20330 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20331
20332 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20333 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20334 SDValue Four = DAG.getIntPtrConstant(4, dl);
20335 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20336 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20337
20338 // Load the value out, extending it from f32 to f80.
20339 SDValue Fudge = DAG.getExtLoad(
20340 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20341 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20342 CPAlignment);
20343 Chain = Fudge.getValue(1);
20344 // Extend everything to 80 bits to force it to be done on x87.
20345 // TODO: Are there any fast-math-flags to propagate here?
20346 if (IsStrict) {
20347 unsigned Opc = ISD::STRICT_FADD;
20348 // Windows needs the precision control changed to 80bits around this add.
20349 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20350 Opc = X86ISD::STRICT_FP80_ADD;
20351
20352 SDValue Add =
20353 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20354 // STRICT_FP_ROUND can't handle equal types.
20355 if (DstVT == MVT::f80)
20356 return Add;
20357 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20358 {Add.getValue(1), Add,
20359 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20360 }
20361 unsigned Opc = ISD::FADD;
20362 // Windows needs the precision control changed to 80bits around this add.
20363 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20364 Opc = X86ISD::FP80_ADD;
20365
20366 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20367 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20368 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20369}
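// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// A scalar model of the FILD-plus-fudge path above, assuming 'long double'
// has x87 80-bit precision (e.g. x86 Linux). The u64 is converted as a
// *signed* i64, which is what FILD does, and if the sign bit was set the
// fudge factor 2^64 (the 0x5F800000 constant-pool value) is added back before
// the final rounding. Helper name is made up for illustration.
#include <cstdint>

static double uint64ToDoubleViaFILD(uint64_t V) {
  long double X = static_cast<long double>(static_cast<int64_t>(V));
  if (static_cast<int64_t>(V) < 0)
    X += 18446744073709551616.0L; // 2^64, the "fudge factor"
  return static_cast<double>(X);  // single final rounding, like FP_ROUND
}
// ----------------------------------------------------------------------------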
20370
20371// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20372// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20373// just return an SDValue().
20374// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20375// to i16, i32 or i64, and we lower it to a legal sequence and return the
20376// result.
20377SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20378 bool IsSigned,
20379 SDValue &Chain) const {
20380 bool IsStrict = Op->isStrictFPOpcode();
20381 SDLoc DL(Op);
20382
20383 EVT DstTy = Op.getValueType();
20384 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20385 EVT TheVT = Value.getValueType();
20386 auto PtrVT = getPointerTy(DAG.getDataLayout());
20387
20388 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20389 // f16 must be promoted before using the lowering in this routine.
20390 // fp128 does not use this lowering.
20391 return SDValue();
20392 }
20393
20394 // If using FIST to compute an unsigned i64, we'll need some fixup
20395 // to handle values above the maximum signed i64. A FIST is always
20396 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20397 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20398
20399 // FIXME: This does not generate an invalid exception if the input does not
20400 // fit in i32. PR44019
20401 if (!IsSigned && DstTy != MVT::i64) {
20402 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20403 // The low 32 bits of the fist result will have the correct uint32 result.
20404 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20405 DstTy = MVT::i64;
20406 }
20407
20408 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20409 DstTy.getSimpleVT() >= MVT::i16 &&
20410 "Unknown FP_TO_INT to lower!");
20411
20412 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20413 // stack slot.
20414 MachineFunction &MF = DAG.getMachineFunction();
20415 unsigned MemSize = DstTy.getStoreSize();
20416 int SSFI =
20417 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20418 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20419
20420 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20421
20422 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20423
20424 if (UnsignedFixup) {
20425 //
20426 // Conversion to unsigned i64 is implemented with a select,
20427 // depending on whether the source value fits in the range
20428 // of a signed i64. Let Thresh be the FP equivalent of
20429 // 0x8000000000000000ULL.
20430 //
20431 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
20432 // FltOfs = (Value >= Thresh) ? Thresh : 0;
20433 // FistSrc = (Value - FltOfs);
20434 // Fist-to-mem64 FistSrc
20435 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
20436 // to XOR'ing the high 32 bits with Adjust.
20437 //
20438 // Being a power of 2, Thresh is exactly representable in all FP formats.
20439 // For X87 we'd like to use the smallest FP type for this constant, but
20440 // for DAG type consistency we have to match the FP operand type.
20441
20442 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20443 APFloat::opStatus Status = APFloat::opOK;
20444 bool LosesInfo = false;
20445 if (TheVT == MVT::f64)
20446 // The rounding mode is irrelevant as the conversion should be exact.
20447 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20448 &LosesInfo);
20449 else if (TheVT == MVT::f80)
20450 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20451 APFloat::rmNearestTiesToEven, &LosesInfo);
20452
20453 assert(Status == APFloat::opOK && !LosesInfo &&
20454 "FP conversion should have been exact");
20455
20456 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20457
20458 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20459 *DAG.getContext(), TheVT);
20460 SDValue Cmp;
20461 if (IsStrict) {
20462 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20463 /*IsSignaling*/ true);
20464 Chain = Cmp.getValue(1);
20465 } else {
20466 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20467 }
20468
20469 // Our preferred lowering of
20470 //
20471 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20472 //
20473 // is
20474 //
20475 // (Value >= Thresh) << 63
20476 //
20477 // but since we can get here after LegalOperations, DAGCombine might do the
20478 // wrong thing if we create a select. So, directly create the preferred
20479 // version.
20480 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20481 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20482 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20483
20484 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20485 DAG.getConstantFP(0.0, DL, TheVT));
20486
20487 if (IsStrict) {
20488 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20489 { Chain, Value, FltOfs });
20490 Chain = Value.getValue(1);
20491 } else
20492 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20493 }
20494
20495 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20496
20497 // FIXME This causes a redundant load/store if the SSE-class value is already
20498 // in memory, such as if it is on the callstack.
20499 if (isScalarFPTypeInSSEReg(TheVT)) {
20500 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20501 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20502 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20503 SDValue Ops[] = { Chain, StackSlot };
20504
20505 unsigned FLDSize = TheVT.getStoreSize();
20506 assert(FLDSize <= MemSize && "Stack slot not big enough");
20507 MachineMemOperand *MMO = MF.getMachineMemOperand(
20508 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20509 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20510 Chain = Value.getValue(1);
20511 }
20512
20513 // Build the FP_TO_INT*_IN_MEM
20514 MachineMemOperand *MMO = MF.getMachineMemOperand(
20515 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20516 SDValue Ops[] = { Chain, Value, StackSlot };
20517 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20518 DAG.getVTList(MVT::Other),
20519 Ops, DstTy, MMO);
20520
20521 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20522 Chain = Res.getValue(1);
20523
20524 // If we need an unsigned fixup, XOR the result with adjust.
20525 if (UnsignedFixup)
20526 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20527
20528 return Res;
20529}
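// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// A scalar model of the UnsignedFixup path above, valid for inputs in
// [0, 2^64): values >= 2^63 are shifted down into signed range before the
// signed conversion (the FIST), and the stripped high bit is restored by
// XOR-ing the integer result with Adjust. Helper name is made up.
#include <cstdint>

static uint64_t doubleToUint64ViaSignedFist(double Value) {
  const double Thresh = 9223372036854775808.0;         // 2^63, exactly representable
  uint64_t Adjust = Value >= Thresh ? 0x8000000000000000ULL : 0;
  double FltOfs = Value >= Thresh ? Thresh : 0.0;
  int64_t Fist = static_cast<int64_t>(Value - FltOfs); // stands in for FIST/FISTTP
  return static_cast<uint64_t>(Fist) ^ Adjust;
}
// ----------------------------------------------------------------------------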
20530
20531static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20532 const X86Subtarget &Subtarget) {
20533 MVT VT = Op.getSimpleValueType();
20534 SDValue In = Op.getOperand(0);
20535 MVT InVT = In.getSimpleValueType();
20536 unsigned Opc = Op.getOpcode();
20537
20538 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20539 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20540 "Unexpected extension opcode");
20542 "Expected same number of elements");
20543 assert((VT.getVectorElementType() == MVT::i16 ||
20544 VT.getVectorElementType() == MVT::i32 ||
20545 VT.getVectorElementType() == MVT::i64) &&
20546 "Unexpected element type");
20547 assert((InVT.getVectorElementType() == MVT::i8 ||
20548 InVT.getVectorElementType() == MVT::i16 ||
20549 InVT.getVectorElementType() == MVT::i32) &&
20550 "Unexpected element type");
20551
20552 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20553
20554 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20555 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20556 return splitVectorIntUnary(Op, DAG, dl);
20557 }
20558
20559 if (Subtarget.hasInt256())
20560 return Op;
20561
20562 // Optimize vectors in AVX mode:
20563 //
20564 // v8i16 -> v8i32
20565 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
20566 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20567 // Concat upper and lower parts.
20568 //
20569 // v4i32 -> v4i64
20570 // Use vpmovzxdq for 4 lower elements v4i32 -> v2i64.
20571 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20572 // Concat upper and lower parts.
20573 //
20574 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20575 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20576
20577 // Short-circuit if we can determine that each 128-bit half is the same value.
20578 // Otherwise, this is difficult to match and optimize.
20579 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20580 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20581 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20582
20583 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20584 SDValue Undef = DAG.getUNDEF(InVT);
20585 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20586 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20587 OpHi = DAG.getBitcast(HalfVT, OpHi);
20588
20589 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20590}
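// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// An intrinsics model of the AVX1 zero-extend path above for v8i16 -> v8i32
// (assumes SSE4.1/AVX are available; function name is made up): zero-extend
// the low four elements with PMOVZXWD, the high four with PUNPCKHWD against
// zero, then concatenate the two 128-bit halves.
#include <immintrin.h>

static __m256i zextV8i16ToV8i32AVX1(__m128i V) {
  __m128i Lo = _mm_cvtepu16_epi32(V);                      // vpmovzxwd
  __m128i Hi = _mm_unpackhi_epi16(V, _mm_setzero_si128()); // vpunpckhwd with zero
  return _mm256_set_m128i(Hi, Lo);                         // concat upper:lower
}
// ----------------------------------------------------------------------------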
20591
20592// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20593static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20594 const SDLoc &dl, SelectionDAG &DAG) {
20595 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20596 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20597 DAG.getVectorIdxConstant(0, dl));
20598 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20599 DAG.getVectorIdxConstant(8, dl));
20600 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20601 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20602 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20603 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20604}
20605
20606static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20607 const X86Subtarget &Subtarget,
20608 SelectionDAG &DAG) {
20609 MVT VT = Op->getSimpleValueType(0);
20610 SDValue In = Op->getOperand(0);
20611 MVT InVT = In.getSimpleValueType();
20612 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20613 unsigned NumElts = VT.getVectorNumElements();
20614
20615 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20616 // avoids a constant pool load.
20617 if (VT.getVectorElementType() != MVT::i8) {
20618 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20619 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20620 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20621 }
20622
20623 // Extend VT if BWI is not supported.
20624 MVT ExtVT = VT;
20625 if (!Subtarget.hasBWI()) {
20626 // If v16i32 is to be avoided, we'll need to split and concatenate.
20627 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20628 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20629
20630 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20631 }
20632
20633 // Widen to 512-bits if VLX is not supported.
20634 MVT WideVT = ExtVT;
20635 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20636 NumElts *= 512 / ExtVT.getSizeInBits();
20637 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20638 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), In,
20639 DAG.getVectorIdxConstant(0, DL));
20640 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
20641 }
20642
20643 SDValue One = DAG.getConstant(1, DL, WideVT);
20644 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20645
20646 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20647
20648 // Truncate if we had to extend above.
20649 if (VT != ExtVT) {
20650 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20651 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20652 }
20653
20654 // Extract back to 128/256-bit if we widened.
20655 if (WideVT != VT)
20656 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20657 DAG.getVectorIdxConstant(0, DL));
20658
20659 return SelectedVal;
20660}
20661
20662static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20663 SelectionDAG &DAG) {
20664 SDValue In = Op.getOperand(0);
20665 MVT SVT = In.getSimpleValueType();
20666 SDLoc DL(Op);
20667
20668 if (SVT.getVectorElementType() == MVT::i1)
20669 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20670
20671 assert(Subtarget.hasAVX() && "Expected AVX support");
20672 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20673}
20674
20675/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20676/// It makes use of the fact that vectors with enough leading sign/zero bits
20677/// prevent the PACKSS/PACKUS from saturating the results.
20678/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20679/// within each 128-bit lane.
20680static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20681 const SDLoc &DL, SelectionDAG &DAG,
20682 const X86Subtarget &Subtarget) {
20683 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20684 "Unexpected PACK opcode");
20685 assert(DstVT.isVector() && "VT not a vector?");
20686
20687 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20688 if (!Subtarget.hasSSE2())
20689 return SDValue();
20690
20691 EVT SrcVT = In.getValueType();
20692
20693 // No truncation required, we might get here due to recursive calls.
20694 if (SrcVT == DstVT)
20695 return In;
20696
20697 unsigned NumElems = SrcVT.getVectorNumElements();
20698 if (NumElems < 2 || !isPowerOf2_32(NumElems))
20699 return SDValue();
20700
20701 unsigned DstSizeInBits = DstVT.getSizeInBits();
20702 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20703 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20704 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20705
20706 LLVMContext &Ctx = *DAG.getContext();
20707 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20708 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20709
20710 // Pack to the largest type possible:
20711 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20712 EVT InVT = MVT::i16, OutVT = MVT::i8;
20713 if (SrcVT.getScalarSizeInBits() > 16 &&
20714 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20715 InVT = MVT::i32;
20716 OutVT = MVT::i16;
20717 }
20718
20719 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20720 // On pre-AVX512, pack the src in both halves to help value tracking.
20721 if (SrcSizeInBits <= 128) {
20722 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20723 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20724 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20725 SDValue LHS = DAG.getBitcast(InVT, In);
20726 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20727 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20728 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20729 Res = DAG.getBitcast(PackedVT, Res);
20730 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20731 }
20732
20733 // Split lower/upper subvectors.
20734 SDValue Lo, Hi;
20735 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20736
20737 // If Hi is undef, then don't bother packing it and widen the result instead.
20738 if (Hi.isUndef()) {
20739 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20740 if (SDValue Res =
20741 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20742 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20743 }
20744
20745 unsigned SubSizeInBits = SrcSizeInBits / 2;
20746 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20747 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20748
20749 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20750 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20751 Lo = DAG.getBitcast(InVT, Lo);
20752 Hi = DAG.getBitcast(InVT, Hi);
20753 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20754 return DAG.getBitcast(DstVT, Res);
20755 }
20756
20757 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20758 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20759 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20760 Lo = DAG.getBitcast(InVT, Lo);
20761 Hi = DAG.getBitcast(InVT, Hi);
20762 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20763
20764 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20765 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20766 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
20767 SmallVector<int, 64> Mask;
20768 int Scale = 64 / OutVT.getScalarSizeInBits();
20769 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20770 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20771
20772 if (DstVT.is256BitVector())
20773 return DAG.getBitcast(DstVT, Res);
20774
20775 // If 512bit -> 128bit truncate another stage.
20776 Res = DAG.getBitcast(PackedVT, Res);
20777 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20778 }
20779
20780 // Recursively pack lower/upper subvectors, concat result and pack again.
20781 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20782
20783 if (PackedVT.is128BitVector()) {
20784 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20785 // type legalization.
20786 SDValue Res =
20787 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20788 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20789 }
20790
20791 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20792 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20793 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20794 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20795 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20796}
20797
20798/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20799/// e.g. trunc <8 x i32> X to <8 x i16> -->
20800/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20801/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20802static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20803 const X86Subtarget &Subtarget,
20804 SelectionDAG &DAG) {
20805 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20806 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20807}
20808
20809/// Truncate using inreg sign extension and X86ISD::PACKSS.
20810static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20811 const X86Subtarget &Subtarget,
20812 SelectionDAG &DAG) {
20813 EVT SrcVT = In.getValueType();
20814 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20815 DAG.getValueType(DstVT));
20816 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20817}
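// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// An intrinsics model of the two helpers above for a <8 x i32> -> <8 x i16>
// truncation, assuming the 256-bit source is already split into two 128-bit
// halves. The PACKUS form needs SSE4.1 (PACKUSDW); the PACKSS form only needs
// SSE2. Function names are made up for illustration.
#include <immintrin.h>

static __m128i truncV8i32ToV8i16PackUS(__m128i Lo, __m128i Hi) {
  const __m128i Mask = _mm_set1_epi32(0xFFFF);
  Lo = _mm_and_si128(Lo, Mask); // zero-extend-in-reg so the pack cannot saturate
  Hi = _mm_and_si128(Hi, Mask);
  return _mm_packus_epi32(Lo, Hi); // packusdw
}

static __m128i truncV8i32ToV8i16PackSS(__m128i Lo, __m128i Hi) {
  Lo = _mm_srai_epi32(_mm_slli_epi32(Lo, 16), 16); // sign-extend-in-reg
  Hi = _mm_srai_epi32(_mm_slli_epi32(Hi, 16), 16);
  return _mm_packs_epi32(Lo, Hi); // packssdw
}
// ----------------------------------------------------------------------------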
20818
20819/// Helper to determine if \p In truncated to \p DstVT has the necessary
20820/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20821/// possibly by converting a SRL node to SRA for sign extension.
20822static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20823 SDValue In, const SDLoc &DL,
20824 SelectionDAG &DAG,
20825 const X86Subtarget &Subtarget,
20826 const SDNodeFlags Flags = SDNodeFlags()) {
20827 // Requires SSE2.
20828 if (!Subtarget.hasSSE2())
20829 return SDValue();
20830
20831 EVT SrcVT = In.getValueType();
20832 EVT DstSVT = DstVT.getVectorElementType();
20833 EVT SrcSVT = SrcVT.getVectorElementType();
20834 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20835 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20836
20837 // Check we have a truncation suited for PACKSS/PACKUS.
20838 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20839 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20840 return SDValue();
20841
20842 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20843 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20844
20845 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20846 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20847 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20848 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20849 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20850 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20851 return SDValue();
20852
20853 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20854 // split this for packing.
20855 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20856 !isFreeToSplitVector(In.getNode(), DAG) &&
20857 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20858 return SDValue();
20859
20860 // Don't truncate on AVX512 targets using multiple stages of PACK nodes.
20861 if (Subtarget.hasAVX512() && NumStages > 1)
20862 return SDValue();
20863
20864 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20865 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20866
20867 // Truncate with PACKUS if we are truncating a vector with leading zero
20868 // bits that extend all the way to the packed/truncated value.
20869 // e.g. Masks, zext_in_reg, etc.
20870 // Pre-SSE41 we can only use PACKUSWB.
20871 KnownBits Known = DAG.computeKnownBits(In);
20872 if ((Flags.hasNoUnsignedWrap() && NumDstEltBits <= NumPackedZeroBits) ||
20873 (NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20874 PackOpcode = X86ISD::PACKUS;
20875 return In;
20876 }
20877
20878 // Truncate with PACKSS if we are truncating a vector with sign-bits
20879 // that extend all the way to the packed/truncated value.
20880 // e.g. Comparison result, sext_in_reg, etc.
20881 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20882
20883 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20884 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20885 // see through BITCASTs later on and combines/simplifications can't then use
20886 // it.
20887 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20888 !Subtarget.hasAVX512())
20889 return SDValue();
20890
20891 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20892 if (Flags.hasNoSignedWrap() || MinSignBits < NumSignBits) {
20893 PackOpcode = X86ISD::PACKSS;
20894 return In;
20895 }
20896
20897 // If we have a srl that only generates signbits that we will discard in
20898 // the truncation then we can use PACKSS by converting the srl to a sra.
20899 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
20900 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20901 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20902 if (*ShAmt == MinSignBits) {
20903 PackOpcode = X86ISD::PACKSS;
20904 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20905 }
20906 }
20907
20908 return SDValue();
20909}
20910
20911/// This function lowers a vector truncation of 'extended sign-bits' or
20912/// 'extended zero-bits' values.
20913/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20914static SDValue LowerTruncateVecPackWithSignBits(
20915 MVT DstVT, SDValue In, const SDLoc &DL, const X86Subtarget &Subtarget,
20916 SelectionDAG &DAG, const SDNodeFlags Flags = SDNodeFlags()) {
20917 MVT SrcVT = In.getSimpleValueType();
20918 MVT DstSVT = DstVT.getVectorElementType();
20919 MVT SrcSVT = SrcVT.getVectorElementType();
20920 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20921 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20922 return SDValue();
20923
20924 // If the upper half of the source is undef, then attempt to split and
20925 // only truncate the lower half.
20926 if (DstVT.getSizeInBits() >= 128) {
20927 SmallVector<SDValue> LowerOps;
20928 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20929 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20930 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20931 Subtarget, DAG))
20932 return widenSubVector(Res, false, Subtarget, DAG, DL,
20933 DstVT.getSizeInBits());
20934 }
20935 }
20936
20937 unsigned PackOpcode;
20938 if (SDValue Src = matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG,
20939 Subtarget, Flags))
20940 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20941
20942 return SDValue();
20943}
20944
20945/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20946/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20947static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20948 const X86Subtarget &Subtarget,
20949 SelectionDAG &DAG) {
20950 MVT SrcVT = In.getSimpleValueType();
20951 MVT DstSVT = DstVT.getVectorElementType();
20952 MVT SrcSVT = SrcVT.getVectorElementType();
20953 unsigned NumElems = DstVT.getVectorNumElements();
20954 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20955 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20956 NumElems >= 8))
20957 return SDValue();
20958
20959 // SSSE3's pshufb results in fewer instructions in the cases below.
20960 if (Subtarget.hasSSSE3() && NumElems == 8) {
20961 if (SrcSVT == MVT::i16)
20962 return SDValue();
20963 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20964 return SDValue();
20965 }
20966
20967 // If the upper half of the source is undef, then attempt to split and
20968 // only truncate the lower half.
20969 if (DstVT.getSizeInBits() >= 128) {
20970 SmallVector<SDValue> LowerOps;
20971 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20972 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20973 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20974 return widenSubVector(Res, false, Subtarget, DAG, DL,
20975 DstVT.getSizeInBits());
20976 }
20977 }
20978
20979 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20980 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20981 // truncate 2 x v4i32 to v8i16.
20982 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20983 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20984
20985 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20986 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20987
20988 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20989 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20990 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20991 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20992 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20993 }
20994
20995 return SDValue();
20996}
20997
20998static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20999 SelectionDAG &DAG,
21000 const X86Subtarget &Subtarget) {
21001 MVT VT = Op.getSimpleValueType();
21002 SDValue In = Op.getOperand(0);
21003 MVT InVT = In.getSimpleValueType();
21004 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
21005
21006 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
21007 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
21008 if (InVT.getScalarSizeInBits() <= 16) {
21009 if (Subtarget.hasBWI()) {
21010 // legal, will go to VPMOVB2M, VPMOVW2M
21011 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21012 // We need to shift to get the lsb into sign position.
21013 // Shift packed bytes not supported natively, bitcast to word
21014 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
21015 In = DAG.getNode(ISD::SHL, DL, ExtVT,
21016 DAG.getBitcast(ExtVT, In),
21017 DAG.getConstant(ShiftInx, DL, ExtVT));
21018 In = DAG.getBitcast(InVT, In);
21019 }
21020 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21021 In, ISD::SETGT);
21022 }
21023 // Use TESTD/Q, extended vector to packed dword/qword.
21024 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21025 "Unexpected vector type.");
21026 unsigned NumElts = InVT.getVectorNumElements();
21027 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21028 // We need to change to a wider element type that we have support for.
21029 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21030 // For 16 element vectors we extend to v16i32 unless we are explicitly
21031 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21032 // we need to split into two 8 element vectors which we can extend to v8i32,
21033 // truncate and concat the results. There's an additional complication if
21034 // the original type is v16i8. In that case we can't split the v16i8
21035 // directly, so we need to shuffle high elements to low and use
21036 // sign_extend_vector_inreg.
21037 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21038 SDValue Lo, Hi;
21039 if (InVT == MVT::v16i8) {
21040 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21041 Hi = DAG.getVectorShuffle(
21042 InVT, DL, In, In,
21043 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21044 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21045 } else {
21046 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21047 Lo = extract128BitVector(In, 0, DAG, DL);
21048 Hi = extract128BitVector(In, 8, DAG, DL);
21049 }
21050 // We're split now, just emit two truncates and a concat. The two
21051 // truncates will trigger legalization to come back to this function.
21052 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21053 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21054 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21055 }
21056 // We either have 8 elements or we're allowed to use 512-bit vectors.
21057 // If we have VLX, we want to use the narrowest vector that can get the
21058 // job done so we use vXi32.
21059 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21060 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21061 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21062 InVT = ExtVT;
21063 ShiftInx = InVT.getScalarSizeInBits() - 1;
21064 }
21065
21066 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21067 // We need to shift to get the lsb into sign position.
21068 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21069 DAG.getConstant(ShiftInx, DL, InVT));
21070 }
21071 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21072 if (Subtarget.hasDQI())
21073 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21074 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21075}
21076
21077SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21078 SDLoc DL(Op);
21079 MVT VT = Op.getSimpleValueType();
21080 SDValue In = Op.getOperand(0);
21081 MVT InVT = In.getSimpleValueType();
21083 "Invalid TRUNCATE operation");
21084
21085 // If we're called by the type legalizer, handle a few cases.
21086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21087 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21088 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21089 VT.is128BitVector() && Subtarget.hasAVX512()) {
21090 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21091 "Unexpected subtarget!");
21092 // The default behavior is to truncate one step, concatenate, and then
21093 // truncate the remainder. We'd rather produce two 64-bit results and
21094 // concatenate those.
21095 SDValue Lo, Hi;
21096 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21097
21098 EVT LoVT, HiVT;
21099 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21100
21101 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21102 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21103 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21104 }
21105
21106 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21107 if (!Subtarget.hasAVX512() ||
21108 (InVT.is512BitVector() && VT.is256BitVector()))
21109 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21110 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21111 return SignPack;
21112
21113 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21114 if (!Subtarget.hasAVX512())
21115 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21116
21117 // Otherwise let default legalization handle it.
21118 return SDValue();
21119 }
21120
21121 if (VT.getVectorElementType() == MVT::i1)
21122 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21123
21124 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21125 // concat from subvectors to use VPTRUNC etc.
21126 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21127 if (SDValue SignPack = LowerTruncateVecPackWithSignBits(
21128 VT, In, DL, Subtarget, DAG, Op->getFlags()))
21129 return SignPack;
21130
21131 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21132 if (Subtarget.hasAVX512()) {
21133 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21134 assert(VT == MVT::v32i8 && "Unexpected VT!");
21135 return splitVectorIntUnary(Op, DAG, DL);
21136 }
21137
21138 // Word to byte is only supported under BWI. Otherwise we have to promote to v16i32
21139 // and then truncate that. But we should only do that if we haven't been
21140 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21141 // handled by isel patterns.
21142 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21143 Subtarget.canExtendTo512DQ())
21144 return Op;
21145 }
21146
21147 // Handle truncation of V256 to V128 using shuffles.
21148 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21149
21150 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21151 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21152 if (Subtarget.hasInt256()) {
21153 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21154 In = DAG.getBitcast(MVT::v8i32, In);
21155 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21156 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21157 DAG.getVectorIdxConstant(0, DL));
21158 }
21159
21160 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21161 DAG.getVectorIdxConstant(0, DL));
21162 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21163 DAG.getVectorIdxConstant(2, DL));
21164 static const int ShufMask[] = {0, 2, 4, 6};
21165 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21166 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21167 }
21168
21169 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21170 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21171 if (Subtarget.hasInt256()) {
21172 // The PSHUFB mask:
21173 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21174 -1, -1, -1, -1, -1, -1, -1, -1,
21175 16, 17, 20, 21, 24, 25, 28, 29,
21176 -1, -1, -1, -1, -1, -1, -1, -1 };
21177 In = DAG.getBitcast(MVT::v32i8, In);
21178 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21179 In = DAG.getBitcast(MVT::v4i64, In);
21180
21181 static const int ShufMask2[] = {0, 2, -1, -1};
21182 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21183 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21184 DAG.getVectorIdxConstant(0, DL));
21185 return DAG.getBitcast(MVT::v8i16, In);
21186 }
21187
21188 return Subtarget.hasSSE41()
21189 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21190 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21191 }
21192
21193 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21194 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21195
21196 llvm_unreachable("All 256->128 cases should have been handled above!");
21197}
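// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// An intrinsics model of the AVX2 v4i64 -> v4i32 path in LowerTRUNCATE above
// (assumes AVX2; function name is made up): view the source as eight i32
// lanes, gather the even (low) dwords with VPERMD, then keep the lower
// 128 bits.
#include <immintrin.h>

static __m128i truncV4i64ToV4i32AVX2(__m256i V) {
  const __m256i Idx = _mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
  __m256i Shuffled = _mm256_permutevar8x32_epi32(V, Idx); // vpermd
  return _mm256_castsi256_si128(Shuffled);
}
// ----------------------------------------------------------------------------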
21198
21199// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21200// behaves on out of range inputs to generate optimized conversions.
21201static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21202 SelectionDAG &DAG,
21203 const X86Subtarget &Subtarget) {
21204 MVT SrcVT = Src.getSimpleValueType();
21205 unsigned DstBits = VT.getScalarSizeInBits();
21206 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21207
21208 // Calculate the converted result for values in the range 0 to
21209 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21210 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21211 SDValue Big =
21212 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21213 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21214 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21215
21216 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21217 // and only if the value was out of range. So we can use that
21218 // as our indicator that we should use "Big" instead of "Small".
21219 //
21220 // Use "Small" if "IsOverflown" has all bits cleared
21221 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21222
21223 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21224 // use the slightly slower blendv select instead.
21225 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21226 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21227 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21228 }
21229
21230 SDValue IsOverflown =
21231 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21232 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21233 return DAG.getNode(ISD::OR, dl, VT, Small,
21234 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21235}
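// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// An intrinsics model of expandFP_TO_UINT_SSE for v4f32 -> v4i32 on plain
// SSE2 (function name is made up): CVTTPS2DQ returns 0x80000000 for
// out-of-range inputs, so the sign bit of "Small" tells us whether to merge
// in the "Big" result that was biased down by 2^31.
#include <immintrin.h>

static __m128i fpToUintV4f32SSE2(__m128 Src) {
  __m128i Small = _mm_cvttps_epi32(Src);            // cvttps2dq
  __m128i Big = _mm_cvttps_epi32(
      _mm_sub_ps(Src, _mm_set1_ps(2147483648.0f))); // Src - 2^31
  __m128i IsOverflown = _mm_srai_epi32(Small, 31);  // all-ones iff out of range
  return _mm_or_si128(Small, _mm_and_si128(Big, IsOverflown));
}
// ----------------------------------------------------------------------------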
21236
21237SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21238 bool IsStrict = Op->isStrictFPOpcode();
21239 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21240 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21241 MVT VT = Op->getSimpleValueType(0);
21242 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21243 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21244 MVT SrcVT = Src.getSimpleValueType();
21245 SDLoc dl(Op);
21246
21247 SDValue Res;
21248 if (isSoftF16(SrcVT, Subtarget)) {
21249 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21250 if (IsStrict)
21251 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21252 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21253 {NVT, MVT::Other}, {Chain, Src})});
21254 return DAG.getNode(Op.getOpcode(), dl, VT,
21255 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21256 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21257 return Op;
21258 }
21259
21260 if (VT.isVector()) {
21261 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21262 MVT ResVT = MVT::v4i32;
21263 MVT TruncVT = MVT::v4i1;
21264 unsigned Opc;
21265 if (IsStrict)
21266 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21267 else
21268 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21269
21270 if (!IsSigned && !Subtarget.hasVLX()) {
21271 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21272 // Widen to 512-bits.
21273 ResVT = MVT::v8i32;
21274 TruncVT = MVT::v8i1;
21275 Opc = Op.getOpcode();
21276 // Need to concat with zero vector for strict fp to avoid spurious
21277 // exceptions.
21278 // TODO: Should we just do this for non-strict as well?
21279 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21280 : DAG.getUNDEF(MVT::v8f64);
21281 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21282 DAG.getVectorIdxConstant(0, dl));
21283 }
21284 if (IsStrict) {
21285 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21286 Chain = Res.getValue(1);
21287 } else {
21288 Res = DAG.getNode(Opc, dl, ResVT, Src);
21289 }
21290
21291 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21292 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21293 DAG.getVectorIdxConstant(0, dl));
21294 if (IsStrict)
21295 return DAG.getMergeValues({Res, Chain}, dl);
21296 return Res;
21297 }
21298
21299 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21300 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21301 return Op;
21302
21303 MVT ResVT = VT;
21304 MVT EleVT = VT.getVectorElementType();
21305 if (EleVT != MVT::i64)
21306 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21307
21308 if (SrcVT != MVT::v8f16) {
21309 SDValue Tmp =
21310 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21311 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21312 Ops[0] = Src;
21313 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21314 }
21315
21316 if (IsStrict) {
21317 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21318 : X86ISD::STRICT_CVTTP2UI,
21319 dl, {ResVT, MVT::Other}, {Chain, Src});
21320 Chain = Res.getValue(1);
21321 } else {
21322 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21323 ResVT, Src);
21324 }
21325
21326 // TODO: Need to add exception check code for strict FP.
21327 if (EleVT.getSizeInBits() < 16) {
21328 ResVT = MVT::getVectorVT(EleVT, 8);
21329 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21330 }
21331
21332 if (ResVT != VT)
21333 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21334 DAG.getVectorIdxConstant(0, dl));
21335
21336 if (IsStrict)
21337 return DAG.getMergeValues({Res, Chain}, dl);
21338 return Res;
21339 }
21340
21341 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21342 if (VT.getVectorElementType() == MVT::i16) {
21343 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21344 SrcVT.getVectorElementType() == MVT::f64) &&
21345 "Expected f32/f64 vector!");
21346 MVT NVT = VT.changeVectorElementType(MVT::i32);
21347 if (IsStrict) {
21348 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21349 : ISD::STRICT_FP_TO_UINT,
21350 dl, {NVT, MVT::Other}, {Chain, Src});
21351 Chain = Res.getValue(1);
21352 } else {
21353 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21354 NVT, Src);
21355 }
21356
21357 // TODO: Need to add exception check code for strict FP.
21358 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21359
21360 if (IsStrict)
21361 return DAG.getMergeValues({Res, Chain}, dl);
21362 return Res;
21363 }
21364
21365 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21366 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21367 assert(!IsSigned && "Expected unsigned conversion!");
21368 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21369 return Op;
21370 }
21371
21372 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21373 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21374 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21375 Subtarget.useAVX512Regs()) {
21376 assert(!IsSigned && "Expected unsigned conversion!");
21377 assert(!Subtarget.hasVLX() && "Unexpected features!");
21378 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21379 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21380 // Need to concat with zero vector for strict fp to avoid spurious
21381 // exceptions.
21382 // TODO: Should we just do this for non-strict as well?
21383 SDValue Tmp =
21384 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21385 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21386 DAG.getVectorIdxConstant(0, dl));
21387
21388 if (IsStrict) {
21389 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21390 {Chain, Src});
21391 Chain = Res.getValue(1);
21392 } else {
21393 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21394 }
21395
21396 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21397 DAG.getVectorIdxConstant(0, dl));
21398
21399 if (IsStrict)
21400 return DAG.getMergeValues({Res, Chain}, dl);
21401 return Res;
21402 }
21403
21404 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21405 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21406 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21407 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21408 assert(!Subtarget.hasVLX() && "Unexpected features!");
21409 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21410 // Need to concat with zero vector for strict fp to avoid spurious
21411 // exceptions.
21412 // TODO: Should we just do this for non-strict as well?
21413 SDValue Tmp =
21414 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21415 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21416 DAG.getVectorIdxConstant(0, dl));
21417
21418 if (IsStrict) {
21419 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21420 {Chain, Src});
21421 Chain = Res.getValue(1);
21422 } else {
21423 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21424 }
21425
21426 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21427 DAG.getVectorIdxConstant(0, dl));
21428
21429 if (IsStrict)
21430 return DAG.getMergeValues({Res, Chain}, dl);
21431 return Res;
21432 }
21433
21434 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21435 if (!Subtarget.hasVLX()) {
21436 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
21437 // legalizer and then widened again by vector op legalization.
21438 if (!IsStrict)
21439 return SDValue();
21440
21441 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21442 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21443 {Src, Zero, Zero, Zero});
21444 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21445 {Chain, Tmp});
21446 SDValue Chain = Tmp.getValue(1);
21447 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21448 DAG.getVectorIdxConstant(0, dl));
21449 return DAG.getMergeValues({Tmp, Chain}, dl);
21450 }
21451
21452 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21453 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21454 DAG.getUNDEF(MVT::v2f32));
21455 if (IsStrict) {
21456 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21457 : X86ISD::STRICT_CVTTP2UI;
21458 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21459 }
21460 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21461 return DAG.getNode(Opc, dl, VT, Tmp);
21462 }
21463
21464 // Generate optimized instructions for pre AVX512 unsigned conversions from
21465 // vXf32 to vXi32.
21466 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21467 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21468 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21469 assert(!IsSigned && "Expected unsigned conversion!");
21470 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21471 }
21472
21473 return SDValue();
21474 }
21475
21476 assert(!VT.isVector());
21477
21478 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21479
21480 if (!IsSigned && UseSSEReg) {
21481 // Conversions from f32/f64 with AVX512 should be legal.
21482 if (Subtarget.hasAVX512())
21483 return Op;
21484
21485 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21486 // behaves on out of range inputs to generate optimized conversions.
21487 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21488 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21489 unsigned DstBits = VT.getScalarSizeInBits();
21490 APInt UIntLimit = APInt::getSignMask(DstBits);
21491 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21492 DAG.getConstant(UIntLimit, dl, VT));
21493 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21494
21495 // Calculate the converted result for values in the range:
21496 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21497 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21498 SDValue Small =
21499 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21500 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21501 SDValue Big = DAG.getNode(
21502 X86ISD::CVTTS2SI, dl, VT,
21503 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21504 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21505
21506 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21507 // and only if the value was out of range. So we can use that
21508 // as our indicator that we should use "Big" instead of "Small".
21509 //
21510 // Use "Small" if "IsOverflown" has all bits cleared
21511 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21512 SDValue IsOverflown = DAG.getNode(
21513 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21514 return DAG.getNode(ISD::OR, dl, VT, Small,
21515 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21516 }
21517
21518 // Use default expansion for i64.
21519 if (VT == MVT::i64)
21520 return SDValue();
21521
21522 assert(VT == MVT::i32 && "Unexpected VT!");
21523
21524 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21525 // FIXME: This does not generate an invalid exception if the input does not
21526 // fit in i32. PR44019
21527 if (Subtarget.is64Bit()) {
21528 if (IsStrict) {
21529 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21530 {Chain, Src});
21531 Chain = Res.getValue(1);
21532 } else
21533 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21534
21535 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21536 if (IsStrict)
21537 return DAG.getMergeValues({Res, Chain}, dl);
21538 return Res;
21539 }
21540
21541 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21542 // use fisttp which will be handled later.
21543 if (!Subtarget.hasSSE3())
21544 return SDValue();
21545 }
21546
21547 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21548 // FIXME: This does not generate an invalid exception if the input does not
21549 // fit in i16. PR44019
21550 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21551 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21552 if (IsStrict) {
21553 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21554 {Chain, Src});
21555 Chain = Res.getValue(1);
21556 } else
21557 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21558
21559 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21560 if (IsStrict)
21561 return DAG.getMergeValues({Res, Chain}, dl);
21562 return Res;
21563 }
21564
21565 // If this is a FP_TO_SINT using SSEReg we're done.
21566 if (UseSSEReg && IsSigned)
21567 return Op;
21568
21569 // fp128 needs to use a libcall.
21570 if (SrcVT == MVT::f128) {
21571 RTLIB::Libcall LC;
21572 if (IsSigned)
21573 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21574 else
21575 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21576
21577 MakeLibCallOptions CallOptions;
21578 std::pair<SDValue, SDValue> Tmp =
21579 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21580
21581 if (IsStrict)
21582 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21583
21584 return Tmp.first;
21585 }
21586
21587 // Fall back to X87.
21588 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21589 if (IsStrict)
21590 return DAG.getMergeValues({V, Chain}, dl);
21591 return V;
21592 }
21593
21594 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21595}
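// --- Illustrative sketch (not part of X86ISelLowering.cpp) -----------------
// An intrinsics model of the scalar CVTTS2SI trick above for f64 -> u64 on a
// 64-bit target (assumes SSE2 and x86-64; function name is made up): an
// out-of-range cvttsd2si yields 0x8000000000000000, so the sign bit of
// "Small" selects whether to merge in the "Big" result biased down by 2^63.
#include <immintrin.h>
#include <cstdint>

static uint64_t doubleToUint64CvttTrick(double X) {
  __m128d V = _mm_set_sd(X);
  int64_t Small = _mm_cvttsd_si64(V);                    // cvttsd2si
  int64_t Big = _mm_cvttsd_si64(
      _mm_sub_sd(V, _mm_set_sd(9223372036854775808.0))); // X - 2^63
  int64_t IsOverflown = Small >> 63;                     // sign splat
  return static_cast<uint64_t>(Small | (Big & IsOverflown));
}
// ----------------------------------------------------------------------------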
21596
21597SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21598 SelectionDAG &DAG) const {
21599 SDValue Src = Op.getOperand(0);
21600 EVT DstVT = Op.getSimpleValueType();
21601 MVT SrcVT = Src.getSimpleValueType();
21602
21603 if (SrcVT.isVector())
21604 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21605
21606 if (SrcVT == MVT::f16)
21607 return SDValue();
21608
21609 // If the source is in an SSE register, the node is Legal.
21610 if (isScalarFPTypeInSSEReg(SrcVT))
21611 return Op;
21612
21613 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21614}
21615
21616SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21617 SelectionDAG &DAG) const {
21618 EVT DstVT = N->getValueType(0);
21619 SDValue Src = N->getOperand(0);
21620 EVT SrcVT = Src.getValueType();
21621
21622 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21623 // f16 must be promoted before using the lowering in this routine.
21624 // fp128 does not use this lowering.
21625 return SDValue();
21626 }
21627
21628 SDLoc DL(N);
21629 SDValue Chain = DAG.getEntryNode();
21630
21631 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21632
21633 // If we're converting from SSE, the stack slot needs to hold both types.
21634 // Otherwise it only needs to hold the DstVT.
21635 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21636 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21637 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21638 MachinePointerInfo MPI =
21639 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21640
21641 if (UseSSE) {
21642 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21643 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21644 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21645 SDValue Ops[] = { Chain, StackPtr };
21646