1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
16#include "X86.h"
17#include "X86FrameLowering.h"
18#include "X86InstrBuilder.h"
19#include "X86IntrinsicsInfo.h"
21#include "X86TargetMachine.h"
23#include "llvm/ADT/SmallSet.h"
24#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/CallingConv.h"
42#include "llvm/IR/Constants.h"
45#include "llvm/IR/Function.h"
46#include "llvm/IR/GlobalAlias.h"
48#include "llvm/IR/IRBuilder.h"
50#include "llvm/IR/Intrinsics.h"
52#include "llvm/MC/MCAsmInfo.h"
53#include "llvm/MC/MCContext.h"
54#include "llvm/MC/MCExpr.h"
55#include "llvm/MC/MCSymbol.h"
57#include "llvm/Support/Debug.h"
62#include <algorithm>
63#include <bitset>
64#include <cctype>
65#include <numeric>
66using namespace llvm;
67
68#define DEBUG_TYPE "x86-isel"
69
71 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
73 "Sets the preferable loop alignment for experiments (as log2 bytes) "
74 "for innermost loops only. If specified, this option overrides "
75 "alignment set by x86-experimental-pref-loop-alignment."),
77
79 "x86-br-merging-base-cost", cl::init(2),
81 "Sets the cost threshold for when multiple conditionals will be merged "
82 "into one branch versus be split in multiple branches. Merging "
83 "conditionals saves branches at the cost of additional instructions. "
84 "This value sets the instruction cost limit, below which conditionals "
85 "will be merged, and above which conditionals will be split. Set to -1 "
86 "to never merge branches."),
88
90 "x86-br-merging-ccmp-bias", cl::init(6),
91 cl::desc("Increases 'x86-br-merging-base-cost' in cases that the target "
92 "supports conditional compare instructions."),
94
95static cl::opt<bool>
96 WidenShift("x86-widen-shift", cl::init(true),
97 cl::desc("Replace narrow shifts with wider shifts."),
98 cl::Hidden);
99
101 "x86-br-merging-likely-bias", cl::init(0),
102 cl::desc("Increases 'x86-br-merging-base-cost' in cases that it is likely "
103 "that all conditionals will be executed. For example for merging "
104 "the conditionals (a == b && c > d), if its known that a == b is "
105 "likely, then it is likely that if the conditionals are split "
106 "both sides will be executed, so it may be desirable to increase "
107 "the instruction cost threshold. Set to -1 to never merge likely "
108 "branches."),
109 cl::Hidden);
110
112 "x86-br-merging-unlikely-bias", cl::init(-1),
113 cl::desc(
114 "Decreases 'x86-br-merging-base-cost' in cases that it is unlikely "
115 "that all conditionals will be executed. For example for merging "
116 "the conditionals (a == b && c > d), if its known that a == b is "
117 "unlikely, then it is unlikely that if the conditionals are split "
118 "both sides will be executed, so it may be desirable to decrease "
119 "the instruction cost threshold. Set to -1 to never merge unlikely "
120 "branches."),
121 cl::Hidden);
122
124 "mul-constant-optimization", cl::init(true),
125 cl::desc("Replace 'mul x, Const' with more effective instructions like "
126 "SHIFT, LEA, etc."),
127 cl::Hidden);
128
129X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
130 const X86Subtarget &STI)
131 : TargetLowering(TM), Subtarget(STI) {
132 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
133 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
134
135 // Set up the TargetLowering object.
136
137 // X86 is weird. It always uses i8 for shift amounts and setcc results.
139 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
141
142 // X86 instruction cache is coherent with its data cache so we can use the
143 // default expansion to a no-op.
145
146 // For 64-bit, since we have so many registers, use the ILP scheduler.
147 // For 32-bit, use the register pressure specific scheduling.
148 // For Atom, always use ILP scheduling.
149 if (Subtarget.isAtom())
151 else if (Subtarget.is64Bit())
153 else
155 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
157
158 // Bypass expensive divides and use cheaper ones.
159 if (TM.getOptLevel() >= CodeGenOptLevel::Default) {
160 if (Subtarget.hasSlowDivide32())
161 addBypassSlowDiv(32, 8);
162 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
163 addBypassSlowDiv(64, 32);
164 }
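  // Roughly, "bypassing" a slow divide means the wide divide is guarded by a
  // runtime operand-width check; e.g. for the (64, 32) bypass above:
  //   if (((x | y) >> 32) == 0)          // both operands fit in 32 bits
  //     q = (uint32_t)x / (uint32_t)y;   // cheap 32-bit DIV
  //   else
  //     q = x / y;                       // full 64-bit DIV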
165
166 // Set up Windows compiler runtime calls.
167 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
168 static const struct {
169 const RTLIB::Libcall Op;
170 const char * const Name;
171 const CallingConv::ID CC;
172 } LibraryCalls[] = {
173 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
174 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
175 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
176 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
177 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
178 };
179
180 for (const auto &LC : LibraryCalls) {
181 setLibcallName(LC.Op, LC.Name);
182 setLibcallCallingConv(LC.Op, LC.CC);
183 }
184 }
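  // With the table above, e.g. a 64-bit signed division on 32-bit Windows is
  // no longer expanded to the default libcall name; roughly:
  //   %q = sdiv i64 %a, %b  -->  call x86_stdcallcc i64 @_alldiv(i64 %a, i64 %b)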
185
186 if (Subtarget.canUseCMPXCHG16B())
188 else if (Subtarget.canUseCMPXCHG8B())
190 else
192
193 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
194
196
197 // Set up the register classes.
198 addRegisterClass(MVT::i8, &X86::GR8RegClass);
199 addRegisterClass(MVT::i16, &X86::GR16RegClass);
200 addRegisterClass(MVT::i32, &X86::GR32RegClass);
201 if (Subtarget.is64Bit())
202 addRegisterClass(MVT::i64, &X86::GR64RegClass);
203
204 for (MVT VT : MVT::integer_valuetypes())
206
207 // We don't accept any truncstore of integer registers.
208 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
209 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
211 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
212 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
213 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
214
215 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
216
217 // SETOEQ and SETUNE require checking two conditions.
218 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
221 }
222
223 // Integer absolute.
224 if (Subtarget.canUseCMOV()) {
225 setOperationAction(ISD::ABS , MVT::i16 , Custom);
226 setOperationAction(ISD::ABS , MVT::i32 , Custom);
227 if (Subtarget.is64Bit())
228 setOperationAction(ISD::ABS , MVT::i64 , Custom);
229 }
230
231 // Absolute difference.
232 for (auto Op : {ISD::ABDS, ISD::ABDU}) {
233 setOperationAction(Op , MVT::i8 , Custom);
234 setOperationAction(Op , MVT::i16 , Custom);
235 setOperationAction(Op , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(Op , MVT::i64 , Custom);
238 }
239
240 // Signed saturation subtraction.
244 if (Subtarget.is64Bit())
246
247 // Funnel shifts.
248 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
250 // For targets where SHLD is slow, we only lower to it for code size.
250 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
251
252 setOperationAction(ShiftOp , MVT::i8 , Custom);
253 setOperationAction(ShiftOp , MVT::i16 , Custom);
254 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
255 if (Subtarget.is64Bit())
256 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
257 }
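  // In other words, i32/i64 funnel shifts can map onto SHLD/SHRD (kept Legal
  // only when SHLD is fast), while the i8/i16 cases are always custom lowered
  // to other shift/or sequences.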
258
259 if (!Subtarget.useSoftFloat()) {
260 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
261 // operation.
266 // We have an algorithm for SSE2, and we turn this into a 64-bit
267 // FILD or VCVTUSI2SS/SD for other targets.
270 // We have an algorithm for SSE2->double, and we turn this into a
271 // 64-bit FILD followed by conditional FADD for other targets.
274
275 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
276 // this operation.
279 // SSE has no i16 to fp conversion, only i32. We promote in the handler
280 // to allow f80 to use i16 and f64 to use i16 with sse1 only
283 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
286 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
287 // are Legal, f80 is custom lowered.
290
291 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
292 // this operation.
294 // FIXME: This doesn't generate invalid exception when it should. PR44019.
300 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
301 // are Legal, f80 is custom lowered.
304
305 // Handle FP_TO_UINT by promoting the destination to a larger signed
306 // conversion.
308 // FIXME: This doesn't generate invalid exception when it should. PR44019.
311 // FIXME: This doesn't generate invalid exception when it should. PR44019.
317
322
323 if (!Subtarget.is64Bit()) {
326 }
327 }
328
329 if (Subtarget.hasSSE2()) {
330 // Custom lowering for saturating float to int conversions.
331 // We handle promotion to larger result types manually.
332 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
335 }
337 if (Subtarget.is64Bit()) {
341 }
342 }
343 if (Subtarget.hasAVX10_2()) {
346 for (MVT VT : {MVT::i32, MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
347 MVT::v4i64}) {
350 }
351 if (Subtarget.hasAVX10_2_512()) {
354 }
355 if (Subtarget.is64Bit()) {
358 }
359 }
360
361 // Handle address space casts between mixed sized pointers.
364
365 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
366 if (!Subtarget.hasSSE2()) {
372 if (Subtarget.is64Bit()) {
374 // Without SSE, i64->f64 goes through memory.
376 }
377 } else if (!Subtarget.is64Bit())
379
380 // Scalar integer divide and remainder are lowered to use operations that
381 // produce two results, to match the available instructions. This exposes
382 // the two-result form to trivial CSE, which is able to combine x/y and x%y
383 // into a single instruction.
384 //
385 // Scalar integer multiply-high is also lowered to use two-result
386 // operations, to match the available instructions. However, plain multiply
387 // (low) operations are left as Legal, as there are single-result
388 // instructions for this in x86. Using the two-result multiply instructions
389 // when both high and low results are needed must be arranged by dagcombine.
390 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
397 }
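  // For example, a source-level pair such as:
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // becomes a single two-result divide/remainder node once CSE merges them,
  // which then selects to one IDIV (quotient in EAX, remainder in EDX).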
398
399 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
401 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
402 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
405 }
406 if (Subtarget.is64Bit())
411
412 setOperationAction(ISD::FREM , MVT::f32 , Expand);
413 setOperationAction(ISD::FREM , MVT::f64 , Expand);
414 setOperationAction(ISD::FREM , MVT::f80 , Expand);
415 setOperationAction(ISD::FREM , MVT::f128 , Expand);
416
417 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
423 }
424
425 // Promote the i8 variants and force them on up to i32 which has a shorter
426 // encoding.
427 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
429 // Promote i16 as well: tzcntw has a false dependency on Intel CPUs, and for
430 // BSF we emit a REP prefix to encode it as TZCNT on modern CPUs, so it makes
431 // sense to promote that too.
432 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
434
435 if (!Subtarget.hasBMI()) {
436 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
438 if (Subtarget.is64Bit()) {
439 setOperationPromotedToType(ISD::CTTZ , MVT::i32, MVT::i64);
440 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
442 }
443 }
444
445 if (Subtarget.hasLZCNT()) {
446 // When promoting the i8 variants, force them to i32 for a shorter
447 // encoding.
448 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
450 } else {
451 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
452 if (VT == MVT::i64 && !Subtarget.is64Bit())
453 continue;
456 }
457 }
458
461 // Special handling for half-precision floating point conversions.
462 // If we don't have F16C support, then lower half float conversions
463 // into library calls.
465 Op, MVT::f32,
466 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
467 // There's never any support for operations beyond MVT::f32.
468 setOperationAction(Op, MVT::f64, Expand);
469 setOperationAction(Op, MVT::f80, Expand);
470 setOperationAction(Op, MVT::f128, Expand);
471 }
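  // Concretely: without F16C, an f16 <-> f32 conversion ends up as a call to
  // the __extendhfsf2 / __truncsfhf2 routines configured further below, and
  // f64/f80/f128 conversions are expanded to go through f32 first.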
472
473 for (auto VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
476 }
477
478 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
479 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
480 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
481 setTruncStoreAction(VT, MVT::f16, Expand);
482 setTruncStoreAction(VT, MVT::bf16, Expand);
483
486 }
487
491 if (Subtarget.is64Bit())
493 if (Subtarget.hasPOPCNT()) {
494 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
495 // popcntw is longer to encode than popcntl and also has a false dependency
496 // on the dest that popcntl hasn't had since Cannon Lake.
497 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
498 } else {
503 }
504
506
507 if (!Subtarget.hasMOVBE())
509
510 // X86 wants to expand cmov itself.
511 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
516 }
517 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
518 if (VT == MVT::i64 && !Subtarget.is64Bit())
519 continue;
522 }
523
524 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
527
529 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
530 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
534 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
535 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
536
537 // Darwin ABI issue.
538 for (auto VT : { MVT::i32, MVT::i64 }) {
539 if (VT == MVT::i64 && !Subtarget.is64Bit())
540 continue;
547 }
548
549 // 64-bit shl, sra, srl (iff 32-bit x86)
550 for (auto VT : { MVT::i32, MVT::i64 }) {
551 if (VT == MVT::i64 && !Subtarget.is64Bit())
552 continue;
556 }
557
558 if (Subtarget.hasSSEPrefetch())
560
562
563 // Expand certain atomics
564 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
572 }
573
574 if (!Subtarget.is64Bit())
576
577 if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
578 // All CPUs supporting AVX will atomically load/store aligned 128-bit
579 // values, so we can emit [V]MOVAPS/[V]MOVDQA.
582 }
583
584 if (Subtarget.canUseCMPXCHG16B())
586
587 // FIXME - use subtarget debug flags
588 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
589 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
590 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
592 }
593
596
599
600 setOperationAction(ISD::TRAP, MVT::Other, Legal);
602 if (Subtarget.isTargetPS())
604 else
606
607 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
609 setOperationAction(ISD::VAEND , MVT::Other, Expand);
610 bool Is64Bit = Subtarget.is64Bit();
611 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
612 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
613
616
618
619 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
622
624
625 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
626 setOperationAction(ISD::FABS, VT, Action);
627 setOperationAction(ISD::FNEG, VT, Action);
629 setOperationAction(ISD::FREM, VT, Action);
630 setOperationAction(ISD::FMA, VT, Action);
631 setOperationAction(ISD::FMINNUM, VT, Action);
632 setOperationAction(ISD::FMAXNUM, VT, Action);
637 setOperationAction(ISD::FSIN, VT, Action);
638 setOperationAction(ISD::FCOS, VT, Action);
639 setOperationAction(ISD::FSINCOS, VT, Action);
640 setOperationAction(ISD::FTAN, VT, Action);
641 setOperationAction(ISD::FSQRT, VT, Action);
642 setOperationAction(ISD::FPOW, VT, Action);
643 setOperationAction(ISD::FPOWI, VT, Action);
644 setOperationAction(ISD::FLOG, VT, Action);
645 setOperationAction(ISD::FLOG2, VT, Action);
646 setOperationAction(ISD::FLOG10, VT, Action);
647 setOperationAction(ISD::FEXP, VT, Action);
648 setOperationAction(ISD::FEXP2, VT, Action);
649 setOperationAction(ISD::FEXP10, VT, Action);
650 setOperationAction(ISD::FCEIL, VT, Action);
651 setOperationAction(ISD::FFLOOR, VT, Action);
653 setOperationAction(ISD::FRINT, VT, Action);
654 setOperationAction(ISD::BR_CC, VT, Action);
655 setOperationAction(ISD::SETCC, VT, Action);
658 setOperationAction(ISD::FROUND, VT, Action);
660 setOperationAction(ISD::FTRUNC, VT, Action);
661 setOperationAction(ISD::FLDEXP, VT, Action);
662 };
663
664 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
665 // f16, f32 and f64 use SSE.
666 // Set up the FP register classes.
667 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
668 : &X86::FR16RegClass);
669 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
670 : &X86::FR32RegClass);
671 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
672 : &X86::FR64RegClass);
673
674 // Disable f32->f64 extload as we can only generate this in one instruction
675 // under optsize. So it's easier to pattern match (fpext (load)) for that
676 // case instead of needing to emit 2 instructions for extload in the
677 // non-optsize case.
678 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
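  // i.e. by default we prefer the two-instruction form
  //   movss (%mem), %xmm0
  //   cvtss2sd %xmm0, %xmm0
  // and only fold the load into cvtss2sd when optimizing for size (the folded
  // form only writes the low part of the destination register, leaving a
  // partial-register dependency).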
679
680 for (auto VT : { MVT::f32, MVT::f64 }) {
681 // Use ANDPD to simulate FABS.
683
684 // Use XORP to simulate FNEG.
686
687 // Use ANDPD and ORPD to simulate FCOPYSIGN.
689
690 // These might be better off as horizontal vector ops.
693
694 // We don't support sin/cos/fmod
698 }
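  // A rough sketch of "simulate FABS/FNEG" for scalars held in XMM registers:
  //   fabs: andps xmm0, [0x7fffffff mask]   ; clear the sign bit
  //   fneg: xorps xmm0, [0x80000000 mask]   ; flip the sign bit
  // and FCOPYSIGN combines both masks with AND/OR.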
699
700 // Half type will be promoted by default.
701 setF16Action(MVT::f16, Promote);
709
740
741 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
742 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
743
744 // Lower this to MOVMSK plus an AND.
747
748 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
749 (UseX87 || Is64Bit)) {
750 // Use SSE for f32, x87 for f64.
751 // Set up the FP register classes.
752 addRegisterClass(MVT::f32, &X86::FR32RegClass);
753 if (UseX87)
754 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
755
756 // Use ANDPS to simulate FABS.
758
759 // Use XORP to simulate FNEG.
761
762 if (UseX87)
764
765 // Use ANDPS and ORPS to simulate FCOPYSIGN.
766 if (UseX87)
769
770 // We don't support sin/cos/fmod
774
775 if (UseX87) {
776 // Always expand sin/cos functions even though x87 has an instruction.
780 }
781 } else if (UseX87) {
782 // f32 and f64 in x87.
783 // Set up the FP register classes.
784 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
785 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
786
787 for (auto VT : { MVT::f32, MVT::f64 }) {
790
791 // Always expand sin/cos functions even though x87 has an instruction.
795 }
796 }
797
798 // Expand FP32 immediates into loads from the stack, save special cases.
799 if (isTypeLegal(MVT::f32)) {
800 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
801 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
802 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
803 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
804 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
805 } else // SSE immediates.
806 addLegalFPImmediate(APFloat(+0.0f)); // xorps
807 }
808 // Expand FP64 immediates into loads from the stack, save special cases.
809 if (isTypeLegal(MVT::f64)) {
810 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
811 addLegalFPImmediate(APFloat(+0.0)); // FLD0
812 addLegalFPImmediate(APFloat(+1.0)); // FLD1
813 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
814 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
815 } else // SSE immediates.
816 addLegalFPImmediate(APFloat(+0.0)); // xorpd
817 }
818 // Support fp16 0 immediate.
819 if (isTypeLegal(MVT::f16))
820 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
821
822 // Handle constrained floating-point operations of scalar.
835
836 // We don't support FMA.
839
840 // f80 always uses X87.
841 if (UseX87) {
842 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
845 {
847 addLegalFPImmediate(TmpFlt); // FLD0
848 TmpFlt.changeSign();
849 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
850
851 bool ignored;
852 APFloat TmpFlt2(+1.0);
854 &ignored);
855 addLegalFPImmediate(TmpFlt2); // FLD1
856 TmpFlt2.changeSign();
857 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
858 }
859
860 // Always expand sin/cos functions even though x87 has an instruction.
861 // clang-format off
873 // clang-format on
874
886
887 // Handle constrained floating-point operations of scalar.
893 if (isTypeLegal(MVT::f16)) {
896 } else {
898 }
899 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
900 // as Custom.
902 }
903
904 // f128 uses xmm registers, but most operations require libcalls.
905 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
906 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
907 : &X86::VR128RegClass);
908
909 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
910
921
925
926 // clang-format off
934 // clang-format on
935 // No STRICT_FSINCOS
938
941 // We need to custom handle any FP_ROUND with an f128 input, but
942 // LegalizeDAG uses the result type to know when to run a custom handler.
943 // So we have to list all legal floating point result types here.
944 if (isTypeLegal(MVT::f32)) {
947 }
948 if (isTypeLegal(MVT::f64)) {
951 }
952 if (isTypeLegal(MVT::f80)) {
956 }
957
959
960 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
961 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
962 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
963 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
964 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
965 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
966 }
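  // So a plain f128 arithmetic op is never selected to a machine instruction;
  // e.g. an fadd on fp128 typically becomes a call to something like __addtf3,
  // while loads/stores and sign-bit manipulation can stay in XMM registers.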
967
968 // Always use a library call for pow.
969 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
970 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
971 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
972 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
973
982
983 // Some FP actions are always expanded for vector types.
984 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
985 MVT::v4f32, MVT::v8f32, MVT::v16f32,
986 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
987 // clang-format off
1001 // clang-format on
1002 }
1003
1004 // First set operation action for all vector types to either promote
1005 // (for widening) or expand (for scalarization). Then we will selectively
1006 // turn on ones that can be effectively codegen'd.
1046 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1047 setTruncStoreAction(InnerVT, VT, Expand);
1048
1049 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
1050 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
1051
1052 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
1053 // types, we have to deal with them whether we ask for Expansion or not.
1054 // Setting Expand causes its own optimisation problems though, so leave
1055 // them legal.
1056 if (VT.getVectorElementType() == MVT::i1)
1057 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1058
1059 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
1060 // split/scalarized right now.
1061 if (VT.getVectorElementType() == MVT::f16 ||
1062 VT.getVectorElementType() == MVT::bf16)
1063 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1064 }
1065 }
1066
1067 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
1068 // with -msoft-float, disable use of MMX as well.
1069 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
1070 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
1071 // No operations on x86mmx supported, everything uses intrinsics.
1072 }
1073
1074 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1075 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1076 : &X86::VR128RegClass);
1077
1082
1083 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1084 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1092
1093 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1094 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1096
1102 }
1103
1104 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1105 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1106 : &X86::VR128RegClass);
1107
1108 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1109 // registers cannot be used even for integer operations.
1110 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1111 : &X86::VR128RegClass);
1112 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1113 : &X86::VR128RegClass);
1114 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1115 : &X86::VR128RegClass);
1116 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1117 : &X86::VR128RegClass);
1118 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1119 : &X86::VR128RegClass);
1120
1121 for (auto VT : { MVT::f64, MVT::v4f32, MVT::v2f64 }) {
1126 }
1127
1128 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1129 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1134 }
1135
1136 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1137 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1138 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1139
1140 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1141 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1142 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1143 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1144 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1145 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1146 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1147 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1148 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1149 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1152
1153 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1154 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1155 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1156
1157 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1159 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1161
1162 setOperationAction(ISD::LRINT, MVT::v4f32, Custom);
1163
1164 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1165 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1166 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1167 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1168 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1169 }
1170
1181
1186
1187 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1193
1194 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1195 // setcc all the way to isel and prefer SETGT in some isel patterns.
1198 }
1199
1200 setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
1201 setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
1206
1207 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1213 }
1214
1215 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1219
1220 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1221 continue;
1222
1225 }
1226 setF16Action(MVT::v8f16, Expand);
1227 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1228 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1229 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1230 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1231 setOperationAction(ISD::FNEG, MVT::v8f16, Custom);
1232 setOperationAction(ISD::FABS, MVT::v8f16, Custom);
1234
1235 // Custom lower v2i64 and v2f64 selects.
1242
1249
1250 // Custom legalize these to avoid over promotion or custom promotion.
1251 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1256 }
1257
1262
1265
1268
1269 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1274
1279
1280 // We want to legalize this to an f64 load rather than an i64 load on
1281 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1282 // store.
1283 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1284 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1285 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1286 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1287 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1289
1290 // Add 32-bit vector stores to help vectorization opportunities.
1291 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1293
1297 if (!Subtarget.hasAVX512())
1299
1303
1305
1322
1323 // In the customized shift lowering, the legal v4i32/v2i64 cases
1324 // in AVX2 will be recognized.
1325 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1329 if (VT == MVT::v2i64) continue;
1334 }
1335
1341 }
1342
1343 if (Subtarget.hasGFNI()) {
1348 }
1349
1350 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1351 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1352 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1353 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1354
1355 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1358 }
1359
1360 // These might be better off as horizontal vector ops.
1365 }
1366
1367 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1368 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1371 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1375 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1381
1383 }
1384
1385 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1386 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1387 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1388 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1389 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1390 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1391 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1392 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1393
1397
1398 // FIXME: Do we need to handle scalar-to-vector here?
1399 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1400 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1401
1402 // We directly match byte blends in the backend as they match the VSELECT
1403 // condition form.
1405
1406 // SSE41 brings specific instructions for doing vector sign extend even in
1407 // cases where we don't have SRA.
1408 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1411 }
1412
1413 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1414 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1415 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1416 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1417 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1418 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1419 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1420 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1421 }
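  // e.g. a zero-extending load of <4 x i8> to <4 x i32> can now be selected as
  // a single pmovzxbd with a memory operand instead of a load followed by
  // shuffles/unpacks.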
1422
1423 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1424 // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1425 // do the pre and post work in the vector domain.
1428 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1429 // so that DAG combine doesn't try to turn it into uint_to_fp.
1432 }
1433 }
1434
1435 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1437 }
1438
1439 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1440 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1441 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1444 }
1445
1446 // XOP can efficiently perform BITREVERSE with VPPERM.
1447 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1449 }
1450
1451 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1452 bool HasInt256 = Subtarget.hasInt256();
1453
1454 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1455 : &X86::VR256RegClass);
1456 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1457 : &X86::VR256RegClass);
1458 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1459 : &X86::VR256RegClass);
1460 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1461 : &X86::VR256RegClass);
1462 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1463 : &X86::VR256RegClass);
1464 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1465 : &X86::VR256RegClass);
1466 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1467 : &X86::VR256RegClass);
1468
1469 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1482
1484
1488
1494 }
1495
1496 setOperationAction(ISD::LRINT, MVT::v8f32, Custom);
1497 setOperationAction(ISD::LRINT, MVT::v4f64, Custom);
1498
1499 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1500 // even though v8i16 is a legal type.
1501 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1502 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1503 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1504 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1508
1515
1527
1528 if (!Subtarget.hasAVX512())
1530
1531 // In the customized shift lowering, the legal v8i32/v4i64 cases
1532 // in AVX2 will be recognized.
1533 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1539 if (VT == MVT::v4i64) continue;
1544 }
1545
1546 // These types need custom splitting if their input is a 128-bit vector.
1551
1555 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1556 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1559
1560 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1564 }
1565
1570
1571 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1576
1577 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1578 // setcc all the way to isel and prefer SETGT in some isel patterns.
1581 }
1582
1583 setOperationAction(ISD::SETCC, MVT::v4f64, Custom);
1584 setOperationAction(ISD::SETCC, MVT::v8f32, Custom);
1589
1590 if (Subtarget.hasAnyFMA()) {
1591 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1592 MVT::v2f64, MVT::v4f64 }) {
1595 }
1596 }
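  // With any FMA feature available, patterns like (fadd (fmul a, b), c) on
  // these scalar/vector types can be contracted into a single vfmadd*
  // instruction during selection.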
1597
1598 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1599 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1600 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1601 }
1602
1603 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1604 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1605 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1606 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1607
1608 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1609 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1610 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1611 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1612 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1613 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1614 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1615 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1616
1617 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1618 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1619
1620 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1621 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1622 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1623 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1624 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1625
1626 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1627 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1628 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1629 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1630 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1631 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1632 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1633 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1638
1639 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1640 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1641 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1642 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1643 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1644 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1645 }
1646
1647 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1650 }
1651
1652 if (HasInt256) {
1653 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1654 // when we have a 256bit-wide blend with immediate.
1657
1658 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1659 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1660 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1661 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1662 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1663 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1664 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1665 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1666 }
1667 }
1668
1669 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1670 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1671 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1673 }
1674
1675 // Extract subvector is special because the value type
1676 // (result) is 128-bit but the source is 256-bit wide.
1677 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1678 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1680 }
1681
1682 // Custom lower several nodes for 256-bit types.
1683 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1684 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1694 }
1695 setF16Action(MVT::v16f16, Expand);
1696 setOperationAction(ISD::FNEG, MVT::v16f16, Custom);
1697 setOperationAction(ISD::FABS, MVT::v16f16, Custom);
1699 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1700 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1701 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1702 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1703
1704 if (HasInt256) {
1706
1707 // Custom legalize 2x32 to get a little better code.
1710
1711 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1712 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1714 }
1715 }
1716
1717 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1718 Subtarget.hasF16C()) {
1719 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1722 }
1723 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32, MVT::v8f32 }) {
1726 }
1727 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1728 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1729 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1730 }
1731 setOperationAction(ISD::SETCC, MVT::v8f16, Custom);
1732 setOperationAction(ISD::SETCC, MVT::v16f16, Custom);
1733 }
1734
1735 // This block controls legalization of the mask vector sizes that are
1736 // available with AVX512. 512-bit vectors are in a separate block controlled
1737 // by useAVX512Regs.
1738 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1739 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1740 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1741 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1742 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1743 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1744
1748
1749 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1750 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1751 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1752 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1753 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1754 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1755 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1756 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1764
1765 // There is no byte sized k-register load or store without AVX512DQ.
1766 if (!Subtarget.hasDQI()) {
1767 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1768 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1769 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1770 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1771
1776 }
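  // Without AVX512DQ there is no kmovb, so these narrow mask loads/stores are
  // custom lowered (e.g. going through a wider kmovw and trimming the extra
  // bits).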
1777
1778 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1779 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1783 }
1784
1785 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1787
1788 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1792
1799 }
1800
1801 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1803 }
1804 if (Subtarget.hasDQI() && Subtarget.hasVLX()) {
1805 for (MVT VT : {MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1808 }
1809 }
1810
1811 // This block controls legalization for 512-bit operations with 8/16/32/64 bit
1812 // elements. 512-bits can be disabled based on prefer-vector-width and
1813 // required-vector-width function attributes.
1814 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1815 bool HasBWI = Subtarget.hasBWI();
1816
1817 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1818 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1819 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1820 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1821 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1822 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1823 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1824
1825 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1826 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1827 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1828 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1829 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1830 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1831 if (HasBWI)
1832 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1833 }
1834
1835 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1846 }
1847 setOperationAction(ISD::LRINT, MVT::v16f32,
1848 Subtarget.hasDQI() ? Legal : Custom);
1849 setOperationAction(ISD::LRINT, MVT::v8f64,
1850 Subtarget.hasDQI() ? Legal : Custom);
1851 if (Subtarget.hasDQI())
1852 setOperationAction(ISD::LLRINT, MVT::v8f64, Legal);
1853
1854 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1859 }
1860
1861 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1866 }
1867
1874
1886
1887 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1888 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1889 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1890 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1891 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1892 if (HasBWI)
1893 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1894
1895 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1896 // to 512-bit rather than use the AVX2 instructions so that we can use
1897 // k-masks.
1898 if (!Subtarget.hasVLX()) {
1899 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1900 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1903 }
1904 }
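  // i.e. without VLX a masked load/store of, say, v8f32 is widened and done as
  // a 512-bit operation whose k-register mask has the upper lanes cleared,
  // instead of using the AVX/AVX2 vmaskmov forms.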
1905
1907 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1908 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1918
1919 if (HasBWI) {
1920 // Extends from v64i1 masks to 512-bit vectors.
1924 }
1925
1926 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1939
1941 }
1942
1943 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1946 }
1947
1948 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1949 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1950 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1951 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1952
1953 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1954 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1955 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1956 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1957
1958 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1959 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1960 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1961 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1962 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1963 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1964 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1965 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1966
1967 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1968 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1969
1970 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1980
1981 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1982 // setcc all the way to isel and prefer SETGT in some isel patterns.
1985 }
1986
1987 setOperationAction(ISD::SETCC, MVT::v8f64, Custom);
1988 setOperationAction(ISD::SETCC, MVT::v16f32, Custom);
1993
1994 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
2001 }
2002
2003 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2004 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
2005 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
2007 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
2008 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
2009 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
2010 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
2015 }
2016
2017 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
2018 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
2019 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
2020 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
2021 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
2022 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
2023
2024 if (Subtarget.hasDQI()) {
2028 setOperationAction(Opc, MVT::v8i64, Custom);
2029 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
2030 }
2031
2032 if (Subtarget.hasCDI()) {
2033 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
2034 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
2036 }
2037 } // Subtarget.hasCDI()
2038
2039 if (Subtarget.hasVPOPCNTDQ()) {
2040 for (auto VT : { MVT::v16i32, MVT::v8i64 })
2042 }
2043
2044 // Extract subvector is special because the value type
2045 // (result) is 256-bit but the source is 512-bit wide.
2046 // 128-bit was made Legal under AVX1.
2047 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
2048 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
2050
2051 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
2052 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
2062 }
2063 setF16Action(MVT::v32f16, Expand);
2068 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2069 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
2070 setOperationAction(ISD::SETCC, MVT::v32f16, Custom);
2071
2072 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
2077 }
2078 if (HasBWI) {
2079 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
2082 }
2083 } else {
2084 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
2085 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
2086 }
2087
2088 if (Subtarget.hasVBMI2()) {
2089 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
2092 }
2093
2094 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
2095 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
2096 }
2097
2098 setOperationAction(ISD::FNEG, MVT::v32f16, Custom);
2099 setOperationAction(ISD::FABS, MVT::v32f16, Custom);
2101 }// useAVX512Regs
2102
2103 if (!Subtarget.useSoftFloat() && Subtarget.hasVBMI2()) {
2104 for (auto VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v16i16, MVT::v8i32,
2105 MVT::v4i64}) {
2108 }
2109 }
2110
2111 // This block controls legalization for operations that don't have
2112 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
2113 // narrower widths.
2114 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
2115 // These operations are handled on non-VLX by artificially widening in
2116 // isel patterns.
2117
2121
2122 if (Subtarget.hasDQI()) {
2123 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
2124 // v2f32 UINT_TO_FP is already custom under SSE2.
2127 "Unexpected operation action!");
2128 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
2133 }
2134
2135 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
2141 }
2142
2143 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2146 }
2147
2148 // Custom legalize 2x32 to get a little better code.
2151
2152 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2153 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2155
2156 if (Subtarget.hasDQI()) {
2160 setOperationAction(Opc, MVT::v2i64, Custom);
2161 setOperationAction(Opc, MVT::v4i64, Custom);
2162 }
2163 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2164 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2165 }
2166
2167 if (Subtarget.hasCDI()) {
2168 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2170 }
2171 } // Subtarget.hasCDI()
2172
2173 if (Subtarget.hasVPOPCNTDQ()) {
2174 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2176 }
2177
2178 // We can try to convert vectors to different sizes to leverage legal
2179 // `vpcompress` cases. So we mark these supported vector sizes as Custom and
2180 // then specialize to Legal below.
2181 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2182 MVT::v4f64, MVT::v2i64, MVT::v2f64, MVT::v16i8, MVT::v8i16,
2183 MVT::v16i16, MVT::v8i8})
2185
2186 // Legal vpcompress depends on various AVX512 extensions.
2187 // Legal in AVX512F
2188 for (MVT VT : {MVT::v16i32, MVT::v16f32, MVT::v8i64, MVT::v8f64})
2190
2191 // Legal in AVX512F + AVX512VL
2192 if (Subtarget.hasVLX())
2193 for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v4i32, MVT::v4f32, MVT::v4i64,
2194 MVT::v4f64, MVT::v2i64, MVT::v2f64})
2196
2197 // Legal in AVX512F + AVX512VBMI2
2198 if (Subtarget.hasVBMI2())
2199 for (MVT VT : {MVT::v32i16, MVT::v64i8})
2201
2202 // Legal in AVX512F + AVX512VL + AVX512VBMI2
2203 if (Subtarget.hasVBMI2() && Subtarget.hasVLX())
2204 for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v32i8, MVT::v16i16})
2206 }
2207
2208 // This block controls legalization of v32i1/v64i1, which are available with
2209 // AVX512BW.
2210 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2211 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2212 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2213
2214 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2225 }
2226
2227 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2229
2230 // Extends from v32i1 masks to 256-bit vectors.
2234
2235 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2236 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2237 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2238 }
2239
2240 // These operations are handled on non-VLX by artificially widening in
2241 // isel patterns.
2242 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2243
2244 if (Subtarget.hasBITALG()) {
2245 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2247 }
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2251 auto setGroup = [&] (MVT VT) {
2262
2275
2277
2280
2286
2292
2296 };
2297
2298 // AVX512_FP16 scalar operations
2299 setGroup(MVT::f16);
2315
2318
2319 if (Subtarget.useAVX512Regs()) {
2320 setGroup(MVT::v32f16);
2326 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2333
2338 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2340 MVT::v32i16);
2341 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2343 MVT::v32i16);
2344 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2346 MVT::v32i16);
2347 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2349 MVT::v32i16);
2350
2354
2355 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2356 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2357
2362 }
2363
2364 if (Subtarget.hasVLX()) {
2365 setGroup(MVT::v8f16);
2366 setGroup(MVT::v16f16);
2367
2378
2389
2390 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2393
2397
2398 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2399 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2400 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2401 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2402
2403 // Need to custom widen these to prevent scalarization.
2404 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2405 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2406
2411
2416 }
2417 }
2418
2419 if (!Subtarget.useSoftFloat() &&
2420 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2421 addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
2422 : &X86::VR128RegClass);
2423 addRegisterClass(MVT::v16bf16, Subtarget.hasAVX512() ? &X86::VR256XRegClass
2424 : &X86::VR256RegClass);
2425 // The type action for bf16 is TypeSoftPromoteHalf, but there is no promotion
2426 // method for BUILD_VECTOR and INSERT_VECTOR_ELT, so mark those operations
2427 // Custom and do the customization later.
2430 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2431 setF16Action(VT, Expand);
2432 if (!Subtarget.hasBF16())
2438 }
2439 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
2440 setOperationPromotedToType(Opc, MVT::v8bf16, MVT::v8f32);
2441 setOperationPromotedToType(Opc, MVT::v16bf16, MVT::v16f32);
2442 }
2443 setOperationAction(ISD::SETCC, MVT::v8bf16, Custom);
2444 setOperationAction(ISD::SETCC, MVT::v16bf16, Custom);
2446 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2447 }
2448
2449 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16() &&
2450 Subtarget.useAVX512Regs()) {
2451 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2452 setF16Action(MVT::v32bf16, Expand);
2453 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV})
2454 setOperationPromotedToType(Opc, MVT::v32bf16, MVT::v32f32);
2455 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2457 setOperationAction(ISD::FP_ROUND, MVT::v16bf16, Custom);
2461 }
2462
2463 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX10_2()) {
2464 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2476 }
2477 if (Subtarget.hasAVX10_2_512()) {
2478 setOperationAction(ISD::FADD, MVT::v32bf16, Legal);
2479 setOperationAction(ISD::FSUB, MVT::v32bf16, Legal);
2480 setOperationAction(ISD::FMUL, MVT::v32bf16, Legal);
2481 setOperationAction(ISD::FDIV, MVT::v32bf16, Legal);
2482 setOperationAction(ISD::FSQRT, MVT::v32bf16, Legal);
2483 setOperationAction(ISD::FMA, MVT::v32bf16, Legal);
2484 setOperationAction(ISD::SETCC, MVT::v32bf16, Custom);
2485 setOperationAction(ISD::FMINIMUM, MVT::v32bf16, Custom);
2486 setOperationAction(ISD::FMAXIMUM, MVT::v32bf16, Custom);
2489 }
2490 for (auto VT : {MVT::f16, MVT::f32, MVT::f64}) {
2493 }
2494 }
2495
2496 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2497 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2498 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2499 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2500 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2501 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2502
2503 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2504 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2505 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2506 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2507 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2508
2509 if (Subtarget.hasBWI()) {
2510 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2511 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2512 }
2513
2514 if (Subtarget.hasFP16()) {
2515 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2524 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2533 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2538 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2543 }
2544 }
2545
2546 if (!Subtarget.useSoftFloat() && Subtarget.hasAMXTILE()) {
2547 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2548 }
2549
2550 // We want to custom lower some of our intrinsics.
2554 if (!Subtarget.is64Bit()) {
2556 }
2557
2558 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2559 // handle type legalization for these operations here.
2560 //
2561 // FIXME: We really should do custom legalization for addition and
2562 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2563 // than generic legalization for 64-bit multiplication-with-overflow, though.
2564 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2565 if (VT == MVT::i64 && !Subtarget.is64Bit())
2566 continue;
2567 // Add/Sub/Mul with overflow operations are custom lowered.
2574
2575 // Support carry in as value rather than glue.
2581 }
2582
2583 // Combine sin / cos into _sincos_stret if it is available.
2584 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2585 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2588 }
2589
2590 if (Subtarget.isTargetWin64()) {
2591 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2592 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2593 setOperationAction(ISD::SREM, MVT::i128, Custom);
2594 setOperationAction(ISD::UREM, MVT::i128, Custom);
2603 }
2604
2605 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2606 // is. We should promote the value to 64 bits to solve this.
2607 // This is what the CRT headers do - `fmodf` is an inline header
2608 // function casting to f64 and calling `fmod`.
2609 if (Subtarget.is32Bit() &&
2610 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2611 // clang-format off
2612 for (ISD::NodeType Op :
2630 if (isOperationExpand(Op, MVT::f32))
2631 setOperationAction(Op, MVT::f32, Promote);
2632 // clang-format on
2633
2634 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2635 // it, but it's just a wrapper around ldexp.
2636 if (Subtarget.isOSWindows()) {
2638 if (isOperationExpand(Op, MVT::f32))
2639 setOperationAction(Op, MVT::f32, Promote);
2640 }
2641
2642 // We have target-specific dag combine patterns for the following nodes:
2653 ISD::SHL,
2654 ISD::SRA,
2655 ISD::SRL,
2656 ISD::OR,
2657 ISD::AND,
2663 ISD::ADD,
2664 ISD::FADD,
2665 ISD::FSUB,
2666 ISD::FNEG,
2667 ISD::FMA,
2671 ISD::SUB,
2672 ISD::LOAD,
2673 ISD::LRINT,
2675 ISD::MLOAD,
2676 ISD::STORE,
2692 ISD::SETCC,
2693 ISD::MUL,
2694 ISD::XOR,
2705
2707
2708 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2710 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2712 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2714
2715 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2716 // that needs to be benchmarked and balanced with the potential use of vector
2717 // load/store types (PR33329, PR33914).
2720
2721 // Default loop alignment, which can be overridden by -align-loops.
2723
2724 // An out-of-order CPU can speculatively execute past a predictable branch,
2725 // but a conditional move could be stalled by an expensive earlier operation.
2726 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2727 EnableExtLdPromotion = true;
2729
2731
2732 // Default to having -disable-strictnode-mutation on
2733 IsStrictFPEnabled = true;
2734}
2735
2736// This has so far only been implemented for 64-bit MachO.
2738 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2739}
2740
2742 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2743 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2744}
2745
2747 const SDLoc &DL) const {
2748 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2749 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2750 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2751 return SDValue(Node, 0);
2752}
2753
2756 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2757 !Subtarget.hasBWI())
2758 return TypeSplitVector;
2759
2760 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2761 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2762 return TypeSplitVector;
2763
2764 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2765 VT.getVectorElementType() != MVT::i1)
2766 return TypeWidenVector;
2767
2769}
2770
2771FastISel *
2773 const TargetLibraryInfo *libInfo) const {
2774 return X86::createFastISel(funcInfo, libInfo);
2775}
2776
2777//===----------------------------------------------------------------------===//
2778// Other Lowering Hooks
2779//===----------------------------------------------------------------------===//
2780
2782 bool AssumeSingleUse) {
2783 if (!AssumeSingleUse && !Op.hasOneUse())
2784 return false;
2785 if (!ISD::isNormalLoad(Op.getNode()))
2786 return false;
2787
2788 // If this is an unaligned vector, make sure the target supports folding it.
2789 auto *Ld = cast<LoadSDNode>(Op.getNode());
2790 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
2791 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
2792 return false;
2793
2794 // TODO: If this is a non-temporal load and the target has an instruction
2795 // for it, it should not be folded. See "useNonTemporalLoad()".
2796
2797 return true;
2798}
2799
2801 const X86Subtarget &Subtarget,
2802 bool AssumeSingleUse) {
2803 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
2804 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
2805 return false;
2806
2807 // We cannot replace a wide volatile load with a broadcast-from-memory,
2808 // because that would narrow the load, which isn't legal for volatiles.
2809 auto *Ld = cast<LoadSDNode>(Op.getNode());
2810 return !Ld->isVolatile() ||
2811 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
2812}
2813
2815 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->user_begin());
2816}
2817
2819 if (Op.hasOneUse()) {
2820 unsigned Opcode = Op.getNode()->user_begin()->getOpcode();
2821 return (ISD::ZERO_EXTEND == Opcode);
2822 }
2823 return false;
2824}
2825
2826static bool isLogicOp(unsigned Opcode) {
2827 // TODO: Add support for X86ISD::FAND/FOR/FXOR/FANDN with test coverage.
2828 return ISD::isBitwiseLogicOp(Opcode) || X86ISD::ANDNP == Opcode;
2829}
2830
2831static bool isTargetShuffle(unsigned Opcode) {
2832 switch(Opcode) {
2833 default: return false;
2834 case X86ISD::BLENDI:
2835 case X86ISD::PSHUFB:
2836 case X86ISD::PSHUFD:
2837 case X86ISD::PSHUFHW:
2838 case X86ISD::PSHUFLW:
2839 case X86ISD::SHUFP:
2840 case X86ISD::INSERTPS:
2841 case X86ISD::EXTRQI:
2842 case X86ISD::INSERTQI:
2843 case X86ISD::VALIGN:
2844 case X86ISD::PALIGNR:
2845 case X86ISD::VSHLDQ:
2846 case X86ISD::VSRLDQ:
2847 case X86ISD::MOVLHPS:
2848 case X86ISD::MOVHLPS:
2849 case X86ISD::MOVSHDUP:
2850 case X86ISD::MOVSLDUP:
2851 case X86ISD::MOVDDUP:
2852 case X86ISD::MOVSS:
2853 case X86ISD::MOVSD:
2854 case X86ISD::MOVSH:
2855 case X86ISD::UNPCKL:
2856 case X86ISD::UNPCKH:
2857 case X86ISD::VBROADCAST:
2858 case X86ISD::VPERMILPI:
2859 case X86ISD::VPERMILPV:
2860 case X86ISD::VPERM2X128:
2861 case X86ISD::SHUF128:
2862 case X86ISD::VPERMIL2:
2863 case X86ISD::VPERMI:
2864 case X86ISD::VPPERM:
2865 case X86ISD::VPERMV:
2866 case X86ISD::VPERMV3:
2867 case X86ISD::VZEXT_MOVL:
2868 return true;
2869 }
2870}
2871
2872static bool isTargetShuffleVariableMask(unsigned Opcode) {
2873 switch (Opcode) {
2874 default: return false;
2875 // Target Shuffles.
2876 case X86ISD::PSHUFB:
2877 case X86ISD::VPERMILPV:
2878 case X86ISD::VPERMIL2:
2879 case X86ISD::VPPERM:
2880 case X86ISD::VPERMV:
2881 case X86ISD::VPERMV3:
2882 return true;
2883 // 'Faux' Target Shuffles.
2884 case ISD::OR:
2885 case ISD::AND:
2886 case X86ISD::ANDNP:
2887 return true;
2888 }
2889}
2890
2893 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
2895 int ReturnAddrIndex = FuncInfo->getRAIndex();
2896
2897 if (ReturnAddrIndex == 0) {
2898 // Set up a frame object for the return address.
2899 unsigned SlotSize = RegInfo->getSlotSize();
2900 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
2901 -(int64_t)SlotSize,
2902 false);
2903 FuncInfo->setRAIndex(ReturnAddrIndex);
2904 }
2905
2906 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
2907}
2908
2910 bool HasSymbolicDisplacement) {
2911 // Offset should fit into 32 bit immediate field.
2912 if (!isInt<32>(Offset))
2913 return false;
2914
2915 // If we don't have a symbolic displacement - we don't have any extra
2916 // restrictions.
2917 if (!HasSymbolicDisplacement)
2918 return true;
2919
2920 // We can fold large offsets in the large code model because we always use
2921 // 64-bit offsets.
2922 if (CM == CodeModel::Large)
2923 return true;
2924
2925 // For the kernel code model we know that all objects reside in the negative
2926 // half of the 32-bit address space. We do not accept negative offsets, since
2927 // they may be just out of range, but we may accept pretty large positive ones.
2928 if (CM == CodeModel::Kernel)
2929 return Offset >= 0;
2930
2931 // For other non-large code models we assume that the last small object ends
2932 // at least 16MB before the 31-bit boundary. We may also accept pretty large
2933 // negative constants, knowing that all objects are in the positive half of the
2934 // address space.
2935 return Offset < 16 * 1024 * 1024;
2936}
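// Illustrative values for the rules above (examples only, assuming a symbolic
// displacement is present):
//   CodeModel::Large:  Offset == (1 << 30)       -> accepted; 64-bit offsets are used.
//   CodeModel::Kernel: Offset == -8              -> rejected; objects live in the
//                                                  negative half, so negative offsets
//                                                  may fall out of range.
//   CodeModel::Small:  Offset == 8 * 1024 * 1024 -> accepted; still within the 16MB
//                                                  slack below the 31-bit boundary.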
2937
2938/// Return true if the condition is a signed comparison operation.
2939static bool isX86CCSigned(unsigned X86CC) {
2940 switch (X86CC) {
2941 default:
2942 llvm_unreachable("Invalid integer condition!");
2943 case X86::COND_E:
2944 case X86::COND_NE:
2945 case X86::COND_B:
2946 case X86::COND_A:
2947 case X86::COND_BE:
2948 case X86::COND_AE:
2949 return false;
2950 case X86::COND_G:
2951 case X86::COND_GE:
2952 case X86::COND_L:
2953 case X86::COND_LE:
2954 return true;
2955 }
2956}
2957
2959 switch (SetCCOpcode) {
2960 // clang-format off
2961 default: llvm_unreachable("Invalid integer condition!");
2962 case ISD::SETEQ: return X86::COND_E;
2963 case ISD::SETGT: return X86::COND_G;
2964 case ISD::SETGE: return X86::COND_GE;
2965 case ISD::SETLT: return X86::COND_L;
2966 case ISD::SETLE: return X86::COND_LE;
2967 case ISD::SETNE: return X86::COND_NE;
2968 case ISD::SETULT: return X86::COND_B;
2969 case ISD::SETUGT: return X86::COND_A;
2970 case ISD::SETULE: return X86::COND_BE;
2971 case ISD::SETUGE: return X86::COND_AE;
2972 // clang-format on
2973 }
2974}
2975
2976/// Do a one-to-one translation of an ISD::CondCode to the X86-specific
2977/// condition code, returning the condition code and the LHS/RHS of the
2978/// comparison to make.
2980 bool isFP, SDValue &LHS, SDValue &RHS,
2981 SelectionDAG &DAG) {
2982 if (!isFP) {
2983 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
2984 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
2985 // X > -1 -> X == 0, jump !sign.
2986 RHS = DAG.getConstant(0, DL, RHS.getValueType());
2987 return X86::COND_NS;
2988 }
2989 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
2990 // X < 0 -> X == 0, jump on sign.
2991 return X86::COND_S;
2992 }
2993 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
2994 // X >= 0 -> X == 0, jump on !sign.
2995 return X86::COND_NS;
2996 }
2997 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
2998 // X < 1 -> X <= 0
2999 RHS = DAG.getConstant(0, DL, RHS.getValueType());
3000 return X86::COND_LE;
3001 }
3002 }
3003
3004 return TranslateIntegerX86CC(SetCCOpcode);
3005 }
3006
3007 // First determine if it is required or is profitable to flip the operands.
3008
3009 // If LHS is a foldable load, but RHS is not, flip the condition.
3010 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
3011 !ISD::isNON_EXTLoad(RHS.getNode())) {
3012 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
3013 std::swap(LHS, RHS);
3014 }
3015
3016 switch (SetCCOpcode) {
3017 default: break;
3018 case ISD::SETOLT:
3019 case ISD::SETOLE:
3020 case ISD::SETUGT:
3021 case ISD::SETUGE:
3022 std::swap(LHS, RHS);
3023 break;
3024 }
3025
3026 // On a floating point condition, the flags are set as follows:
3027 // ZF PF CF op
3028 // 0 | 0 | 0 | X > Y
3029 // 0 | 0 | 1 | X < Y
3030 // 1 | 0 | 0 | X == Y
3031 // 1 | 1 | 1 | unordered
3032 switch (SetCCOpcode) {
3033 // clang-format off
3034 default: llvm_unreachable("Condcode should be pre-legalized away");
3035 case ISD::SETUEQ:
3036 case ISD::SETEQ: return X86::COND_E;
3037 case ISD::SETOLT: // flipped
3038 case ISD::SETOGT:
3039 case ISD::SETGT: return X86::COND_A;
3040 case ISD::SETOLE: // flipped
3041 case ISD::SETOGE:
3042 case ISD::SETGE: return X86::COND_AE;
3043 case ISD::SETUGT: // flipped
3044 case ISD::SETULT:
3045 case ISD::SETLT: return X86::COND_B;
3046 case ISD::SETUGE: // flipped
3047 case ISD::SETULE:
3048 case ISD::SETLE: return X86::COND_BE;
3049 case ISD::SETONE:
3050 case ISD::SETNE: return X86::COND_NE;
3051 case ISD::SETUO: return X86::COND_P;
3052 case ISD::SETO: return X86::COND_NP;
3053 case ISD::SETOEQ:
3054 case ISD::SETUNE: return X86::COND_INVALID;
3055 // clang-format on
3056 }
3057}
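// Worked example for the mapping above (illustrative only):
//   x olt y -> the operands were swapped earlier, so we effectively test y > x
//              and return COND_A, i.e. CF==0 && ZF==0 per the flag table.
//   x oeq y -> would need two flag tests (ZF==1 && PF==0), so COND_INVALID is
//              returned and the caller lowers it with a pair of conditions.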
3058
3059/// Is there a floating point cmov for the specific X86 condition code?
3060/// Current x86 ISA includes the following FP cmov instructions:
3061/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
3062static bool hasFPCMov(unsigned X86CC) {
3063 switch (X86CC) {
3064 default:
3065 return false;
3066 case X86::COND_B:
3067 case X86::COND_BE:
3068 case X86::COND_E:
3069 case X86::COND_P:
3070 case X86::COND_A:
3071 case X86::COND_AE:
3072 case X86::COND_NE:
3073 case X86::COND_NP:
3074 return true;
3075 }
3076}
3077
3078static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
3079 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
3080 VT.is512BitVector();
3081}
3082
3084 const CallInst &I,
3085 MachineFunction &MF,
3086 unsigned Intrinsic) const {
3088 Info.offset = 0;
3089
3090 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
3091 if (!IntrData) {
3092 switch (Intrinsic) {
3093 case Intrinsic::x86_aesenc128kl:
3094 case Intrinsic::x86_aesdec128kl:
3096 Info.ptrVal = I.getArgOperand(1);
3097 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3098 Info.align = Align(1);
3100 return true;
3101 case Intrinsic::x86_aesenc256kl:
3102 case Intrinsic::x86_aesdec256kl:
3104 Info.ptrVal = I.getArgOperand(1);
3105 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3106 Info.align = Align(1);
3108 return true;
3109 case Intrinsic::x86_aesencwide128kl:
3110 case Intrinsic::x86_aesdecwide128kl:
3112 Info.ptrVal = I.getArgOperand(0);
3113 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
3114 Info.align = Align(1);
3116 return true;
3117 case Intrinsic::x86_aesencwide256kl:
3118 case Intrinsic::x86_aesdecwide256kl:
3120 Info.ptrVal = I.getArgOperand(0);
3121 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
3122 Info.align = Align(1);
3124 return true;
3125 case Intrinsic::x86_cmpccxadd32:
3126 case Intrinsic::x86_cmpccxadd64:
3127 case Intrinsic::x86_atomic_bts:
3128 case Intrinsic::x86_atomic_btc:
3129 case Intrinsic::x86_atomic_btr: {
3131 Info.ptrVal = I.getArgOperand(0);
3132 unsigned Size = I.getType()->getScalarSizeInBits();
3133 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3134 Info.align = Align(Size);
3137 return true;
3138 }
3139 case Intrinsic::x86_atomic_bts_rm:
3140 case Intrinsic::x86_atomic_btc_rm:
3141 case Intrinsic::x86_atomic_btr_rm: {
3143 Info.ptrVal = I.getArgOperand(0);
3144 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3145 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3146 Info.align = Align(Size);
3149 return true;
3150 }
3151 case Intrinsic::x86_aadd32:
3152 case Intrinsic::x86_aadd64:
3153 case Intrinsic::x86_aand32:
3154 case Intrinsic::x86_aand64:
3155 case Intrinsic::x86_aor32:
3156 case Intrinsic::x86_aor64:
3157 case Intrinsic::x86_axor32:
3158 case Intrinsic::x86_axor64:
3159 case Intrinsic::x86_atomic_add_cc:
3160 case Intrinsic::x86_atomic_sub_cc:
3161 case Intrinsic::x86_atomic_or_cc:
3162 case Intrinsic::x86_atomic_and_cc:
3163 case Intrinsic::x86_atomic_xor_cc: {
3165 Info.ptrVal = I.getArgOperand(0);
3166 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
3167 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
3168 Info.align = Align(Size);
3171 return true;
3172 }
3173 }
3174 return false;
3175 }
3176
3177 switch (IntrData->Type) {
3180 case TRUNCATE_TO_MEM_VI32: {
3182 Info.ptrVal = I.getArgOperand(0);
3183 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
3185 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
3186 ScalarVT = MVT::i8;
3187 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
3188 ScalarVT = MVT::i16;
3189 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
3190 ScalarVT = MVT::i32;
3191
3192 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
3193 Info.align = Align(1);
3195 break;
3196 }
3197 case GATHER:
3198 case GATHER_AVX2: {
3200 Info.ptrVal = nullptr;
3201 MVT DataVT = MVT::getVT(I.getType());
3202 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3203 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3204 IndexVT.getVectorNumElements());
3205 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3206 Info.align = Align(1);
3208 break;
3209 }
3210 case SCATTER: {
3212 Info.ptrVal = nullptr;
3213 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
3214 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
3215 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
3216 IndexVT.getVectorNumElements());
3217 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
3218 Info.align = Align(1);
3220 break;
3221 }
3222 default:
3223 return false;
3224 }
3225
3226 return true;
3227}
3228
3229/// Returns true if the target can instruction select the
3230/// specified FP immediate natively. If false, the legalizer will
3231/// materialize the FP immediate as a load from a constant pool.
3233 bool ForCodeSize) const {
3234 for (const APFloat &FPImm : LegalFPImmediates)
3235 if (Imm.bitwiseIsEqual(FPImm))
3236 return true;
3237 return false;
3238}
3239
3241 ISD::LoadExtType ExtTy,
3242 EVT NewVT) const {
3243 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
3244
3245 // "ELF Handling for Thread-Local Storage" specifies that the R_X86_64_GOTTPOFF
3246 // relocation must target a movq or addq instruction: don't let the load shrink.
3247 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
3248 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
3249 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
3250 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
3251
3252 // If this is (1) an AVX vector load with (2) multiple uses and (3) all of
3253 // those uses are extracted directly into a store, then the extract + store
3254 // can be store-folded. Therefore, it's probably not worth splitting the load.
3255 EVT VT = Load->getValueType(0);
3256 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
3257 for (SDUse &Use : Load->uses()) {
3258 // Skip uses of the chain value. Result 0 of the node is the load value.
3259 if (Use.getResNo() != 0)
3260 continue;
3261
3262 SDNode *User = Use.getUser();
3263
3264 // If this use is not an extract + store, it's probably worth splitting.
3265 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR || !User->hasOneUse() ||
3266 User->user_begin()->getOpcode() != ISD::STORE)
3267 return true;
3268 }
3269 // All non-chain uses are extract + store.
3270 return false;
3271 }
3272
3273 return true;
3274}
3275
3276/// Returns true if it is beneficial to convert a load of a constant
3277/// to just the constant itself.
3279 Type *Ty) const {
3280 assert(Ty->isIntegerTy());
3281
3282 unsigned BitSize = Ty->getPrimitiveSizeInBits();
3283 if (BitSize == 0 || BitSize > 64)
3284 return false;
3285 return true;
3286}
3287
3289 // If we are using XMM registers in the ABI and the condition of the select is
3290 // a floating-point compare and we have blendv or conditional move, then it is
3291 // cheaper to select instead of doing a cross-register move and creating a
3292 // load that depends on the compare result.
3293 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
3294 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
3295}
3296
3298 // TODO: It might be a win to ease or lift this restriction, but the generic
3299 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
3300 if (VT.isVector() && Subtarget.hasAVX512())
3301 return false;
3302
3303 return true;
3304}
3305
3307 SDValue C) const {
3308 // TODO: We handle scalars using custom code, but generic combining could make
3309 // that unnecessary.
3310 APInt MulC;
3311 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
3312 return false;
3313
3314 // Find the type this will be legalized to. Otherwise we might prematurely
3315 // convert this to shl+add/sub and then still have to type legalize those ops.
3316 // Another choice would be to defer the decision for illegal types until
3317 // after type legalization. But constant splat vectors of i64 can't make it
3318 // through type legalization on 32-bit targets so we would need to special
3319 // case vXi64.
3320 while (getTypeAction(Context, VT) != TypeLegal)
3321 VT = getTypeToTransformTo(Context, VT);
3322
3323 // If vector multiply is legal, assume that's faster than shl + add/sub.
3324 // Multiply is a complex op with higher latency and lower throughput in
3325 // most implementations; sub-vXi32 vector multiplies are always fast,
3326 // vXi32 must not have a slow PMULLD implementation, and anything larger (vXi64)
3327 // is always going to be slow.
3328 unsigned EltSizeInBits = VT.getScalarSizeInBits();
3329 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
3330 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
3331 return false;
3332
3333 // shl+add, shl+sub, shl+add+neg
3334 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
3335 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
3336}
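// Examples of splat constants that pass the check above (illustrative):
//   MulC ==  9: (MulC - 1) == 8   ->  (x << 3) + x
//   MulC ==  7: (MulC + 1) == 8   ->  (x << 3) - x
//   MulC == -3: (1 - MulC) == 4   ->  x - (x << 2)
//   MulC == -9: -(MulC + 1) == 8  ->  -((x << 3) + x)   (shl+add+neg)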
3337
3339 unsigned Index) const {
3341 return false;
3342
3343 // Mask vectors support all subregister combinations and operations that
3344 // extract half of vector.
3345 if (ResVT.getVectorElementType() == MVT::i1)
3346 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
3347 (Index == ResVT.getVectorNumElements()));
3348
3349 return (Index % ResVT.getVectorNumElements()) == 0;
3350}
3351
3353 unsigned Opc = VecOp.getOpcode();
3354
3355 // Assume target opcodes can't be scalarized.
3356 // TODO - do we have any exceptions?
3357 if (Opc >= ISD::BUILTIN_OP_END || !isBinOp(Opc))
3358 return false;
3359
3360 // If the vector op is not supported, try to convert to scalar.
3361 EVT VecVT = VecOp.getValueType();
3362 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
3363 return true;
3364
3365 // If the vector op is supported, but the scalar op is not, the transform may
3366 // not be worthwhile.
3367 EVT ScalarVT = VecVT.getScalarType();
3368 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
3369}
3370
3372 bool) const {
3373 // TODO: Allow vectors?
3374 if (VT.isVector())
3375 return false;
3376 return VT.isSimple() || !isOperationExpand(Opcode, VT);
3377}
3378
3380 // Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
3381 return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
3382 (!Ty->isVectorTy() &&
3383 Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
3384}
3385
3387 // Speculate ctlz only if we can directly use LZCNT.
3388 return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
3389}
3390
3392 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
3393 // expensive than a straight movsd. On the other hand, it's important to
3394 // shrink long double fp constants since fldt is very slow.
3395 return !Subtarget.hasSSE2() || VT == MVT::f80;
3396}
3397
3399 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
3400 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
3401}
3402
3404 const SelectionDAG &DAG,
3405 const MachineMemOperand &MMO) const {
3406 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
3407 BitcastVT.getVectorElementType() == MVT::i1)
3408 return false;
3409
3410 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
3411 return false;
3412
3413 // If both types are legal vectors, it's always ok to convert them.
3414 if (LoadVT.isVector() && BitcastVT.isVector() &&
3415 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
3416 return true;
3417
3418 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
3419}
3420
3422 const MachineFunction &MF) const {
3423 // Do not merge to float value size (128 bytes) if the NoImplicitFloat
3424 // attribute is set.
3425 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
3426
3427 if (NoFloat) {
3428 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
3429 return (MemVT.getSizeInBits() <= MaxIntSize);
3430 }
3431 // Make sure we don't merge greater than our preferred vector
3432 // width.
3433 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
3434 return false;
3435
3436 return true;
3437}
3438
3440 return Subtarget.hasFastLZCNT();
3441}
3442
3444 const Instruction &AndI) const {
3445 return true;
3446}
3447
3449 EVT VT = Y.getValueType();
3450
3451 if (VT.isVector())
3452 return false;
3453
3454 if (!Subtarget.hasBMI())
3455 return false;
3456
3457 // There are only 32-bit and 64-bit forms for 'andn'.
3458 if (VT != MVT::i32 && VT != MVT::i64)
3459 return false;
3460
3461 return !isa<ConstantSDNode>(Y) || cast<ConstantSDNode>(Y)->isOpaque();
3462}
3463
3465 EVT VT = Y.getValueType();
3466
3467 if (!VT.isVector())
3468 return hasAndNotCompare(Y);
3469
3470 // Vector.
3471
3472 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
3473 return false;
3474
3475 if (VT == MVT::v4i32)
3476 return true;
3477
3478 return Subtarget.hasSSE2();
3479}
3480
3482 return X.getValueType().isScalarInteger(); // 'bt'
3483}
3484
3488 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
3489 SelectionDAG &DAG) const {
3490 // Does baseline recommend not to perform the fold by default?
3492 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
3493 return false;
3494 // For scalars this transform is always beneficial.
3495 if (X.getValueType().isScalarInteger())
3496 return true;
3497 // If all the shift amounts are identical, then the transform is beneficial even
3498 // with rudimentary SSE2 shifts.
3499 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
3500 return true;
3501 // If we have AVX2 with its powerful shift operations, then it's also good.
3502 if (Subtarget.hasAVX2())
3503 return true;
3504 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
3505 return NewShiftOpcode == ISD::SHL;
3506}
3507
3509 EVT VT, unsigned ShiftOpc, bool MayTransformRotate,
3510 const APInt &ShiftOrRotateAmt, const std::optional<APInt> &AndMask) const {
3511 if (!VT.isInteger())
3512 return ShiftOpc;
3513
3514 bool PreferRotate = false;
3515 if (VT.isVector()) {
3516 // For vectors, if we have rotate instruction support, then it's definitely
3517 // best. Otherwise it's not clear what's best, so just don't make changes.
3518 PreferRotate = Subtarget.hasAVX512() && (VT.getScalarType() == MVT::i32 ||
3519 VT.getScalarType() == MVT::i64);
3520 } else {
3521 // For scalars, if we have BMI2, prefer rotate for rorx. Otherwise prefer
3522 // rotate unless we have a zext mask+shr.
3523 PreferRotate = Subtarget.hasBMI2();
3524 if (!PreferRotate) {
3525 unsigned MaskBits =
3526 VT.getScalarSizeInBits() - ShiftOrRotateAmt.getZExtValue();
3527 PreferRotate = (MaskBits != 8) && (MaskBits != 16) && (MaskBits != 32);
3528 }
3529 }
3530
3531 if (ShiftOpc == ISD::SHL || ShiftOpc == ISD::SRL) {
3532 assert(AndMask.has_value() && "Null andmask when querying about shift+and");
3533
3534 if (PreferRotate && MayTransformRotate)
3535 return ISD::ROTL;
3536
3537 // For vectors we don't really get much benefit from swapping constants around.
3538 // Maybe we could check if the DAG has the flipped node already in the
3539 // future.
3540 if (VT.isVector())
3541 return ShiftOpc;
3542
3543 // See if it's beneficial to swap the shift type.
3544 if (ShiftOpc == ISD::SHL) {
3545 // If the current setup has imm64 mask, then inverse will have
3546 // at least imm32 mask (or be zext i32 -> i64).
3547 if (VT == MVT::i64)
3548 return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL
3549 : ShiftOpc;
3550
3551 // We can only benefit if the mask requires at least 7 bits. We
3552 // don't want to replace shl of 1,2,3 as they can be implemented
3553 // with lea/add.
3554 return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc;
3555 }
3556
3557 if (VT == MVT::i64)
3558 // Keep srl if the imm64 mask is a plain 32-bit zext (i32 -> i64), which is
3559 // extremely efficient.
3560 return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc;
3561
3562 // Keep small shifts as shl so we can generate add/lea.
3563 return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc;
3564 }
3565
3566 // We prefer rotate for vectors, or if we won't get a zext mask with SRL
3567 // (PreferRotate will be set in the latter case).
3568 if (PreferRotate || !MayTransformRotate || VT.isVector())
3569 return ShiftOpc;
3570
3571 // Non-vector type and we have a zext mask with SRL.
3572 return ISD::SRL;
3573}
3574
3577 const Value *Lhs,
3578 const Value *Rhs) const {
3579 using namespace llvm::PatternMatch;
3580 int BaseCost = BrMergingBaseCostThresh.getValue();
3581 // With CCMP, branches can be merged in a more efficient way.
3582 if (BaseCost >= 0 && Subtarget.hasCCMP())
3583 BaseCost += BrMergingCcmpBias;
3584 // a == b && a == c is a fast pattern on x86.
3585 if (BaseCost >= 0 && Opc == Instruction::And &&
3588 BaseCost += 1;
3589 return {BaseCost, BrMergingLikelyBias.getValue(),
3590 BrMergingUnlikelyBias.getValue()};
3591}
3592
3594 return N->getOpcode() != ISD::FP_EXTEND;
3595}
3596
3598 const SDNode *N, CombineLevel Level) const {
3599 assert(((N->getOpcode() == ISD::SHL &&
3600 N->getOperand(0).getOpcode() == ISD::SRL) ||
3601 (N->getOpcode() == ISD::SRL &&
3602 N->getOperand(0).getOpcode() == ISD::SHL)) &&
3603 "Expected shift-shift mask");
3604 // TODO: Should we always create i64 masks? Or only folded immediates?
3605 EVT VT = N->getValueType(0);
3606 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
3607 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
3608 // Only fold if the shift values are equal - so it folds to AND.
3609 // TODO - we should fold if either is a non-uniform vector but we don't do
3610 // the fold for non-splats yet.
3611 return N->getOperand(1) == N->getOperand(0).getOperand(1);
3612 }
3614}
3615
3617 EVT VT = Y.getValueType();
3618
3619 // For vectors, we don't have a preference, but we probably want a mask.
3620 if (VT.isVector())
3621 return false;
3622
3623 // 64-bit shifts on 32-bit targets produce really bad bloated code.
3624 if (VT == MVT::i64 && !Subtarget.is64Bit())
3625 return false;
3626
3627 return true;
3628}
3629
3632 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
3634 !Subtarget.isOSWindows())
3637 ExpansionFactor);
3638}
3639
3641 // Any legal vector type can be splatted more efficiently than
3642 // loading/spilling from memory.
3643 return isTypeLegal(VT);
3644}
3645
3647 MVT VT = MVT::getIntegerVT(NumBits);
3648 if (isTypeLegal(VT))
3649 return VT;
3650
3651 // PMOVMSKB can handle this.
3652 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
3653 return MVT::v16i8;
3654
3655 // VPMOVMSKB can handle this.
3656 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
3657 return MVT::v32i8;
3658
3659 // TODO: Allow 64-bit type for 32-bit target.
3660 // TODO: 512-bit types should be allowed, but make sure that those
3661 // cases are handled in combineVectorSizedSetCCEquality().
3662
3664}
3665
3666/// Val is the undef sentinel value or equal to the specified value.
3667static bool isUndefOrEqual(int Val, int CmpVal) {
3668 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
3669}
3670
3671/// Return true if every element in Mask is the undef sentinel value or equal to
3672/// the specified value.
3673static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
3674 return llvm::all_of(Mask, [CmpVal](int M) {
3675 return (M == SM_SentinelUndef) || (M == CmpVal);
3676 });
3677}
3678
3679/// Return true if every element in Mask, beginning from position Pos and ending
3680/// in Pos+Size is the undef sentinel value or equal to the specified value.
3681static bool isUndefOrEqualInRange(ArrayRef<int> Mask, int CmpVal, unsigned Pos,
3682 unsigned Size) {
3683 return llvm::all_of(Mask.slice(Pos, Size),
3684 [CmpVal](int M) { return isUndefOrEqual(M, CmpVal); });
3685}
3686
3687/// Val is either the undef or zero sentinel value.
3688static bool isUndefOrZero(int Val) {
3689 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
3690}
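// For reference: the shuffle mask helpers in this file use SM_SentinelUndef (-1)
// for "don't care" lanes and SM_SentinelZero (-2) for lanes known to be zero, so
// e.g. a two-input v4i32 mask of {0, -1, -2, 5} reads as: element 0 of Op0,
// undef, zero, element 1 of Op1.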
3691
3692/// Return true if every element in Mask, beginning from position Pos and ending
3693/// in Pos+Size is the undef sentinel value.
3694static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
3695 return llvm::all_of(Mask.slice(Pos, Size),
3696 [](int M) { return M == SM_SentinelUndef; });
3697}
3698
3699/// Return true if the mask creates a vector whose lower half is undefined.
3701 unsigned NumElts = Mask.size();
3702 return isUndefInRange(Mask, 0, NumElts / 2);
3703}
3704
3705/// Return true if the mask creates a vector whose upper half is undefined.
3707 unsigned NumElts = Mask.size();
3708 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
3709}
3710
3711/// Return true if Val falls within the specified range [Low, Hi).
3712static bool isInRange(int Val, int Low, int Hi) {
3713 return (Val >= Low && Val < Hi);
3714}
3715
3716/// Return true if the value of any element in Mask falls within the specified
3717/// range [Low, Hi).
3718static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
3719 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
3720}
3721
3722/// Return true if the value of any element in Mask is the zero sentinel value.
3723static bool isAnyZero(ArrayRef<int> Mask) {
3724 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
3725}
3726
3727/// Return true if Val is undef or if its value falls within the
3728/// specified range [Low, Hi).
3729static bool isUndefOrInRange(int Val, int Low, int Hi) {
3730 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
3731}
3732
3733/// Return true if every element in Mask is undef or if its value
3734/// falls within the specified range [Low, Hi).
3735static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3736 return llvm::all_of(
3737 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
3738}
3739
3740/// Return true if Val is undef, zero or if its value falls within the
3741/// specified range [Low, Hi).
3742static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
3743 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
3744}
3745
3746/// Return true if every element in Mask is undef, zero or if its value
3747/// falls within the specified range [Low, Hi).
3748static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
3749 return llvm::all_of(
3750 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
3751}
3752
3753/// Return true if every element in Mask is an in-place blend/select mask or is
3754/// undef.
3756 unsigned NumElts = Mask.size();
3757 for (auto [I, M] : enumerate(Mask))
3758 if (!isUndefOrEqual(M, I) && !isUndefOrEqual(M, I + NumElts))
3759 return false;
3760 return true;
3761}
3762
3763/// Return true if every element in Mask, beginning
3764/// from position Pos and ending in Pos + Size, falls within the specified
3765/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
3766static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
3767 unsigned Size, int Low, int Step = 1) {
3768 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3769 if (!isUndefOrEqual(Mask[i], Low))
3770 return false;
3771 return true;
3772}
3773
3774/// Return true if every element in Mask, beginning
3775/// from position Pos and ending in Pos+Size, falls within the specified
3776/// sequential range [Low, Low+Size), or is undef or is zero.
3778 unsigned Size, int Low,
3779 int Step = 1) {
3780 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
3781 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
3782 return false;
3783 return true;
3784}
3785
3786/// Return true if every element in Mask, beginning
3787/// from position Pos and ending in Pos+Size is undef or is zero.
3788static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
3789 unsigned Size) {
3790 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
3791}
3792
3793/// Return true if every element of a single input is referenced by the shuffle
3794/// mask. i.e. it just permutes them all.
3796 unsigned NumElts = Mask.size();
3797 APInt DemandedElts = APInt::getZero(NumElts);
3798 for (int M : Mask)
3799 if (isInRange(M, 0, NumElts))
3800 DemandedElts.setBit(M);
3801 return DemandedElts.isAllOnes();
3802}
3803
3804/// Helper function to test whether a shuffle mask could be
3805/// simplified by widening the elements being shuffled.
3806///
3807/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
3808/// leaves it in an unspecified state.
3809///
3810/// NOTE: This must handle normal vector shuffle masks and *target* vector
3811/// shuffle masks. The latter have the special property of a '-2' representing
3812/// a zero-ed lane of a vector.
3814 SmallVectorImpl<int> &WidenedMask) {
3815 WidenedMask.assign(Mask.size() / 2, 0);
3816 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
3817 int M0 = Mask[i];
3818 int M1 = Mask[i + 1];
3819
3820 // If both elements are undef, it's trivial.
3821 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
3822 WidenedMask[i / 2] = SM_SentinelUndef;
3823 continue;
3824 }
3825
3826 // Check for an undef mask and a mask value properly aligned to fit with
3827 // a pair of values. If we find such a case, use the non-undef mask's value.
3828 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
3829 WidenedMask[i / 2] = M1 / 2;
3830 continue;
3831 }
3832 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
3833 WidenedMask[i / 2] = M0 / 2;
3834 continue;
3835 }
3836
3837 // When zeroing, we need to spread the zeroing across both lanes to widen.
3838 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
3839 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
3840 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
3841 WidenedMask[i / 2] = SM_SentinelZero;
3842 continue;
3843 }
3844 return false;
3845 }
3846
3847 // Finally check if the two mask values are adjacent and aligned with
3848 // a pair.
3849 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
3850 WidenedMask[i / 2] = M0 / 2;
3851 continue;
3852 }
3853
3854 // Otherwise we can't safely widen the elements used in this shuffle.
3855 return false;
3856 }
3857 assert(WidenedMask.size() == Mask.size() / 2 &&
3858 "Incorrect size of mask after widening the elements!");
3859
3860 return true;
3861}
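// Illustration of the widening rules above: the v8i16 mask
//   {0,1, 2,3, -1,-1, 8,9}   widens to the v4i32 mask {0, 1, -1, 4},
// while a mask starting {1,2, ...} cannot be widened because the pair (1,2)
// does not begin on an even element boundary.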
3862
3864 const APInt &Zeroable,
3865 bool V2IsZero,
3866 SmallVectorImpl<int> &WidenedMask) {
3867 // Create an alternative mask with info about zeroable elements.
3868 // Here we do not set undef elements as zeroable.
3869 SmallVector<int, 64> ZeroableMask(Mask);
3870 if (V2IsZero) {
3871 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
3872 for (int i = 0, Size = Mask.size(); i != Size; ++i)
3873 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
3874 ZeroableMask[i] = SM_SentinelZero;
3875 }
3876 return canWidenShuffleElements(ZeroableMask, WidenedMask);
3877}
3878
3880 SmallVector<int, 32> WidenedMask;
3881 return canWidenShuffleElements(Mask, WidenedMask);
3882}
3883
3884// Attempt to narrow/widen shuffle mask until it matches the target number of
3885// elements.
3886static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
3887 SmallVectorImpl<int> &ScaledMask) {
3888 unsigned NumSrcElts = Mask.size();
3889 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
3890 "Illegal shuffle scale factor");
3891
3892 // Narrowing is guaranteed to work.
3893 if (NumDstElts >= NumSrcElts) {
3894 int Scale = NumDstElts / NumSrcElts;
3895 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
3896 return true;
3897 }
3898
3899 // We have to repeat the widening until we reach the target size, but we can
3900 // split out the first widening as it sets up ScaledMask for us.
3901 if (canWidenShuffleElements(Mask, ScaledMask)) {
3902 while (ScaledMask.size() > NumDstElts) {
3903 SmallVector<int, 16> WidenedMask;
3904 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
3905 return false;
3906 ScaledMask = std::move(WidenedMask);
3907 }
3908 return true;
3909 }
3910
3911 return false;
3912}
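// For example, scaling the v2i64 mask {1, 0} up to 4 elements yields
// {2, 3, 0, 1}; scaling down instead widens repeatedly, so it only succeeds
// when each group of adjacent lanes forms an aligned pair as checked in
// canWidenShuffleElements.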
3913
3914static bool canScaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts) {
3915 SmallVector<int, 32> ScaledMask;
3916 return scaleShuffleElements(Mask, NumDstElts, ScaledMask);
3917}
3918
3919/// Returns true if Elt is a constant zero or a floating point constant +0.0.
3921 return isNullConstant(Elt) || isNullFPConstant(Elt);
3922}
3923
3924// Build a vector of constants.
3925// Use an UNDEF node if MaskElt == -1.
3926// Split 64-bit constants in 32-bit mode.
3928 const SDLoc &dl, bool IsMask = false) {
3929
3931 bool Split = false;
3932
3933 MVT ConstVecVT = VT;
3934 unsigned NumElts = VT.getVectorNumElements();
3935 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3936 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3937 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3938 Split = true;
3939 }
3940
3941 MVT EltVT = ConstVecVT.getVectorElementType();
3942 for (unsigned i = 0; i < NumElts; ++i) {
3943 bool IsUndef = Values[i] < 0 && IsMask;
3944 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
3945 DAG.getConstant(Values[i], dl, EltVT);
3946 Ops.push_back(OpNode);
3947 if (Split)
3948 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
3949 DAG.getConstant(0, dl, EltVT));
3950 }
3951 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3952 if (Split)
3953 ConstsNode = DAG.getBitcast(VT, ConstsNode);
3954 return ConstsNode;
3955}
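// For example, building a v2i64 mask constant {3, -1} with IsMask == true on a
// 32-bit target (where i64 is not legal) emits the v4i32 build_vector
// {3, 0, undef, undef} and bitcasts it back to v2i64; the -1 element becomes
// undef because IsMask treats negative values as undef lanes.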
3956
3957static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs,
3958 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
3959 assert(Bits.size() == Undefs.getBitWidth() &&
3960 "Unequal constant and undef arrays");
3962 bool Split = false;
3963
3964 MVT ConstVecVT = VT;
3965 unsigned NumElts = VT.getVectorNumElements();
3966 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
3967 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
3968 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
3969 Split = true;
3970 }
3971
3972 MVT EltVT = ConstVecVT.getVectorElementType();
3973 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
3974 if (Undefs[i]) {
3975 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
3976 continue;
3977 }
3978 const APInt &V = Bits[i];
3979 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
3980 if (Split) {
3981 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
3982 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
3983 } else if (EltVT == MVT::f32) {
3985 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3986 } else if (EltVT == MVT::f64) {
3988 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
3989 } else {
3990 Ops.push_back(DAG.getConstant(V, dl, EltVT));
3991 }
3992 }
3993
3994 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
3995 return DAG.getBitcast(VT, ConstsNode);
3996}
3997
3999 SelectionDAG &DAG, const SDLoc &dl) {
4000 APInt Undefs = APInt::getZero(Bits.size());
4001 return getConstVector(Bits, Undefs, VT, DAG, dl);
4002}
4003
4004/// Returns a vector of specified type with all zero elements.
4005static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4006 SelectionDAG &DAG, const SDLoc &dl) {
4007 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4008 VT.getVectorElementType() == MVT::i1) &&
4009 "Unexpected vector type");
4010
4011 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4012 // type. This ensures they get CSE'd. But if the integer type is not
4013 // available, use a floating-point +0.0 instead.
4014 SDValue Vec;
4015 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4016 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4017 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4018 } else if (VT.isFloatingPoint() &&
4020 Vec = DAG.getConstantFP(+0.0, dl, VT);
4021 } else if (VT.getVectorElementType() == MVT::i1) {
4022 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4023 "Unexpected vector type");
4024 Vec = DAG.getConstant(0, dl, VT);
4025 } else {
4026 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4027 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4028 }
4029 return DAG.getBitcast(VT, Vec);
4030}
4031
4032// Helper to determine if the ops are all extracted subvectors that come from a
4033// single source. If we allow commute, they don't have to be in order (Lo/Hi).
4034static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
4035 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4036 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
4037 LHS.getValueType() != RHS.getValueType() ||
4038 LHS.getOperand(0) != RHS.getOperand(0))
4039 return SDValue();
4040
4041 SDValue Src = LHS.getOperand(0);
4042 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
4043 return SDValue();
4044
4045 unsigned NumElts = LHS.getValueType().getVectorNumElements();
4046 if ((LHS.getConstantOperandAPInt(1) == 0 &&
4047 RHS.getConstantOperandAPInt(1) == NumElts) ||
4048 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
4049 LHS.getConstantOperandAPInt(1) == NumElts))
4050 return Src;
4051
4052 return SDValue();
4053}
4054
4055static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4056 const SDLoc &dl, unsigned vectorWidth) {
4057 EVT VT = Vec.getValueType();
4058 EVT ElVT = VT.getVectorElementType();
4059 unsigned Factor = VT.getSizeInBits() / vectorWidth;
4060 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4061 VT.getVectorNumElements() / Factor);
4062
4063 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4064 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4065 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4066
4067 // This is the index of the first element of the vectorWidth-bit chunk
4068 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4069 IdxVal &= ~(ElemsPerChunk - 1);
4070
4071 // If the input is a buildvector just emit a smaller one.
4072 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4073 return DAG.getBuildVector(ResultVT, dl,
4074 Vec->ops().slice(IdxVal, ElemsPerChunk));
4075
4076 // Check if we're extracting the upper undef of a widening pattern.
4077 if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.getOperand(0).isUndef() &&
4078 Vec.getOperand(1).getValueType().getVectorNumElements() <= IdxVal &&
4079 isNullConstant(Vec.getOperand(2)))
4080 return DAG.getUNDEF(ResultVT);
4081
4082 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4083 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4084}
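// For example, extracting a 128-bit chunk from a v8i32 with IdxVal == 5:
// ElemsPerChunk is 4, so IdxVal is rounded down to 4 and (in the generic case)
// the node produced is (extract_subvector v8i32:Vec, 4), i.e. the upper v4i32
// half.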
4085
4086/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4087/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4088/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4089/// instructions or a simple subregister reference. Idx is an index in the
4090/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4091/// lowering EXTRACT_VECTOR_ELT operations easier.
4092static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4093 SelectionDAG &DAG, const SDLoc &dl) {
4095 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4096 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4097}
4098
4099/// Generate a DAG to grab 256-bits from a 512-bit vector.
4100static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4101 SelectionDAG &DAG, const SDLoc &dl) {
4102 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4103 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4104}
4105
4106static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4107 SelectionDAG &DAG, const SDLoc &dl,
4108 unsigned vectorWidth) {
4109 assert((vectorWidth == 128 || vectorWidth == 256) &&
4110 "Unsupported vector width");
4111 // Inserting an UNDEF subvector leaves Result unchanged.
4112 if (Vec.isUndef())
4113 return Result;
4114 EVT VT = Vec.getValueType();
4115 EVT ElVT = VT.getVectorElementType();
4116 EVT ResultVT = Result.getValueType();
4117
4118 // Insert the relevant vectorWidth bits.
4119 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4120 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4121
4122 // This is the index of the first element of the vectorWidth-bit chunk
4123 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
4124 IdxVal &= ~(ElemsPerChunk - 1);
4125
4126 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4127 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4128}
4129
4130/// Generate a DAG to put 128-bits into a vector > 128 bits. This
4131/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4132/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4133/// simple superregister reference. Idx is an index in the 128 bits
4134/// we want. It need not be aligned to a 128-bit boundary. That makes
4135/// lowering INSERT_VECTOR_ELT operations easier.
4136static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4137 SelectionDAG &DAG, const SDLoc &dl) {
4138 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4139 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4140}
4141
4142/// Widen a vector to a larger size with the same scalar type, with the new
4143/// elements either zero or undef.
4144static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
4145 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4146 const SDLoc &dl) {
4147 EVT VecVT = Vec.getValueType();
4149 VecVT.getScalarType() == VT.getScalarType() &&
4150 "Unsupported vector widening type");
4151 // If the upper 128-bits of a build vector are already undef/zero, then try to
4152 // widen from the lower 128-bits.
4153 if (Vec.getOpcode() == ISD::BUILD_VECTOR && VecVT.is256BitVector()) {
4154 unsigned NumSrcElts = VecVT.getVectorNumElements();
4155 ArrayRef<SDUse> Hi = Vec->ops().drop_front(NumSrcElts / 2);
4156 if (all_of(Hi, [&](SDValue V) {
4157 return V.isUndef() || (ZeroNewElements && X86::isZeroNode(V));
4158 }))
4159 Vec = extract128BitVector(Vec, 0, DAG, dl);
4160 }
4161 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
4162 : DAG.getUNDEF(VT);
4163 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
4164 DAG.getIntPtrConstant(0, dl));
4165}
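// For example, widening a v2i64 value to v4i64 with ZeroNewElements produces
// (insert_subvector (zero v4i64), Vec, 0); without zeroing, the base is simply
// an undef v4i64.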
4166
4167/// Widen a vector to a larger size with the same scalar type, with the new
4168/// elements either zero or undef.
4169static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
4170 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4171 const SDLoc &dl, unsigned WideSizeInBits) {
4172 assert(Vec.getValueSizeInBits() <= WideSizeInBits &&
4173 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
4174 "Unsupported vector widening type");
4175 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
4176 MVT SVT = Vec.getSimpleValueType().getScalarType();
4177 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
4178 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4179}
4180
4181/// Widen a mask vector type to a minimum of v8i1/v16i1 to allow use of KSHIFT
4182/// and bitcast with integer types.
4183static MVT widenMaskVectorType(MVT VT, const X86Subtarget &Subtarget) {
4184 assert(VT.getVectorElementType() == MVT::i1 && "Expected bool vector");
4185 unsigned NumElts = VT.getVectorNumElements();
4186 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
4187 return Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
4188 return VT;
4189}
4190
4191/// Widen a mask vector to a minimum of v8i1/v16i1 to allow use of KSHIFT and
4192/// bitcast with integer types.
4193static SDValue widenMaskVector(SDValue Vec, bool ZeroNewElements,
4194 const X86Subtarget &Subtarget, SelectionDAG &DAG,
4195 const SDLoc &dl) {
4196 MVT VT = widenMaskVectorType(Vec.getSimpleValueType(), Subtarget);
4197 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
4198}
4199
4200// Helper function to collect subvector ops that are concatenated together,
4201// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
4202// The subvectors in Ops are guaranteed to be the same type.
4204 SelectionDAG &DAG) {
4205 assert(Ops.empty() && "Expected an empty ops vector");
4206
4207 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
4208 Ops.append(N->op_begin(), N->op_end());
4209 return true;
4210 }
4211
4212 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
4213 SDValue Src = N->getOperand(0);
4214 SDValue Sub = N->getOperand(1);
4215 const APInt &Idx = N->getConstantOperandAPInt(2);
4216 EVT VT = Src.getValueType();
4217 EVT SubVT = Sub.getValueType();
4218
4219 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
4220 // insert_subvector(undef, x, lo)
4221 if (Idx == 0 && Src.isUndef()) {
4222 Ops.push_back(Sub);
4223 Ops.push_back(DAG.getUNDEF(SubVT));
4224 return true;
4225 }
4226 if (Idx == (VT.getVectorNumElements() / 2)) {
4227 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
4228 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
4229 Src.getOperand(1).getValueType() == SubVT &&
4230 isNullConstant(Src.getOperand(2))) {
4231 // Attempt to recurse into inner (matching) concats.
4232 SDValue Lo = Src.getOperand(1);
4233 SDValue Hi = Sub;
4234 SmallVector<SDValue, 2> LoOps, HiOps;
4235 if (collectConcatOps(Lo.getNode(), LoOps, DAG) &&
4236 collectConcatOps(Hi.getNode(), HiOps, DAG) &&
4237 LoOps.size() == HiOps.size()) {
4238 Ops.append(LoOps);
4239 Ops.append(HiOps);
4240 return true;
4241 }
4242 Ops.push_back(Lo);
4243 Ops.push_back(Hi);
4244 return true;
4245 }
4246 // insert_subvector(x, extract_subvector(x, lo), hi)
4247 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
4248 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
4249 Ops.append(2, Sub);
4250 return true;
4251 }
4252 // insert_subvector(undef, x, hi)
4253 if (Src.isUndef()) {
4254 Ops.push_back(DAG.getUNDEF(SubVT));
4255 Ops.push_back(Sub);
4256 return true;
4257 }
4258 }
4259 }
4260 }
4261
4262 return false;
4263}
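// For example, with a v8i32 value built from v4i32 subvectors (and X, Y not
// themselves concatenations):
//   insert_subvector(insert_subvector(undef, X, 0), Y, 4)  ->  Ops = {X, Y}
//   insert_subvector(undef, X, 0)                          ->  Ops = {X, undef}
//   insert_subvector(undef, Y, 4)                          ->  Ops = {undef, Y}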
4264
4265// Helper to check if \p V can be split into subvectors and the upper subvectors
4266// are all undef, in which case return the concatenation of the lower subvectors.
4268 SelectionDAG &DAG) {
4269 SmallVector<SDValue> SubOps;
4270 if (!collectConcatOps(V.getNode(), SubOps, DAG))
4271 return SDValue();
4272
4273 unsigned NumSubOps = SubOps.size();
4274 unsigned HalfNumSubOps = NumSubOps / 2;
4275 assert((NumSubOps % 2) == 0 && "Unexpected number of subvectors");
4276
4277 ArrayRef<SDValue> UpperOps(SubOps.begin() + HalfNumSubOps, SubOps.end());
4278 if (any_of(UpperOps, [](SDValue Op) { return !Op.isUndef(); }))
4279 return SDValue();
4280
4281 EVT HalfVT = V.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
4282 ArrayRef<SDValue> LowerOps(SubOps.begin(), SubOps.begin() + HalfNumSubOps);
4283 return DAG.getNode(ISD::CONCAT_VECTORS, DL, HalfVT, LowerOps);
4284}
4285
4286// Helper to check if we can access all the constituent subvectors without any
4287// extract ops.
4290 return collectConcatOps(N, Ops, DAG);
4291}
4292
4293static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
4294 const SDLoc &dl) {
4295 EVT VT = Op.getValueType();
4296 unsigned NumElems = VT.getVectorNumElements();
4297 unsigned SizeInBits = VT.getSizeInBits();
4298 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
4299 "Can't split odd sized vector");
4300
4301 // If this is a splat value (with no-undefs) then use the lower subvector,
4302 // which should be a free extraction.
4303 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
4304 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
4305 return std::make_pair(Lo, Lo);
4306
4307 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
4308 return std::make_pair(Lo, Hi);
4309}
4310
4311/// Break an operation into 2 half sized ops and then concatenate the results.
4312static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG, const SDLoc &dl) {
4313  unsigned NumOps = Op.getNumOperands();
4314 EVT VT = Op.getValueType();
4315
4316 // Extract the LHS Lo/Hi vectors
4317 SmallVector<SDValue> LoOps(NumOps, SDValue());
4318 SmallVector<SDValue> HiOps(NumOps, SDValue());
4319 for (unsigned I = 0; I != NumOps; ++I) {
4320 SDValue SrcOp = Op.getOperand(I);
4321 if (!SrcOp.getValueType().isVector()) {
4322 LoOps[I] = HiOps[I] = SrcOp;
4323 continue;
4324 }
4325 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
4326 }
4327
4328 EVT LoVT, HiVT;
4329 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
4330 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
4331 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
4332 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
4333}
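// Illustrative note (added commentary, not from the original source): splitting
// t = add X:v8i32, Y:v8i32 through splitVectorOp yields
//   concat_vectors(add(X.lo:v4i32, Y.lo:v4i32), add(X.hi:v4i32, Y.hi:v4i32)),
// while any scalar (non-vector) operands are passed unchanged to both halves.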
4334
4335/// Break a unary integer operation into 2 half sized ops and then
4336/// concatenate the result back.
4337static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG,
4338                                   const SDLoc &dl) {
4339 // Make sure we only try to split 256/512-bit types to avoid creating
4340 // narrow vectors.
4341 [[maybe_unused]] EVT VT = Op.getValueType();
4342 assert((Op.getOperand(0).getValueType().is256BitVector() ||
4343 Op.getOperand(0).getValueType().is512BitVector()) &&
4344 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4345 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
4346 VT.getVectorNumElements() &&
4347 "Unexpected VTs!");
4348 return splitVectorOp(Op, DAG, dl);
4349}
4350
4351/// Break a binary integer operation into 2 half sized ops and then
4352/// concatenate the result back.
4353static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG,
4354                                    const SDLoc &dl) {
4355 // Assert that all the types match.
4356 [[maybe_unused]] EVT VT = Op.getValueType();
4357 assert(Op.getOperand(0).getValueType() == VT &&
4358 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
4359 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
4360 return splitVectorOp(Op, DAG, dl);
4361}
4362
4363// Helper for splitting operands of an operation to legal target size and
4364// applying a function on each part.
4365// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
4366// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
4367// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
4368// The argument Builder is a function that will be applied on each split part:
4369// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
4370template <typename F>
4371SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4372                         const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
4373 F Builder, bool CheckBWI = true) {
4374 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
4375 unsigned NumSubs = 1;
4376 if ((CheckBWI && Subtarget.useBWIRegs()) ||
4377 (!CheckBWI && Subtarget.useAVX512Regs())) {
4378 if (VT.getSizeInBits() > 512) {
4379 NumSubs = VT.getSizeInBits() / 512;
4380 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
4381 }
4382 } else if (Subtarget.hasAVX2()) {
4383 if (VT.getSizeInBits() > 256) {
4384 NumSubs = VT.getSizeInBits() / 256;
4385 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
4386 }
4387 } else {
4388 if (VT.getSizeInBits() > 128) {
4389 NumSubs = VT.getSizeInBits() / 128;
4390 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
4391 }
4392 }
4393
4394 if (NumSubs == 1)
4395 return Builder(DAG, DL, Ops);
4396
4398 for (unsigned i = 0; i != NumSubs; ++i) {
4399    SmallVector<SDValue, 2> SubOps;
4400    for (SDValue Op : Ops) {
4401 EVT OpVT = Op.getValueType();
4402 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
4403 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
4404 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
4405 }
4406 Subs.push_back(Builder(DAG, DL, SubOps));
4407 }
4408 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
4409}
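// Usage sketch (illustrative only, not part of the original source; LHS, RHS and
// VT are assumed to be in scope): splitting a wide integer ADD down to the
// widest legal register width via SplitOpsAndApply:
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue Res =
//       SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS}, AddBuilder);
// On an AVX2-only target with VT == MVT::v16i32 this calls AddBuilder twice on
// v8i32 halves and concatenates the two results back into a v16i32.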
4410
4411// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
4412// targets.
4413static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
4414                             ArrayRef<SDValue> Ops, SelectionDAG &DAG,
4415                             const X86Subtarget &Subtarget) {
4416 assert(Subtarget.hasAVX512() && "AVX512 target expected");
4417 MVT SVT = VT.getScalarType();
4418
4419 // If we have a 32/64 splatted constant, splat it to DstTy to
4420 // encourage a foldable broadcast'd operand.
4421 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
4422 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
4423 // AVX512 broadcasts 32/64-bit operands.
4424 // TODO: Support float once getAVX512Node is used by fp-ops.
4425 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
4427 return SDValue();
4428 // If we're not widening, don't bother if we're not bitcasting.
4429 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
4430 return SDValue();
4431 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
4432 APInt SplatValue, SplatUndef;
4433 unsigned SplatBitSize;
4434 bool HasAnyUndefs;
4435 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
4436 HasAnyUndefs, OpEltSizeInBits) &&
4437 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
4438 return DAG.getConstant(SplatValue, DL, DstVT);
4439 }
4440 return SDValue();
4441 };
4442
4443 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
4444
4445 MVT DstVT = VT;
4446 if (Widen)
4447 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
4448
4449 // Canonicalize src operands.
4450 SmallVector<SDValue> SrcOps(Ops);
4451 for (SDValue &Op : SrcOps) {
4452 MVT OpVT = Op.getSimpleValueType();
4453 // Just pass through scalar operands.
4454 if (!OpVT.isVector())
4455 continue;
4456 assert(OpVT == VT && "Vector type mismatch");
4457
4458 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
4459 Op = BroadcastOp;
4460 continue;
4461 }
4462
4463 // Just widen the subvector by inserting into an undef wide vector.
4464 if (Widen)
4465 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
4466 }
4467
4468 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
4469
4470 // Perform the 512-bit op then extract the bottom subvector.
4471 if (Widen)
4472 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
4473 return Res;
4474}
4475
4476/// Insert i1-subvector to i1-vector.
4477static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4478                                const X86Subtarget &Subtarget) {
4479
4480 SDLoc dl(Op);
4481 SDValue Vec = Op.getOperand(0);
4482 SDValue SubVec = Op.getOperand(1);
4483 SDValue Idx = Op.getOperand(2);
4484 unsigned IdxVal = Op.getConstantOperandVal(2);
4485
4486 // Inserting undef is a nop. We can just return the original vector.
4487 if (SubVec.isUndef())
4488 return Vec;
4489
4490 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4491 return Op;
4492
4493 MVT OpVT = Op.getSimpleValueType();
4494 unsigned NumElems = OpVT.getVectorNumElements();
4495 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
4496
4497 // Extend to natively supported kshift.
4498 MVT WideOpVT = widenMaskVectorType(OpVT, Subtarget);
4499
4500 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
4501 // if necessary.
4502 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
4503 // May need to promote to a legal type.
4504 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4505 DAG.getConstant(0, dl, WideOpVT),
4506 SubVec, Idx);
4507 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4508 }
4509
4510 MVT SubVecVT = SubVec.getSimpleValueType();
4511 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4512 assert(IdxVal + SubVecNumElems <= NumElems &&
4513 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4514 "Unexpected index value in INSERT_SUBVECTOR");
4515
4516 SDValue Undef = DAG.getUNDEF(WideOpVT);
4517
4518 if (IdxVal == 0) {
4519 // Zero lower bits of the Vec
4520 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
4521 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
4522 ZeroIdx);
4523 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4524 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4525 // Merge them together, SubVec should be zero extended.
4526 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4527 DAG.getConstant(0, dl, WideOpVT),
4528 SubVec, ZeroIdx);
4529 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4530 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4531 }
4532
4533 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4534 Undef, SubVec, ZeroIdx);
4535
4536 if (Vec.isUndef()) {
4537 assert(IdxVal != 0 && "Unexpected index");
4538 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4539 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4540 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4541 }
4542
4543  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
4544    assert(IdxVal != 0 && "Unexpected index");
4545 // If upper elements of Vec are known undef, then just shift into place.
4546 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
4547 [](SDValue V) { return V.isUndef(); })) {
4548 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4549 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4550 } else {
4551 NumElems = WideOpVT.getVectorNumElements();
4552 unsigned ShiftLeft = NumElems - SubVecNumElems;
4553 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4554 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4555 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4556 if (ShiftRight != 0)
4557 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4558 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4559 }
4560 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4561 }
4562
4563  // Simple case when we put the subvector in the upper part.
4564 if (IdxVal + SubVecNumElems == NumElems) {
4565 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4566 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
4567 if (SubVecNumElems * 2 == NumElems) {
4568 // Special case, use legal zero extending insert_subvector. This allows
4569 // isel to optimize when bits are known zero.
4570 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
4571 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4572 DAG.getConstant(0, dl, WideOpVT),
4573 Vec, ZeroIdx);
4574 } else {
4575 // Otherwise use explicit shifts to zero the bits.
4576 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
4577 Undef, Vec, ZeroIdx);
4578 NumElems = WideOpVT.getVectorNumElements();
4579 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
4580 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
4581 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
4582 }
4583 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4584 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4585 }
4586
4587 // Inserting into the middle is more complicated.
4588
4589 NumElems = WideOpVT.getVectorNumElements();
4590
4591 // Widen the vector if needed.
4592 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
4593
4594 unsigned ShiftLeft = NumElems - SubVecNumElems;
4595 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
4596
4597 // Do an optimization for the most frequently used types.
4598 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
4599 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
4600 Mask0.flipAllBits();
4601 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
4602 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
4603 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
4604 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4605 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4606 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4607 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4608 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
4609
4610 // Reduce to original width if needed.
4611 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
4612 }
4613
4614 // Clear the upper bits of the subvector and move it to its insert position.
4615 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
4616 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
4617 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
4618 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
4619
4620 // Isolate the bits below the insertion point.
4621 unsigned LowShift = NumElems - IdxVal;
4622 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
4623 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4624 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
4625 DAG.getTargetConstant(LowShift, dl, MVT::i8));
4626
4627 // Isolate the bits after the last inserted bit.
4628 unsigned HighShift = IdxVal + SubVecNumElems;
4629 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
4630 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4631 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
4632 DAG.getTargetConstant(HighShift, dl, MVT::i8));
4633
4634 // Now OR all 3 pieces together.
4635 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
4636 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
4637
4638 // Reduce to original width if needed.
4639 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
4640}
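// Worked example (added commentary, not from the original source): inserting a
// v2i1 subvector into a v8i1 vector at IdxVal == 2, assuming the widened mask
// type stays v8i1 (e.g. with AVX512DQ), takes the "middle" path above:
// ShiftLeft = 6 and ShiftRight = 4, so the subvector is KSHIFTL'd by 6 and then
// KSHIFTR'd by 4 to land its two bits at positions 2-3 with zeros elsewhere,
// the original vector is ANDed with the inverted mask 0xF3 to clear bits 2-3,
// and the two values are ORed together.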
4641
4642static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
4643                                const SDLoc &dl) {
4644 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
4645 EVT SubVT = V1.getValueType();
4646 EVT SubSVT = SubVT.getScalarType();
4647 unsigned SubNumElts = SubVT.getVectorNumElements();
4648 unsigned SubVectorWidth = SubVT.getSizeInBits();
4649 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
4650 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
4651 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
4652}
4653
4654/// Returns a vector of specified type with all bits set.
4655/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
4656/// Then bitcast to their original type, ensuring they get CSE'd.
4657static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4658 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
4659 "Expected a 128/256/512-bit vector type");
4660 unsigned NumElts = VT.getSizeInBits() / 32;
4661 SDValue Vec = DAG.getAllOnesConstant(dl, MVT::getVectorVT(MVT::i32, NumElts));
4662 return DAG.getBitcast(VT, Vec);
4663}
4664
4665static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
4666 SDValue In, SelectionDAG &DAG) {
4667 EVT InVT = In.getValueType();
4668 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
4669 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
4670 ISD::ZERO_EXTEND == Opcode) &&
4671 "Unknown extension opcode");
4672
4673 // For 256-bit vectors, we only need the lower (128-bit) input half.
4674 // For 512-bit vectors, we only need the lower input half or quarter.
4675 if (InVT.getSizeInBits() > 128) {
4676 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
4677 "Expected VTs to be the same size!");
4678 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
4679 In = extractSubVector(In, 0, DAG, DL,
4680 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
4681 InVT = In.getValueType();
4682 }
4683
4684 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
4685 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
4686
4687 return DAG.getNode(Opcode, DL, VT, In);
4688}
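// Illustrative note (added commentary, not from the original source): for
// VT == v8i32 and a v32i8 input, only the low 128 bits (v16i8) of the input are
// kept, and because the element counts then differ (8 vs 16) the extension is
// emitted as the corresponding *_EXTEND_VECTOR_INREG opcode instead of a plain
// ISD::ZERO/SIGN/ANY_EXTEND.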
4689
4690// Create OR(AND(LHS,MASK),AND(RHS,~MASK)) bit select pattern
4691static SDValue getBitSelect(const SDLoc &DL, MVT VT, SDValue LHS, SDValue RHS,
4692 SDValue Mask, SelectionDAG &DAG) {
4693 LHS = DAG.getNode(ISD::AND, DL, VT, LHS, Mask);
4694 RHS = DAG.getNode(X86ISD::ANDNP, DL, VT, Mask, RHS);
4695 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
4696}
4697
4698static void createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
4699                                    bool Lo, bool Unary) {
4700 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
4701 "Illegal vector type to unpack");
4702 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4703 int NumElts = VT.getVectorNumElements();
4704 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
4705 for (int i = 0; i < NumElts; ++i) {
4706 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
4707 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
4708 Pos += (Unary ? 0 : NumElts * (i % 2));
4709 Pos += (Lo ? 0 : NumEltsInLane / 2);
4710 Mask.push_back(Pos);
4711 }
4712}
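// Illustrative note (added commentary, not from the original source): for a
// v8i16 type this produces <0,8,1,9,2,10,3,11> for (Lo, binary) - the PUNPCKLWD
// interleave of the two inputs - and <0,0,1,1,2,2,3,3> for (Lo, unary).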
4713
4714/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
4715/// imposed by AVX and specific to the unary pattern. Example:
4716/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
4717/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
4718static void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
4719                                    bool Lo) {
4720 assert(Mask.empty() && "Expected an empty shuffle mask vector");
4721 int NumElts = VT.getVectorNumElements();
4722 for (int i = 0; i < NumElts; ++i) {
4723 int Pos = i / 2;
4724 Pos += (Lo ? 0 : NumElts / 2);
4725 Mask.push_back(Pos);
4726 }
4727}
4728
4729// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
4730static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
4731 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
4732  if (ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) &&
4733      (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
4734 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
4735 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
4736 int M = Mask[I];
4737 if (M < 0)
4738 continue;
4739 SDValue V = (M < NumElts) ? V1 : V2;
4740 if (V.isUndef())
4741 continue;
4742 Ops[I] = V.getOperand(M % NumElts);
4743 }
4744 return DAG.getBuildVector(VT, dl, Ops);
4745 }
4746
4747 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4748}
4749
4750/// Returns a vector_shuffle node for an unpackl operation.
4751static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4752 SDValue V1, SDValue V2) {
4753  SmallVector<int, 8> Mask;
4754  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
4755 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4756}
4757
4758/// Returns a vector_shuffle node for an unpackh operation.
4759static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
4760 SDValue V1, SDValue V2) {
4761  SmallVector<int, 8> Mask;
4762  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
4763 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
4764}
4765
4766/// Returns a node that packs the LHS + RHS nodes together at half width.
4767/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
4768/// TODO: Add subvector splitting if/when we have a need for it.
4769static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
4770 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
4771 bool PackHiHalf = false) {
4772 MVT OpVT = LHS.getSimpleValueType();
4773 unsigned EltSizeInBits = VT.getScalarSizeInBits();
4774 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
4775 assert(OpVT == RHS.getSimpleValueType() &&
4776 VT.getSizeInBits() == OpVT.getSizeInBits() &&
4777 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
4778 "Unexpected PACK operand types");
4779 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
4780 "Unexpected PACK result type");
4781
4782 // Rely on vector shuffles for vXi64 -> vXi32 packing.
4783 if (EltSizeInBits == 32) {
4784 SmallVector<int> PackMask;
4785 int Offset = PackHiHalf ? 1 : 0;
4786 int NumElts = VT.getVectorNumElements();
4787 for (int I = 0; I != NumElts; I += 4) {
4788 PackMask.push_back(I + Offset);
4789 PackMask.push_back(I + Offset + 2);
4790 PackMask.push_back(I + Offset + NumElts);
4791 PackMask.push_back(I + Offset + NumElts + 2);
4792 }
4793 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
4794 DAG.getBitcast(VT, RHS), PackMask);
4795 }
4796
4797 // See if we already have sufficient leading bits for PACKSS/PACKUS.
4798 if (!PackHiHalf) {
4799 if (UsePackUS &&
4800 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
4801 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
4802 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4803
4804 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
4805 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
4806 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4807 }
4808
4809 // Fallback to sign/zero extending the requested half and pack.
4810 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
4811 if (UsePackUS) {
4812 if (PackHiHalf) {
4813 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
4814 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
4815 } else {
4816 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
4817 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
4818 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
4819 };
4820 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
4821 };
4822
4823 if (!PackHiHalf) {
4824 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
4825 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
4826 }
4827 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
4828 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
4829 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
4830}
4831
4832/// Return a vector_shuffle of the specified vector and a zero or undef vector.
4833/// This produces a shuffle where the low element of V2 is swizzled into the
4834/// zero/undef vector, landing at element Idx.
4835/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
4836static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
4837                                           bool IsZero,
4838 const X86Subtarget &Subtarget,
4839 SelectionDAG &DAG) {
4840 MVT VT = V2.getSimpleValueType();
4841 SDValue V1 = IsZero
4842 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
4843 int NumElems = VT.getVectorNumElements();
4844 SmallVector<int, 16> MaskVec(NumElems);
4845 for (int i = 0; i != NumElems; ++i)
4846 // If this is the insertion idx, put the low elt of V2 here.
4847 MaskVec[i] = (i == Idx) ? NumElems : i;
4848 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
4849}
4850
4851static ConstantPoolSDNode *getTargetConstantPoolFromBasePtr(SDValue Ptr) {
4852  if (Ptr.getOpcode() == X86ISD::Wrapper ||
4853 Ptr.getOpcode() == X86ISD::WrapperRIP)
4854 Ptr = Ptr.getOperand(0);
4855 return dyn_cast<ConstantPoolSDNode>(Ptr);
4856}
4857
4858// TODO: Add support for non-zero offsets.
4859static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
4860  ConstantPoolSDNode *CNode = getTargetConstantPoolFromBasePtr(Ptr);
4861  if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
4862 return nullptr;
4863 return CNode->getConstVal();
4864}
4865
4866static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
4867  if (!Load || !ISD::isNormalLoad(Load))
4868 return nullptr;
4869 return getTargetConstantFromBasePtr(Load->getBasePtr());
4870}
4871
4872static const Constant *getTargetConstantFromNode(SDValue Op) {
4873  Op = peekThroughBitcasts(Op);
4874  return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
4875}
4876
4877const Constant *
4878X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
4879  assert(LD && "Unexpected null LoadSDNode");
4880 return getTargetConstantFromNode(LD);
4881}
4882
4883// Extract raw constant bits from constant pools.
4884static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
4885 APInt &UndefElts,
4886 SmallVectorImpl<APInt> &EltBits,
4887 bool AllowWholeUndefs = true,
4888 bool AllowPartialUndefs = false) {
4889 assert(EltBits.empty() && "Expected an empty EltBits vector");
4890
4891  Op = peekThroughBitcasts(Op);
4892
4893 EVT VT = Op.getValueType();
4894 unsigned SizeInBits = VT.getSizeInBits();
4895 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
4896 unsigned NumElts = SizeInBits / EltSizeInBits;
4897
4898 // Bitcast a source array of element bits to the target size.
4899 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
4900 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
4901 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
4902 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
4903 "Constant bit sizes don't match");
4904
4905 // Don't split if we don't allow undef bits.
4906 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
4907 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
4908 return false;
4909
4910 // If we're already the right size, don't bother bitcasting.
4911 if (NumSrcElts == NumElts) {
4912 UndefElts = UndefSrcElts;
4913 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
4914 return true;
4915 }
4916
4917 // Extract all the undef/constant element data and pack into single bitsets.
4918 APInt UndefBits(SizeInBits, 0);
4919 APInt MaskBits(SizeInBits, 0);
4920
4921 for (unsigned i = 0; i != NumSrcElts; ++i) {
4922 unsigned BitOffset = i * SrcEltSizeInBits;
4923 if (UndefSrcElts[i])
4924 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
4925 MaskBits.insertBits(SrcEltBits[i], BitOffset);
4926 }
4927
4928 // Split the undef/constant single bitset data into the target elements.
4929 UndefElts = APInt(NumElts, 0);
4930 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
4931
4932 for (unsigned i = 0; i != NumElts; ++i) {
4933 unsigned BitOffset = i * EltSizeInBits;
4934 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
4935
4936 // Only treat an element as UNDEF if all bits are UNDEF.
4937 if (UndefEltBits.isAllOnes()) {
4938 if (!AllowWholeUndefs)
4939 return false;
4940 UndefElts.setBit(i);
4941 continue;
4942 }
4943
4944 // If only some bits are UNDEF then treat them as zero (or bail if not
4945 // supported).
4946 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
4947 return false;
4948
4949 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
4950 }
4951 return true;
4952 };
4953
4954 // Collect constant bits and insert into mask/undef bit masks.
4955 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
4956 unsigned UndefBitIndex) {
4957 if (!Cst)
4958 return false;
4959 if (isa<UndefValue>(Cst)) {
4960 Undefs.setBit(UndefBitIndex);
4961 return true;
4962 }
4963 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
4964 Mask = CInt->getValue();
4965 return true;
4966 }
4967 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
4968 Mask = CFP->getValueAPF().bitcastToAPInt();
4969 return true;
4970 }
4971 if (auto *CDS = dyn_cast<ConstantDataSequential>(Cst)) {
4972 Type *Ty = CDS->getType();
4973      Mask = APInt::getZero(Ty->getPrimitiveSizeInBits());
4974      Type *EltTy = CDS->getElementType();
4975 bool IsInteger = EltTy->isIntegerTy();
4976 bool IsFP =
4977 EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy();
4978 if (!IsInteger && !IsFP)
4979 return false;
4980 unsigned EltBits = EltTy->getPrimitiveSizeInBits();
4981 for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
4982 if (IsInteger)
4983 Mask.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
4984 else
4985 Mask.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
4986 I * EltBits);
4987 return true;
4988 }
4989 return false;
4990 };
4991
4992 // Handle UNDEFs.
4993 if (Op.isUndef()) {
4994 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
4995 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
4996 return CastBitData(UndefSrcElts, SrcEltBits);
4997 }
4998
4999 // Extract scalar constant bits.
5000 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5001 APInt UndefSrcElts = APInt::getZero(1);
5002 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5003 return CastBitData(UndefSrcElts, SrcEltBits);
5004 }
5005 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5006 APInt UndefSrcElts = APInt::getZero(1);
5007 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5008 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5009 return CastBitData(UndefSrcElts, SrcEltBits);
5010 }
5011
5012 // Extract constant bits from build vector.
5013 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5014 BitVector Undefs;
5015 SmallVector<APInt> SrcEltBits;
5016 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5017 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
5018 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
5019 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
5020 if (Undefs[I])
5021 UndefSrcElts.setBit(I);
5022 return CastBitData(UndefSrcElts, SrcEltBits);
5023 }
5024 }
5025
5026 // Extract constant bits from constant pool vector.
5027 if (auto *Cst = getTargetConstantFromNode(Op)) {
5028 Type *CstTy = Cst->getType();
5029 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5030 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
5031 return false;
5032
5033 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5034 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5035 if ((SizeInBits % SrcEltSizeInBits) != 0)
5036 return false;
5037
5038 APInt UndefSrcElts(NumSrcElts, 0);
5039 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5040 for (unsigned i = 0; i != NumSrcElts; ++i)
5041 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5042 UndefSrcElts, i))
5043 return false;
5044
5045 return CastBitData(UndefSrcElts, SrcEltBits);
5046 }
5047
5048 // Extract constant bits from a broadcasted constant pool scalar.
5049 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
5050 EltSizeInBits <= VT.getScalarSizeInBits()) {
5051 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5052 if (MemIntr->getMemoryVT().getStoreSizeInBits() != VT.getScalarSizeInBits())
5053 return false;
5054
5055 SDValue Ptr = MemIntr->getBasePtr();
5056    if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
5057      unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5058 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5059
5060 APInt UndefSrcElts(NumSrcElts, 0);
5061 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5062 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
5063 if (UndefSrcElts[0])
5064 UndefSrcElts.setBits(0, NumSrcElts);
5065 if (SrcEltBits[0].getBitWidth() != SrcEltSizeInBits)
5066 SrcEltBits[0] = SrcEltBits[0].trunc(SrcEltSizeInBits);
5067 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5068 return CastBitData(UndefSrcElts, SrcEltBits);
5069 }
5070 }
5071 }
5072
5073 // Extract constant bits from a subvector broadcast.
5074 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
5075 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
5076 SDValue Ptr = MemIntr->getBasePtr();
5077 // The source constant may be larger than the subvector broadcast,
5078 // ensure we extract the correct subvector constants.
5079 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
5080 Type *CstTy = Cst->getType();
5081 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
5082 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
5083 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
5084 (SizeInBits % SubVecSizeInBits) != 0)
5085 return false;
5086 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5087 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
5088 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
5089 APInt UndefSubElts(NumSubElts, 0);
5090 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
5091 APInt(CstEltSizeInBits, 0));
5092 for (unsigned i = 0; i != NumSubElts; ++i) {
5093 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
5094 UndefSubElts, i))
5095 return false;
5096 for (unsigned j = 1; j != NumSubVecs; ++j)
5097 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
5098 }
5099 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
5100 UndefSubElts);
5101 return CastBitData(UndefSubElts, SubEltBits);
5102 }
5103 }
5104
5105 // Extract a rematerialized scalar constant insertion.
5106 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5107 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5108 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5109 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5110 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5111
5112 APInt UndefSrcElts(NumSrcElts, 0);
5113 SmallVector<APInt, 64> SrcEltBits;
5114 const APInt &C = Op.getOperand(0).getConstantOperandAPInt(0);
5115 SrcEltBits.push_back(C.zextOrTrunc(SrcEltSizeInBits));
5116 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5117 return CastBitData(UndefSrcElts, SrcEltBits);
5118 }
5119
5120 // Insert constant bits from a base and sub vector sources.
5121 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
5122    // If we bitcast to larger elements we might lose track of undefs, so don't
5123    // allow any, to be safe.
5124 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5125 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
5126
5127 APInt UndefSrcElts, UndefSubElts;
5128 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
5129 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
5130 UndefSubElts, EltSubBits,
5131 AllowWholeUndefs && AllowUndefs,
5132 AllowPartialUndefs && AllowUndefs) &&
5133 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
5134 UndefSrcElts, EltSrcBits,
5135 AllowWholeUndefs && AllowUndefs,
5136 AllowPartialUndefs && AllowUndefs)) {
5137 unsigned BaseIdx = Op.getConstantOperandVal(2);
5138 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
5139 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
5140 EltSrcBits[BaseIdx + i] = EltSubBits[i];
5141 return CastBitData(UndefSrcElts, EltSrcBits);
5142 }
5143 }
5144
5145 // Extract constant bits from a subvector's source.
5146 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
5147 // TODO - support extract_subvector through bitcasts.
5148 if (EltSizeInBits != VT.getScalarSizeInBits())
5149 return false;
5150
5151 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5152 UndefElts, EltBits, AllowWholeUndefs,
5153 AllowPartialUndefs)) {
5154 EVT SrcVT = Op.getOperand(0).getValueType();
5155 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5156 unsigned NumSubElts = VT.getVectorNumElements();
5157 unsigned BaseIdx = Op.getConstantOperandVal(1);
5158 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
5159 if ((BaseIdx + NumSubElts) != NumSrcElts)
5160 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
5161 if (BaseIdx != 0)
5162 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
5163 return true;
5164 }
5165 }
5166
5167 // Extract constant bits from shuffle node sources.
5168 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
5169 // TODO - support shuffle through bitcasts.
5170 if (EltSizeInBits != VT.getScalarSizeInBits())
5171 return false;
5172
5173 ArrayRef<int> Mask = SVN->getMask();
5174 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
5175 llvm::any_of(Mask, [](int M) { return M < 0; }))
5176 return false;
5177
5178 APInt UndefElts0, UndefElts1;
5179 SmallVector<APInt, 32> EltBits0, EltBits1;
5180 if (isAnyInRange(Mask, 0, NumElts) &&
5181 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
5182 UndefElts0, EltBits0, AllowWholeUndefs,
5183 AllowPartialUndefs))
5184 return false;
5185 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
5186 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
5187 UndefElts1, EltBits1, AllowWholeUndefs,
5188 AllowPartialUndefs))
5189 return false;
5190
5191 UndefElts = APInt::getZero(NumElts);
5192 for (int i = 0; i != (int)NumElts; ++i) {
5193 int M = Mask[i];
5194 if (M < 0) {
5195 UndefElts.setBit(i);
5196 EltBits.push_back(APInt::getZero(EltSizeInBits));
5197 } else if (M < (int)NumElts) {
5198 if (UndefElts0[M])
5199 UndefElts.setBit(i);
5200 EltBits.push_back(EltBits0[M]);
5201 } else {
5202 if (UndefElts1[M - NumElts])
5203 UndefElts.setBit(i);
5204 EltBits.push_back(EltBits1[M - NumElts]);
5205 }
5206 }
5207 return true;
5208 }
5209
5210 return false;
5211}
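// Illustrative note (added commentary, not from the original source): a v2i64
// build vector of constants queried with EltSizeInBits == 32 is repacked by
// CastBitData into four 32-bit APInts (the low 32 bits of source element 0
// become element 0, and so on), and a result element lands in UndefElts only if
// every source bit covering it was undef; partially-undef elements either fail
// the query or are treated as zero, depending on AllowPartialUndefs.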
5212
5213namespace llvm {
5214namespace X86 {
5215bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
5216 APInt UndefElts;
5217 SmallVector<APInt, 16> EltBits;
5218  if (getTargetConstantBitsFromNode(
5219          Op, Op.getScalarValueSizeInBits(), UndefElts, EltBits,
5220 /*AllowWholeUndefs*/ true, AllowPartialUndefs)) {
5221 int SplatIndex = -1;
5222 for (int i = 0, e = EltBits.size(); i != e; ++i) {
5223 if (UndefElts[i])
5224 continue;
5225 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
5226 SplatIndex = -1;
5227 break;
5228 }
5229 SplatIndex = i;
5230 }
5231 if (0 <= SplatIndex) {
5232 SplatVal = EltBits[SplatIndex];
5233 return true;
5234 }
5235 }
5236
5237 return false;
5238}
5239} // namespace X86
5240} // namespace llvm
5241
5242static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5243                                        unsigned MaskEltSizeInBits,
5244                                        SmallVectorImpl<uint64_t> &RawMask,
5245                                        APInt &UndefElts) {
5246 // Extract the raw target constant bits.
5247 SmallVector<APInt, 64> EltBits;
5248 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5249 EltBits, /* AllowWholeUndefs */ true,
5250 /* AllowPartialUndefs */ false))
5251 return false;
5252
5253 // Insert the extracted elements into the mask.
5254 for (const APInt &Elt : EltBits)
5255 RawMask.push_back(Elt.getZExtValue());
5256
5257 return true;
5258}
5259
5260static bool isConstantPowerOf2(SDValue V, unsigned EltSizeInBits,
5261 bool AllowUndefs) {
5262 APInt UndefElts;
5263 SmallVector<APInt, 64> EltBits;
5264  if (!getTargetConstantBitsFromNode(V, EltSizeInBits, UndefElts, EltBits,
5265 /*AllowWholeUndefs*/ AllowUndefs,
5266 /*AllowPartialUndefs*/ false))
5267 return false;
5268
5269 bool IsPow2OrUndef = true;
5270 for (unsigned I = 0, E = EltBits.size(); I != E; ++I)
5271 IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2();
5272 return IsPow2OrUndef;
5273}
5274
5275// Helper to attempt to return a cheaper, bit-inverted version of \p V.
5276static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
5277  // TODO: don't always ignore oneuse constraints.
5278 V = peekThroughBitcasts(V);
5279 EVT VT = V.getValueType();
5280
5281 // Match not(xor X, -1) -> X.
5282 if (V.getOpcode() == ISD::XOR &&
5283 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
5284 isAllOnesConstant(V.getOperand(1))))
5285 return V.getOperand(0);
5286
5287 // Match not(extract_subvector(not(X)) -> extract_subvector(X).
5288 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5289 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
5290 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
5291 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
5292 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), VT, Not,
5293 V.getOperand(1));
5294 }
5295 }
5296
5297 // Match not(pcmpgt(C, X)) -> pcmpgt(X, C - 1).
5298 if (V.getOpcode() == X86ISD::PCMPGT &&
5299 !ISD::isBuildVectorAllZeros(V.getOperand(0).getNode()) &&
5300 !ISD::isBuildVectorAllOnes(V.getOperand(0).getNode()) &&
5301 V.getOperand(0).hasOneUse()) {
5302 APInt UndefElts;
5303 SmallVector<APInt> EltBits;
5304 if (getTargetConstantBitsFromNode(V.getOperand(0),
5305 V.getScalarValueSizeInBits(), UndefElts,
5306 EltBits) &&
5307 !ISD::isBuildVectorOfConstantSDNodes(V.getOperand(1).getNode())) {
5308 // Don't fold min_signed_value -> (min_signed_value - 1)
5309 bool MinSigned = false;
5310 for (APInt &Elt : EltBits) {
5311 MinSigned |= Elt.isMinSignedValue();
5312 Elt -= 1;
5313 }
5314 if (!MinSigned) {
5315 SDLoc DL(V);
5316 MVT VT = V.getSimpleValueType();
5317 return DAG.getNode(X86ISD::PCMPGT, DL, VT, V.getOperand(1),
5318 getConstVector(EltBits, UndefElts, VT, DAG, DL));
5319 }
5320 }
5321 }
5322
5323 // Match not(concat_vectors(not(X), not(Y))) -> concat_vectors(X, Y).
5324  SmallVector<SDValue, 2> CatOps;
5325  if (collectConcatOps(V.getNode(), CatOps, DAG)) {
5326 for (SDValue &CatOp : CatOps) {
5327 SDValue NotCat = IsNOT(CatOp, DAG);
5328 if (!NotCat)
5329 return SDValue();
5330 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
5331 }
5332 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), VT, CatOps);
5333 }
5334
5335 // Match not(or(not(X),not(Y))) -> and(X, Y).
5336 if (V.getOpcode() == ISD::OR && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5337 V.getOperand(0).hasOneUse() && V.getOperand(1).hasOneUse()) {
5338 // TODO: Handle cases with single NOT operand -> ANDNP
5339 if (SDValue Op1 = IsNOT(V.getOperand(1), DAG))
5340 if (SDValue Op0 = IsNOT(V.getOperand(0), DAG))
5341 return DAG.getNode(ISD::AND, SDLoc(V), VT, DAG.getBitcast(VT, Op0),
5342 DAG.getBitcast(VT, Op1));
5343 }
5344
5345 return SDValue();
5346}
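// Illustrative note (added commentary, not from the original source):
// IsNOT(xor X, -1) returns X directly, and IsNOT(pcmpgt(C, X)) with a constant
// first operand (subject to the one-use and non-all-zeros/all-ones checks above)
// returns pcmpgt(X, C - 1), relying on the signed identity
// ~(C > X) == (X > C - 1), which holds as long as no element of C is the
// minimum signed value.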
5347
5348/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5349/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
5350/// Note: This ignores saturation, so inputs must be checked first.
5351static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5352                                  bool Unary, unsigned NumStages = 1) {
5353 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5354 unsigned NumElts = VT.getVectorNumElements();
5355 unsigned NumLanes = VT.getSizeInBits() / 128;
5356 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5357 unsigned Offset = Unary ? 0 : NumElts;
5358 unsigned Repetitions = 1u << (NumStages - 1);
5359 unsigned Increment = 1u << NumStages;
5360 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
5361
5362 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5363 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
5364 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5365 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5366 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
5367 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5368 }
5369 }
5370}
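// Illustrative note (added commentary, not from the original source): for a
// v16i8 type with NumStages == 1 the binary mask is
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, i.e. the low byte of every i16
// element of the first operand followed by those of the second, which is the
// shuffle equivalent of a single PACKUSWB/PACKSSWB truncation.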
5371
5372// Split the demanded elts of a PACKSS/PACKUS node between its operands.
5373static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
5374 APInt &DemandedLHS, APInt &DemandedRHS) {
5375 int NumLanes = VT.getSizeInBits() / 128;
5376 int NumElts = DemandedElts.getBitWidth();
5377 int NumInnerElts = NumElts / 2;
5378 int NumEltsPerLane = NumElts / NumLanes;
5379 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
5380
5381 DemandedLHS = APInt::getZero(NumInnerElts);
5382 DemandedRHS = APInt::getZero(NumInnerElts);
5383
5384 // Map DemandedElts to the packed operands.
5385 for (int Lane = 0; Lane != NumLanes; ++Lane) {
5386 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
5387 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
5388 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
5389 if (DemandedElts[OuterIdx])
5390 DemandedLHS.setBit(InnerIdx);
5391 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
5392 DemandedRHS.setBit(InnerIdx);
5393 }
5394 }
5395}
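// Illustrative note (added commentary, not from the original source): for a
// v16i8 PACKSS/PACKUS result, demanding result elements 0-7 maps to demanding
// v8i16 elements 0-7 of the LHS and nothing from the RHS; for wider types the
// same mapping is applied independently within each 128-bit lane.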
5396
5397// Split the demanded elts of a HADD/HSUB node between its operands.
5398static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
5399 APInt &DemandedLHS, APInt &DemandedRHS) {
5401 DemandedLHS, DemandedRHS);
5402 DemandedLHS |= DemandedLHS << 1;
5403 DemandedRHS |= DemandedRHS << 1;
5404}
5405
5406/// Calculates the shuffle mask corresponding to the target-specific opcode.
5407/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5408/// operands in \p Ops, and returns true.
5409/// Sets \p IsUnary to true if only one source is used. Note that this will set
5410/// IsUnary for shuffles which use a single input multiple times, and in those
5411/// cases it will adjust the mask to only have indices within that single input.
5412/// It is an error to call this with non-empty Mask/Ops vectors.
5413static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5414                                 SmallVectorImpl<SDValue> &Ops,
5415                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5416 if (!isTargetShuffle(N.getOpcode()))
5417 return false;
5418
5419 MVT VT = N.getSimpleValueType();
5420 unsigned NumElems = VT.getVectorNumElements();
5421 unsigned MaskEltSize = VT.getScalarSizeInBits();
5422  SmallVector<uint64_t, 32> RawMask;
5423  APInt RawUndefs;
5424 uint64_t ImmN;
5425
5426 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5427 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5428
5429 IsUnary = false;
5430 bool IsFakeUnary = false;
5431 switch (N.getOpcode()) {
5432 case X86ISD::BLENDI:
5433 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5434 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5435 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5436 DecodeBLENDMask(NumElems, ImmN, Mask);
5437 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5438 break;
5439 case X86ISD::SHUFP:
5440 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5441 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5442 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5443 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
5444 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5445 break;
5446 case X86ISD::INSERTPS:
5447 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5448 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5449 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5450 DecodeINSERTPSMask(ImmN, Mask, /*SrcIsMem=*/false);
5451 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5452 break;
5453 case X86ISD::EXTRQI:
5454 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5455 if (isa<ConstantSDNode>(N.getOperand(1)) &&
5456 isa<ConstantSDNode>(N.getOperand(2))) {
5457 int BitLen = N.getConstantOperandVal(1);
5458 int BitIdx = N.getConstantOperandVal(2);
5459 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5460 IsUnary = true;
5461 }
5462 break;
5463 case X86ISD::INSERTQI:
5464 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5465 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5466 if (isa<ConstantSDNode>(N.getOperand(2)) &&
5467 isa<ConstantSDNode>(N.getOperand(3))) {
5468 int BitLen = N.getConstantOperandVal(2);
5469 int BitIdx = N.getConstantOperandVal(3);
5470 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
5471 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5472 }
5473 break;
5474 case X86ISD::UNPCKH:
5475 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5476 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5477 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
5478 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5479 break;
5480 case X86ISD::UNPCKL:
5481 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5482 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5483 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
5484 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5485 break;
5486 case X86ISD::MOVHLPS:
5487 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5488 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5489 DecodeMOVHLPSMask(NumElems, Mask);
5490 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5491 break;
5492 case X86ISD::MOVLHPS:
5493 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5494 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5495 DecodeMOVLHPSMask(NumElems, Mask);
5496 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5497 break;
5498 case X86ISD::VALIGN:
5499 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
5500 "Only 32-bit and 64-bit elements are supported!");
5501 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5502 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5503 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5504 DecodeVALIGNMask(NumElems, ImmN, Mask);
5505 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5506 Ops.push_back(N.getOperand(1));
5507 Ops.push_back(N.getOperand(0));
5508 break;
5509 case X86ISD::PALIGNR:
5510 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5511 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5512 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5513 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5514 DecodePALIGNRMask(NumElems, ImmN, Mask);
5515 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5516 Ops.push_back(N.getOperand(1));
5517 Ops.push_back(N.getOperand(0));
5518 break;
5519 case X86ISD::VSHLDQ:
5520 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5521 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5522 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5523 DecodePSLLDQMask(NumElems, ImmN, Mask);
5524 IsUnary = true;
5525 break;
5526 case X86ISD::VSRLDQ:
5527 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5528 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5529 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5530 DecodePSRLDQMask(NumElems, ImmN, Mask);
5531 IsUnary = true;
5532 break;
5533 case X86ISD::PSHUFD:
5534 case X86ISD::VPERMILPI:
5535 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5536 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5537 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
5538 IsUnary = true;
5539 break;
5540 case X86ISD::PSHUFHW:
5541 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5542 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5543 DecodePSHUFHWMask(NumElems, ImmN, Mask);
5544 IsUnary = true;
5545 break;
5546 case X86ISD::PSHUFLW:
5547 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5548 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5549 DecodePSHUFLWMask(NumElems, ImmN, Mask);
5550 IsUnary = true;
5551 break;
5552 case X86ISD::VZEXT_MOVL:
5553 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5554 DecodeZeroMoveLowMask(NumElems, Mask);
5555 IsUnary = true;
5556 break;
5557 case X86ISD::VBROADCAST:
5558    // We only decode broadcasts of same-sized vectors; peeking through to
5559 // extracted subvectors is likely to cause hasOneUse issues with
5560 // SimplifyDemandedBits etc.
5561 if (N.getOperand(0).getValueType() == VT) {
5562 DecodeVectorBroadcast(NumElems, Mask);
5563 IsUnary = true;
5564 break;
5565 }
5566 return false;
5567 case X86ISD::VPERMILPV: {
5568 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5569 IsUnary = true;
5570 SDValue MaskNode = N.getOperand(1);
5571 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5572 RawUndefs)) {
5573 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
5574 break;
5575 }
5576 return false;
5577 }
5578 case X86ISD::PSHUFB: {
5579 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5580 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5581 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5582 IsUnary = true;
5583 SDValue MaskNode = N.getOperand(1);
5584 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5585 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
5586 break;
5587 }
5588 return false;
5589 }
5590 case X86ISD::VPERMI:
5591 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5592 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5593 DecodeVPERMMask(NumElems, ImmN, Mask);
5594 IsUnary = true;
5595 break;
5596 case X86ISD::MOVSS:
5597 case X86ISD::MOVSD:
5598 case X86ISD::MOVSH:
5599 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5600 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5601 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5602 break;
5603 case X86ISD::VPERM2X128:
5604 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5605 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5606 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5607 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
5608 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5609 break;
5610 case X86ISD::SHUF128:
5611 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5612 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5613 ImmN = N.getConstantOperandVal(N.getNumOperands() - 1);
5614 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
5615 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5616 break;
5617 case X86ISD::MOVSLDUP:
5618 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5619 DecodeMOVSLDUPMask(NumElems, Mask);
5620 IsUnary = true;
5621 break;
5622 case X86ISD::MOVSHDUP:
5623 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5624 DecodeMOVSHDUPMask(NumElems, Mask);
5625 IsUnary = true;
5626 break;
5627 case X86ISD::MOVDDUP:
5628 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5629 DecodeMOVDDUPMask(NumElems, Mask);
5630 IsUnary = true;
5631 break;
5632 case X86ISD::VPERMIL2: {
5633 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5634 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5635 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5636 SDValue MaskNode = N.getOperand(2);
5637 SDValue CtrlNode = N.getOperand(3);
5638 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5639 unsigned CtrlImm = CtrlOp->getZExtValue();
5640 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5641 RawUndefs)) {
5642 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
5643 Mask);
5644 break;
5645 }
5646 }
5647 return false;
5648 }
5649 case X86ISD::VPPERM: {
5650 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5651 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5652 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(1);
5653 SDValue MaskNode = N.getOperand(2);
5654 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
5655 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
5656 break;
5657 }
5658 return false;
5659 }
5660 case X86ISD::VPERMV: {
5661 assert(N.getOperand(1).getValueType() == VT && "Unexpected value type");
5662 IsUnary = true;
5663 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5664 Ops.push_back(N.getOperand(1));
5665 SDValue MaskNode = N.getOperand(0);
5666 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5667 RawUndefs)) {
5668 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
5669 break;
5670 }
5671 return false;
5672 }
5673 case X86ISD::VPERMV3: {
5674 assert(N.getOperand(0).getValueType() == VT && "Unexpected value type");
5675 assert(N.getOperand(2).getValueType() == VT && "Unexpected value type");
5676 IsUnary = IsFakeUnary = N.getOperand(0) == N.getOperand(2);
5677 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5678 Ops.push_back(N.getOperand(0));
5679 Ops.push_back(N.getOperand(2));
5680 SDValue MaskNode = N.getOperand(1);
5681 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
5682 RawUndefs)) {
5683 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
5684 break;
5685 }
5686 return false;
5687 }
5688 default:
5689 llvm_unreachable("unknown target shuffle node");
5690 }
5691
5692 // Empty mask indicates the decode failed.
5693 if (Mask.empty())
5694 return false;
5695
5696 // Check if we're getting a shuffle mask with zero'd elements.
5697 if (!AllowSentinelZero && isAnyZero(Mask))
5698 return false;
5699
5700 // If we have a fake unary shuffle, the shuffle mask is spread across two
5701 // inputs that are actually the same node. Re-map the mask to always point
5702 // into the first input.
5703 if (IsFakeUnary)
5704 for (int &M : Mask)
5705 if (M >= (int)Mask.size())
5706 M -= Mask.size();
5707
5708 // If we didn't already add operands in the opcode-specific code, default to
5709 // adding 1 or 2 operands starting at 0.
5710 if (Ops.empty()) {
5711 Ops.push_back(N.getOperand(0));
5712 if (!IsUnary || IsFakeUnary)
5713 Ops.push_back(N.getOperand(1));
5714 }
5715
5716 return true;
5717}
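// Illustrative note (added commentary, not from the original source): for an
// X86ISD::UNPCKL node with two distinct v4i32 operands this returns
// Ops = {Op0, Op1} and Mask = <0,4,1,5>; if both operands are the same node the
// mask is remapped to <0,0,1,1> and IsUnary is set.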
5718
5719// Wrapper for getTargetShuffleMask that computes and discards IsUnary.
5720static bool getTargetShuffleMask(SDValue N, bool AllowSentinelZero,
5721                                 SmallVectorImpl<SDValue> &Ops,
5722                                 SmallVectorImpl<int> &Mask) {
5723 bool IsUnary;
5724 return getTargetShuffleMask(N, AllowSentinelZero, Ops, Mask, IsUnary);
5725}
5726
5727/// Compute whether each element of a shuffle is zeroable.
5728///
5729/// A "zeroable" vector shuffle element is one which can be lowered to zero.
5730/// Either it is an undef element in the shuffle mask, the element of the input
5731/// referenced is undef, or the element of the input referenced is known to be
5732/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
5733/// as many lanes with this technique as possible to simplify the remaining
5734/// shuffle.
5735static void computeZeroableShuffleElements(ArrayRef<int> Mask,
5736                                           SDValue V1, SDValue V2,
5737 APInt &KnownUndef, APInt &KnownZero) {
5738 int Size = Mask.size();
5739 KnownUndef = KnownZero = APInt::getZero(Size);
5740
5741 V1 = peekThroughBitcasts(V1);
5742 V2 = peekThroughBitcasts(V2);
5743
5744 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
5745 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
5746
5747 int VectorSizeInBits = V1.getValueSizeInBits();
5748 int ScalarSizeInBits = VectorSizeInBits / Size;
5749 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
5750
5751 for (int i = 0; i < Size; ++i) {
5752 int M = Mask[i];
5753 // Handle the easy cases.
5754 if (M < 0) {
5755 KnownUndef.setBit(i);
5756 continue;
5757 }
5758 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
5759 KnownZero.setBit(i);
5760 continue;
5761 }
5762
5763 // Determine shuffle input and normalize the mask.
5764 SDValue V = M < Size ? V1 : V2;
5765 M %= Size;
5766
5767 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5768 if (V.getOpcode() != ISD::BUILD_VECTOR)
5769 continue;
5770
5771     // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
5772 // the (larger) source element must be UNDEF/ZERO.
5773 if ((Size % V.getNumOperands()) == 0) {
5774 int Scale = Size / V->getNumOperands();
5775 SDValue Op = V.getOperand(M / Scale);
5776 if (Op.isUndef())
5777 KnownUndef.setBit(i);
5778 if (X86::isZeroNode(Op))
5779 KnownZero.setBit(i);
5780 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
5781 APInt Val = Cst->getAPIntValue();
5782 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5783 if (Val == 0)
5784 KnownZero.setBit(i);
5785 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5786 APInt Val = Cst->getValueAPF().bitcastToAPInt();
5787 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
5788 if (Val == 0)
5789 KnownZero.setBit(i);
5790 }
5791 continue;
5792 }
5793
5794     // If the BUILD_VECTOR has more elements, then all the (smaller) source
5795 // elements must be UNDEF or ZERO.
5796 if ((V.getNumOperands() % Size) == 0) {
5797 int Scale = V->getNumOperands() / Size;
5798 bool AllUndef = true;
5799 bool AllZero = true;
5800 for (int j = 0; j < Scale; ++j) {
5801 SDValue Op = V.getOperand((M * Scale) + j);
5802 AllUndef &= Op.isUndef();
5803 AllZero &= X86::isZeroNode(Op);
5804 }
5805 if (AllUndef)
5806 KnownUndef.setBit(i);
5807 if (AllZero)
5808 KnownZero.setBit(i);
5809 continue;
5810 }
5811 }
5812}
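// Worked example for computeZeroableShuffleElements, assuming v4i32 operands:
// with Mask = <0, 5, -1, 6> and V2 = build_vector <0, 0, X, 0>, element 1 reads
// V2[1] == 0 so KnownZero[1] is set, element 2 is undef in the mask so
// KnownUndef[2] is set, and element 3 reads the unknown value X so neither bit
// is set.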
5813
5814/// Decode a target shuffle mask and inputs and see if any values are
5815/// known to be undef or zero from their inputs.
5816/// Returns true if the target shuffle mask was decoded.
5817/// FIXME: Merge this with computeZeroableShuffleElements?
5818 static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
5819                                          SmallVectorImpl<SDValue> &Ops,
5820                                          APInt &KnownUndef, APInt &KnownZero) {
5821 bool IsUnary;
5822 if (!isTargetShuffle(N.getOpcode()))
5823 return false;
5824
5825 MVT VT = N.getSimpleValueType();
5826 if (!getTargetShuffleMask(N, true, Ops, Mask, IsUnary))
5827 return false;
5828
5829 int Size = Mask.size();
5830 SDValue V1 = Ops[0];
5831 SDValue V2 = IsUnary ? V1 : Ops[1];
5832 KnownUndef = KnownZero = APInt::getZero(Size);
5833
5834 V1 = peekThroughBitcasts(V1);
5835 V2 = peekThroughBitcasts(V2);
5836
5837 assert((VT.getSizeInBits() % Size) == 0 &&
5838 "Illegal split of shuffle value type");
5839 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
5840
5841 // Extract known constant input data.
5842 APInt UndefSrcElts[2];
5843 SmallVector<APInt, 32> SrcEltBits[2];
5844 bool IsSrcConstant[2] = {
5845 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5846 SrcEltBits[0], /*AllowWholeUndefs*/ true,
5847 /*AllowPartialUndefs*/ false),
5848 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5849 SrcEltBits[1], /*AllowWholeUndefs*/ true,
5850 /*AllowPartialUndefs*/ false)};
5851
5852 for (int i = 0; i < Size; ++i) {
5853 int M = Mask[i];
5854
5855 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5856 if (M < 0) {
5857 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
5858 if (SM_SentinelUndef == M)
5859 KnownUndef.setBit(i);
5860 if (SM_SentinelZero == M)
5861 KnownZero.setBit(i);
5862 continue;
5863 }
5864
5865 // Determine shuffle input and normalize the mask.
5866 unsigned SrcIdx = M / Size;
5867 SDValue V = M < Size ? V1 : V2;
5868 M %= Size;
5869
5870 // We are referencing an UNDEF input.
5871 if (V.isUndef()) {
5872 KnownUndef.setBit(i);
5873 continue;
5874 }
5875
5876 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5877 // TODO: We currently only set UNDEF for integer types - floats use the same
5878 // registers as vectors and many of the scalar folded loads rely on the
5879 // SCALAR_TO_VECTOR pattern.
5880 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5881 (Size % V.getValueType().getVectorNumElements()) == 0) {
5882 int Scale = Size / V.getValueType().getVectorNumElements();
5883 int Idx = M / Scale;
5884 if (Idx != 0 && !VT.isFloatingPoint())
5885 KnownUndef.setBit(i);
5886 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5887 KnownZero.setBit(i);
5888 continue;
5889 }
5890
5891 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
5892 // base vectors.
5893 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
5894 SDValue Vec = V.getOperand(0);
5895 int NumVecElts = Vec.getValueType().getVectorNumElements();
5896 if (Vec.isUndef() && Size == NumVecElts) {
5897 int Idx = V.getConstantOperandVal(2);
5898 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
5899 if (M < Idx || (Idx + NumSubElts) <= M)
5900 KnownUndef.setBit(i);
5901 }
5902 continue;
5903 }
5904
5905 // Attempt to extract from the source's constant bits.
5906 if (IsSrcConstant[SrcIdx]) {
5907 if (UndefSrcElts[SrcIdx][M])
5908 KnownUndef.setBit(i);
5909 else if (SrcEltBits[SrcIdx][M] == 0)
5910 KnownZero.setBit(i);
5911 }
5912 }
5913
5914 assert(VT.getVectorNumElements() == (unsigned)Size &&
5915 "Different mask size from vector size!");
5916 return true;
5917}
5918
5919// Replace target shuffle mask elements with known undef/zero sentinels.
5920 static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
5921                                               const APInt &KnownUndef,
5922                                               const APInt &KnownZero,
5923                                               bool ResolveKnownZeros = true) {
5924 unsigned NumElts = Mask.size();
5925 assert(KnownUndef.getBitWidth() == NumElts &&
5926 KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
5927
5928 for (unsigned i = 0; i != NumElts; ++i) {
5929 if (KnownUndef[i])
5930 Mask[i] = SM_SentinelUndef;
5931 else if (ResolveKnownZeros && KnownZero[i])
5932 Mask[i] = SM_SentinelZero;
5933 }
5934}
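// E.g. resolveTargetShuffleFromZeroables on Mask = <0, 1, 2, 3> with
// KnownUndef = 0b0010 and KnownZero = 0b1000 (bit i corresponds to Mask[i])
// rewrites the mask to <0, SM_SentinelUndef, 2, SM_SentinelZero>.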
5935
5936// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
5937 static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
5938                                               APInt &KnownUndef,
5939 APInt &KnownZero) {
5940 unsigned NumElts = Mask.size();
5941 KnownUndef = KnownZero = APInt::getZero(NumElts);
5942
5943 for (unsigned i = 0; i != NumElts; ++i) {
5944 int M = Mask[i];
5945 if (SM_SentinelUndef == M)
5946 KnownUndef.setBit(i);
5947 if (SM_SentinelZero == M)
5948 KnownZero.setBit(i);
5949 }
5950}
5951
5952// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
5953 static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
5954                                          SDValue Cond, bool IsBLENDV = false) {
5955 EVT CondVT = Cond.getValueType();
5956 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
5957 unsigned NumElts = CondVT.getVectorNumElements();
5958
5959 APInt UndefElts;
5960 SmallVector<APInt, 32> EltBits;
5961 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
5962 /*AllowWholeUndefs*/ true,
5963 /*AllowPartialUndefs*/ false))
5964 return false;
5965
5966 Mask.resize(NumElts, SM_SentinelUndef);
5967
5968 for (int i = 0; i != (int)NumElts; ++i) {
5969 Mask[i] = i;
5970 // Arbitrarily choose from the 2nd operand if the select condition element
5971 // is undef.
5972 // TODO: Can we do better by matching patterns such as even/odd?
5973 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
5974 (IsBLENDV && EltBits[i].isNonNegative()))
5975 Mask[i] += NumElts;
5976 }
5977
5978 return true;
5979}
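// E.g. a v4i32 VSELECT with the constant condition <-1, 0, -1, 0> keeps the
// first operand in lanes 0/2 and takes the second operand in lanes 1/3, so
// createShuffleMaskFromVSELECT produces the blend mask <0, 5, 2, 7>.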
5980
5981// Forward declaration (for getFauxShuffleMask recursive check).
5982static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
5983                                    SmallVectorImpl<SDValue> &Inputs,
5984                                    SmallVectorImpl<int> &Mask,
5985                                    const SelectionDAG &DAG, unsigned Depth,
5986 bool ResolveKnownElts);
5987
5988// Attempt to decode ops that could be represented as a shuffle mask.
5989 // The decoded shuffle mask may contain a different number of elements than the
5990// destination value type.
5991// TODO: Merge into getTargetShuffleInputs()
5992static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
5993                                SmallVectorImpl<int> &Mask,
5994                                SmallVectorImpl<SDValue> &Ops,
5995                                const SelectionDAG &DAG, unsigned Depth,
5996 bool ResolveKnownElts) {
5997 Mask.clear();
5998 Ops.clear();
5999
6000 MVT VT = N.getSimpleValueType();
6001 unsigned NumElts = VT.getVectorNumElements();
6002 unsigned NumSizeInBits = VT.getSizeInBits();
6003 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6004 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
6005 return false;
6006 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
6007 unsigned NumSizeInBytes = NumSizeInBits / 8;
6008 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6009
6010 unsigned Opcode = N.getOpcode();
6011 switch (Opcode) {
6012 case ISD::VECTOR_SHUFFLE: {
6013 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6014 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6015 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6016 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6017 Ops.push_back(N.getOperand(0));
6018 Ops.push_back(N.getOperand(1));
6019 return true;
6020 }
6021 return false;
6022 }
6023 case ISD::AND:
6024 case X86ISD::ANDNP: {
6025 // Attempt to decode as a per-byte mask.
6026 APInt UndefElts;
6027 SmallVector<APInt, 32> EltBits;
6028 SDValue N0 = N.getOperand(0);
6029 SDValue N1 = N.getOperand(1);
6030 bool IsAndN = (X86ISD::ANDNP == Opcode);
6031 uint64_t ZeroMask = IsAndN ? 255 : 0;
6032 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits,
6033 /*AllowWholeUndefs*/ false,
6034 /*AllowPartialUndefs*/ false))
6035 return false;
6036 // We can't assume an undef src element gives an undef dst - the other src
6037 // might be zero.
6038 assert(UndefElts.isZero() && "Unexpected UNDEF element in AND/ANDNP mask");
6039 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6040 const APInt &ByteBits = EltBits[i];
6041 if (ByteBits != 0 && ByteBits != 255)
6042 return false;
6043 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6044 }
6045 Ops.push_back(IsAndN ? N1 : N0);
6046 return true;
6047 }
6048 case ISD::OR: {
6049 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
6050 // is a valid shuffle index.
6051 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
6052 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
6053 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
6054 return false;
6055
6056 SmallVector<int, 64> SrcMask0, SrcMask1;
6057 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
6058     APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
6059     APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
6060     if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
6061 Depth + 1, true) ||
6062 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
6063 Depth + 1, true))
6064 return false;
6065
6066 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
6067 SmallVector<int, 64> Mask0, Mask1;
6068 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
6069 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
6070 for (int i = 0; i != (int)MaskSize; ++i) {
6071 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
6072 // loops converting between OR and BLEND shuffles due to
6073 // canWidenShuffleElements merging away undef elements, meaning we
6074 // fail to recognise the OR as the undef element isn't known zero.
6075 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
6076 Mask.push_back(SM_SentinelZero);
6077 else if (Mask1[i] == SM_SentinelZero)
6078 Mask.push_back(i);
6079 else if (Mask0[i] == SM_SentinelZero)
6080 Mask.push_back(i + MaskSize);
6081 else
6082 return false;
6083 }
6084 Ops.push_back(N0);
6085 Ops.push_back(N1);
6086 return true;
6087 }
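  // Example for the OR case above: if the two (bitcast-peeked) operands decode
  // to masks that are zero in complementary lanes, say <0, 1, Z, Z> and
  // <Z, Z, 2, 3> (Z = SM_SentinelZero), the OR is recognised as the blend mask
  // <0, 1, 6, 7> over {N0, N1}.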
6088 case ISD::INSERT_SUBVECTOR: {
6089 SDValue Src = N.getOperand(0);
6090 SDValue Sub = N.getOperand(1);
6091 EVT SubVT = Sub.getValueType();
6092 unsigned NumSubElts = SubVT.getVectorNumElements();
6093 if (!N->isOnlyUserOf(Sub.getNode()))
6094 return false;
6095 SDValue SubBC = peekThroughBitcasts(Sub);
6096 uint64_t InsertIdx = N.getConstantOperandVal(2);
6097 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
6098 if (SubBC.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6099 SubBC.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6100 uint64_t ExtractIdx = SubBC.getConstantOperandVal(1);
6101 SDValue SubBCSrc = SubBC.getOperand(0);
6102 unsigned NumSubSrcBCElts = SubBCSrc.getValueType().getVectorNumElements();
6103 unsigned MaxElts = std::max(NumElts, NumSubSrcBCElts);
6104 assert((MaxElts % NumElts) == 0 && (MaxElts % NumSubSrcBCElts) == 0 &&
6105 "Subvector valuetype mismatch");
6106 InsertIdx *= (MaxElts / NumElts);
6107 ExtractIdx *= (MaxElts / NumSubSrcBCElts);
6108 NumSubElts *= (MaxElts / NumElts);
6109 bool SrcIsUndef = Src.isUndef();
6110 for (int i = 0; i != (int)MaxElts; ++i)
6111 Mask.push_back(SrcIsUndef ? SM_SentinelUndef : i);
6112 for (int i = 0; i != (int)NumSubElts; ++i)
6113 Mask[InsertIdx + i] = (SrcIsUndef ? 0 : MaxElts) + ExtractIdx + i;
6114 if (!SrcIsUndef)
6115 Ops.push_back(Src);
6116 Ops.push_back(SubBCSrc);
6117 return true;
6118 }
6119 // Handle CONCAT(SUB0, SUB1).
6120 // Limit this to vXi64 512-bit vector cases to make the most of AVX512
6121 // cross lane shuffles.
6122 if (Depth > 0 && InsertIdx == NumSubElts && NumElts == (2 * NumSubElts) &&
6123 NumBitsPerElt == 64 && NumSizeInBits == 512 &&
6124 Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6125 Src.getOperand(0).isUndef() &&
6126 Src.getOperand(1).getValueType() == SubVT &&
6127 Src.getConstantOperandVal(2) == 0) {
6128 for (int i = 0; i != (int)NumSubElts; ++i)
6129 Mask.push_back(i);
6130 for (int i = 0; i != (int)NumSubElts; ++i)
6131 Mask.push_back(i + NumElts);
6132 Ops.push_back(Src.getOperand(1));
6133 Ops.push_back(Sub);
6134 return true;
6135 }
6136 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
6137 SmallVector<int, 64> SubMask;
6138 SmallVector<SDValue, 2> SubInputs;
6139 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
6140 EVT SubSrcVT = SubSrc.getValueType();
6141 if (!SubSrcVT.isVector())
6142 return false;
6143
6144 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
6145 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
6146 Depth + 1, ResolveKnownElts))
6147 return false;
6148
6149 // Subvector shuffle inputs must not be larger than the subvector.
6150 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
6151 return SubVT.getFixedSizeInBits() <
6152 SubInput.getValueSizeInBits().getFixedValue();
6153 }))
6154 return false;
6155
6156 if (SubMask.size() != NumSubElts) {
6157 assert(((SubMask.size() % NumSubElts) == 0 ||
6158 (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
6159 if ((NumSubElts % SubMask.size()) == 0) {
6160 int Scale = NumSubElts / SubMask.size();
6161         SmallVector<int, 64> ScaledSubMask;
6162 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
6163 SubMask = ScaledSubMask;
6164 } else {
6165 int Scale = SubMask.size() / NumSubElts;
6166 NumSubElts = SubMask.size();
6167 NumElts *= Scale;
6168 InsertIdx *= Scale;
6169 }
6170 }
6171 Ops.push_back(Src);
6172 Ops.append(SubInputs.begin(), SubInputs.end());
6173 if (ISD::isBuildVectorAllZeros(Src.getNode()))
6174 Mask.append(NumElts, SM_SentinelZero);
6175 else
6176 for (int i = 0; i != (int)NumElts; ++i)
6177 Mask.push_back(i);
6178 for (int i = 0; i != (int)NumSubElts; ++i) {
6179 int M = SubMask[i];
6180 if (0 <= M) {
6181 int InputIdx = M / NumSubElts;
6182 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
6183 }
6184 Mask[i + InsertIdx] = M;
6185 }
6186 return true;
6187 }
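  // Example for the INSERT_SUBVECTOR case above: at Depth > 0, the v8i64 node
  // insert_subvector (insert_subvector undef, A, 0), B, 4 is recognised as
  // CONCAT(A, B), i.e. mask <0, 1, 2, 3, 8, 9, 10, 11> over the inputs {A, B}.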
6188 case X86ISD::PINSRB:
6189 case X86ISD::PINSRW:
6190 case ISD::SCALAR_TO_VECTOR:
6191 case ISD::INSERT_VECTOR_ELT: {
6192 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
6193 // vector, for matching src/dst vector types.
6194 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
6195
6196 unsigned DstIdx = 0;
6197 if (Opcode != ISD::SCALAR_TO_VECTOR) {
6198 // Check we have an in-range constant insertion index.
6199 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
6200 N.getConstantOperandAPInt(2).uge(NumElts))
6201 return false;
6202 DstIdx = N.getConstantOperandVal(2);
6203
6204 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
6205 if (X86::isZeroNode(Scl)) {
6206 Ops.push_back(N.getOperand(0));
6207 for (unsigned i = 0; i != NumElts; ++i)
6208 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
6209 return true;
6210 }
6211 }
6212
6213 // Peek through trunc/aext/zext/bitcast.
6214 // TODO: aext shouldn't require SM_SentinelZero padding.
6215 // TODO: handle shift of scalars.
6216 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
6217 while (Scl.getOpcode() == ISD::TRUNCATE ||
6218 Scl.getOpcode() == ISD::ANY_EXTEND ||
6219 Scl.getOpcode() == ISD::ZERO_EXTEND ||
6220 (Scl.getOpcode() == ISD::BITCAST &&
6221            Scl.getScalarValueSizeInBits() ==
6222                Scl.getOperand(0).getScalarValueSizeInBits())) {
6223       Scl = Scl.getOperand(0);
6224 MinBitsPerElt =
6225 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
6226 }
6227 if ((MinBitsPerElt % 8) != 0)
6228 return false;
6229
6230 // Attempt to find the source vector the scalar was extracted from.
6231 SDValue SrcExtract;
6232 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
6233 Scl.getOpcode() == X86ISD::PEXTRW ||
6234 Scl.getOpcode() == X86ISD::PEXTRB) &&
6235 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
6236 SrcExtract = Scl;
6237 }
6238 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6239 return false;
6240
6241 SDValue SrcVec = SrcExtract.getOperand(0);
6242 EVT SrcVT = SrcVec.getValueType();
6243 if (!SrcVT.getScalarType().isByteSized())
6244 return false;
6245 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6246 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
6247 unsigned DstByte = DstIdx * NumBytesPerElt;
6248 MinBitsPerElt =
6249 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
6250
6251 // Create 'identity' byte level shuffle mask and then add inserted bytes.
6252 if (Opcode == ISD::SCALAR_TO_VECTOR) {
6253 Ops.push_back(SrcVec);
6254 Mask.append(NumSizeInBytes, SM_SentinelUndef);
6255 } else {
6256 Ops.push_back(SrcVec);
6257 Ops.push_back(N.getOperand(0));
6258 for (int i = 0; i != (int)NumSizeInBytes; ++i)
6259 Mask.push_back(NumSizeInBytes + i);
6260 }
6261
6262 unsigned MinBytesPerElts = MinBitsPerElt / 8;
6263 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
6264 for (unsigned i = 0; i != MinBytesPerElts; ++i)
6265 Mask[DstByte + i] = SrcByte + i;
6266 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
6267 Mask[DstByte + i] = SM_SentinelZero;
6268 return true;
6269 }
6270 case X86ISD::PACKSS:
6271 case X86ISD::PACKUS: {
6272 SDValue N0 = N.getOperand(0);
6273 SDValue N1 = N.getOperand(1);
6274 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6275 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6276 "Unexpected input value type");
6277
6278 APInt EltsLHS, EltsRHS;
6279 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
6280
6281 // If we know input saturation won't happen (or we don't care for particular
6282 // lanes), we can treat this as a truncation shuffle.
6283 bool Offset0 = false, Offset1 = false;
6284 if (Opcode == X86ISD::PACKSS) {
6285 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6286 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
6287 (!(N1.isUndef() || EltsRHS.isZero()) &&
6288 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
6289 return false;
6290 // We can't easily fold ASHR into a shuffle, but if it was feeding a
6291 // PACKSS then it was likely being used for sign-extension for a
6292 // truncation, so just peek through and adjust the mask accordingly.
6293 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
6294 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
6295 Offset0 = true;
6296 N0 = N0.getOperand(0);
6297 }
6298 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
6299 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
6300 Offset1 = true;
6301 N1 = N1.getOperand(0);
6302 }
6303 } else {
6304 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6305 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
6306 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
6307 (!(N1.isUndef() || EltsRHS.isZero()) &&
6308 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
6309 return false;
6310 }
6311
6312 bool IsUnary = (N0 == N1);
6313
6314 Ops.push_back(N0);
6315 if (!IsUnary)
6316 Ops.push_back(N1);
6317
6318 createPackShuffleMask(VT, Mask, IsUnary);
6319
6320 if (Offset0 || Offset1) {
6321 for (int &M : Mask)
6322 if ((Offset0 && isInRange(M, 0, NumElts)) ||
6323 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
6324 ++M;
6325 }
6326 return true;
6327 }
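  // Example for the PACK cases above: a unary 128-bit PACKUS whose inputs are
  // known not to saturate decodes as the truncating byte shuffle
  // <0, 2, 4, ..., 14, 0, 2, 4, ..., 14> over the bitcast input.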
6328 case ISD::VSELECT:
6329 case X86ISD::BLENDV: {
6330 SDValue Cond = N.getOperand(0);
6331 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
6332 Ops.push_back(N.getOperand(1));
6333 Ops.push_back(N.getOperand(2));
6334 return true;
6335 }
6336 return false;
6337 }
6338 case X86ISD::VTRUNC: {
6339 SDValue Src = N.getOperand(0);
6340 EVT SrcVT = Src.getValueType();
6341 // Truncated source must be a simple vector.
6342 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6343 (SrcVT.getScalarSizeInBits() % 8) != 0)
6344 return false;
6345 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6346 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6347 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
6348 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
6349 for (unsigned i = 0; i != NumSrcElts; ++i)
6350 Mask.push_back(i * Scale);
6351 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
6352 Ops.push_back(Src);
6353 return true;
6354 }
6355 case ISD::SHL:
6356 case ISD::SRL: {
6357 // We can only decode 'whole byte' bit shifts as shuffles.
6358 std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
6359 if (!Amt || (*Amt % 8) != 0)
6360 return false;
6361
6362 uint64_t ByteShift = *Amt / 8;
6363 Ops.push_back(N.getOperand(0));
6364
6365 // Clear mask to all zeros and insert the shifted byte indices.
6366 Mask.append(NumSizeInBytes, SM_SentinelZero);
6367
6368 if (ISD::SHL == Opcode) {
6369 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6370 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6371 Mask[i + j] = i + j - ByteShift;
6372 } else {
6373 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6374 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6375 Mask[i + j - ByteShift] = i + j;
6376 }
6377 return true;
6378 }
6379 case X86ISD::VSHLI:
6380 case X86ISD::VSRLI: {
6381 uint64_t ShiftVal = N.getConstantOperandVal(1);
6382 // Out of range bit shifts are guaranteed to be zero.
6383 if (NumBitsPerElt <= ShiftVal) {
6384 Mask.append(NumElts, SM_SentinelZero);
6385 return true;
6386 }
6387
6388 // We can only decode 'whole byte' bit shifts as shuffles.
6389 if ((ShiftVal % 8) != 0)
6390 break;
6391
6392 uint64_t ByteShift = ShiftVal / 8;
6393 Ops.push_back(N.getOperand(0));
6394
6395 // Clear mask to all zeros and insert the shifted byte indices.
6396 Mask.append(NumSizeInBytes, SM_SentinelZero);
6397
6398 if (X86ISD::VSHLI == Opcode) {
6399 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6400 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6401 Mask[i + j] = i + j - ByteShift;
6402 } else {
6403 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
6404 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6405 Mask[i + j - ByteShift] = i + j;
6406 }
6407 return true;
6408 }
6409 case X86ISD::VROTLI:
6410 case X86ISD::VROTRI: {
6411 // We can only decode 'whole byte' bit rotates as shuffles.
6412 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
6413 if ((RotateVal % 8) != 0)
6414 return false;
6415 Ops.push_back(N.getOperand(0));
6416 int Offset = RotateVal / 8;
6417 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
6418 for (int i = 0; i != (int)NumElts; ++i) {
6419 int BaseIdx = i * NumBytesPerElt;
6420 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
6421 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
6422 }
6423 }
6424 return true;
6425 }
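  // Example for the rotate cases above: a v2i64 VROTLI by 8 bits moves whole
  // bytes, so it decodes as the byte shuffle
  // <7, 0, 1, 2, 3, 4, 5, 6, 15, 8, 9, 10, 11, 12, 13, 14>.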
6426 case X86ISD::VBROADCAST: {
6427 SDValue Src = N.getOperand(0);
6428 if (!Src.getSimpleValueType().isVector()) {
6429 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6430 !isNullConstant(Src.getOperand(1)) ||
6431 Src.getOperand(0).getValueType().getScalarType() !=
6432 VT.getScalarType())
6433 return false;
6434 Src = Src.getOperand(0);
6435 }
6436 Ops.push_back(Src);
6437 Mask.append(NumElts, 0);
6438 return true;
6439 }
6440 case ISD::SIGN_EXTEND_VECTOR_INREG: {
6441     SDValue Src = N.getOperand(0);
6442 EVT SrcVT = Src.getValueType();
6443 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
6444
6445 // Extended source must be a simple vector.
6446 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6447 (NumBitsPerSrcElt % 8) != 0)
6448 return false;
6449
6450 // We can only handle all-signbits extensions.
6451 APInt DemandedSrcElts =
6452 DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
6453 if (DAG.ComputeNumSignBits(Src, DemandedSrcElts) != NumBitsPerSrcElt)
6454 return false;
6455
6456 assert((NumBitsPerElt % NumBitsPerSrcElt) == 0 && "Unexpected extension");
6457 unsigned Scale = NumBitsPerElt / NumBitsPerSrcElt;
6458 for (unsigned I = 0; I != NumElts; ++I)
6459 Mask.append(Scale, I);
6460 Ops.push_back(Src);
6461 return true;
6462 }
6463 case ISD::ZERO_EXTEND:
6464 case ISD::ANY_EXTEND:
6465 case ISD::ZERO_EXTEND_VECTOR_INREG:
6466 case ISD::ANY_EXTEND_VECTOR_INREG: {
6467     SDValue Src = N.getOperand(0);
6468 EVT SrcVT = Src.getValueType();
6469
6470 // Extended source must be a simple vector.
6471 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
6472 (SrcVT.getScalarSizeInBits() % 8) != 0)
6473 return false;
6474
6475 bool IsAnyExtend =
6476 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
6477 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
6478 IsAnyExtend, Mask);
6479 Ops.push_back(Src);
6480 return true;
6481 }
6482 }
6483
6484 return false;
6485}
6486
6487/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
6488 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6489                                               SmallVectorImpl<int> &Mask) {
6490 int MaskWidth = Mask.size();
6491 SmallVector<SDValue, 16> UsedInputs;
6492 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6493 int lo = UsedInputs.size() * MaskWidth;
6494 int hi = lo + MaskWidth;
6495
6496 // Strip UNDEF input usage.
6497 if (Inputs[i].isUndef())
6498 for (int &M : Mask)
6499 if ((lo <= M) && (M < hi))
6500 M = SM_SentinelUndef;
6501
6502 // Check for unused inputs.
6503 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6504 for (int &M : Mask)
6505 if (lo <= M)
6506 M -= MaskWidth;
6507 continue;
6508 }
6509
6510 // Check for repeated inputs.
6511 bool IsRepeat = false;
6512 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
6513 if (UsedInputs[j] != Inputs[i])
6514 continue;
6515 for (int &M : Mask)
6516 if (lo <= M)
6517 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
6518 IsRepeat = true;
6519 break;
6520 }
6521 if (IsRepeat)
6522 continue;
6523
6524 UsedInputs.push_back(Inputs[i]);
6525 }
6526 Inputs = UsedInputs;
6527}
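// E.g. resolveTargetShuffleInputsAndMask with Inputs = {A, A} and
// Mask = <0, 4, 1, 5> drops the duplicate input and rewrites the mask to
// <0, 0, 1, 1> over the single input {A}.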
6528
6529/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
6530/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
6531/// Returns true if the target shuffle mask was decoded.
6532static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6533                                    SmallVectorImpl<SDValue> &Inputs,
6534                                    SmallVectorImpl<int> &Mask,
6535                                    APInt &KnownUndef, APInt &KnownZero,
6536 const SelectionDAG &DAG, unsigned Depth,
6537 bool ResolveKnownElts) {
6538   if (Depth >= SelectionDAG::MaxRecursionDepth)
6539     return false; // Limit search depth.
6540
6541 EVT VT = Op.getValueType();
6542 if (!VT.isSimple() || !VT.isVector())
6543 return false;
6544
6545 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
6546 if (ResolveKnownElts)
6547 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
6548 return true;
6549 }
6550 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
6551 ResolveKnownElts)) {
6552 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
6553 return true;
6554 }
6555 return false;
6556}
6557
6558static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
6559                                    SmallVectorImpl<SDValue> &Inputs,
6560                                    SmallVectorImpl<int> &Mask,
6561                                    const SelectionDAG &DAG, unsigned Depth,
6562 bool ResolveKnownElts) {
6563 APInt KnownUndef, KnownZero;
6564 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
6565 KnownZero, DAG, Depth, ResolveKnownElts);
6566}
6567
6568 static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
6569                                    SmallVectorImpl<int> &Mask,
6570                                    const SelectionDAG &DAG, unsigned Depth = 0,
6571 bool ResolveKnownElts = true) {
6572 EVT VT = Op.getValueType();
6573 if (!VT.isSimple() || !VT.isVector())
6574 return false;
6575
6576 unsigned NumElts = Op.getValueType().getVectorNumElements();
6577 APInt DemandedElts = APInt::getAllOnes(NumElts);
6578 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
6579 ResolveKnownElts);
6580}
6581
6582// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
6583static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
6584 EVT MemVT, MemSDNode *Mem, unsigned Offset,
6585 SelectionDAG &DAG) {
6586 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
6587 Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
6588 "Unknown broadcast load type");
6589
6590   // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
6591 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
6592 return SDValue();
6593
6594   SDValue Ptr = DAG.getMemBasePlusOffset(Mem->getBasePtr(),
6595                                          TypeSize::getFixed(Offset), DL);
6596   SDVTList Tys = DAG.getVTList(VT, MVT::Other);
6597 SDValue Ops[] = {Mem->getChain(), Ptr};
6598 SDValue BcstLd = DAG.getMemIntrinsicNode(
6599       Opcode, DL, Tys, Ops, MemVT,
6600       DAG.getMachineFunction().getMachineMemOperand(
6601 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
6602 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
6603 return BcstLd;
6604}
6605
6606/// Returns the scalar element that will make up the i'th
6607/// element of the result of the vector shuffle.
6608static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
6609 SelectionDAG &DAG, unsigned Depth) {
6610   if (Depth >= SelectionDAG::MaxRecursionDepth)
6611     return SDValue(); // Limit search depth.
6612
6613 EVT VT = Op.getValueType();
6614 unsigned Opcode = Op.getOpcode();
6615 unsigned NumElems = VT.getVectorNumElements();
6616
6617 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6618 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
6619 int Elt = SV->getMaskElt(Index);
6620
6621 if (Elt < 0)
6622 return DAG.getUNDEF(VT.getVectorElementType());
6623
6624 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
6625 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6626 }
6627
6628 // Recurse into target specific vector shuffles to find scalars.
6629 if (isTargetShuffle(Opcode)) {
6630 MVT ShufVT = VT.getSimpleVT();
6631 MVT ShufSVT = ShufVT.getVectorElementType();
6632 int NumElems = (int)ShufVT.getVectorNumElements();
6633 SmallVector<int, 16> ShuffleMask;
6634     SmallVector<SDValue, 16> ShuffleOps;
6635     if (!getTargetShuffleMask(Op, true, ShuffleOps, ShuffleMask))
6636 return SDValue();
6637
6638 int Elt = ShuffleMask[Index];
6639 if (Elt == SM_SentinelZero)
6640 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
6641 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
6642 if (Elt == SM_SentinelUndef)
6643 return DAG.getUNDEF(ShufSVT);
6644
6645 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
6646 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6647 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
6648 }
6649
6650 // Recurse into insert_subvector base/sub vector to find scalars.
6651 if (Opcode == ISD::INSERT_SUBVECTOR) {
6652 SDValue Vec = Op.getOperand(0);
6653 SDValue Sub = Op.getOperand(1);
6654 uint64_t SubIdx = Op.getConstantOperandVal(2);
6655 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
6656
6657 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
6658 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
6659 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
6660 }
6661
6662 // Recurse into concat_vectors sub vector to find scalars.
6663 if (Opcode == ISD::CONCAT_VECTORS) {
6664 EVT SubVT = Op.getOperand(0).getValueType();
6665 unsigned NumSubElts = SubVT.getVectorNumElements();
6666 uint64_t SubIdx = Index / NumSubElts;
6667 uint64_t SubElt = Index % NumSubElts;
6668 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
6669 }
6670
6671 // Recurse into extract_subvector src vector to find scalars.
6672 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
6673 SDValue Src = Op.getOperand(0);
6674 uint64_t SrcIdx = Op.getConstantOperandVal(1);
6675 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
6676 }
6677
6678 // We only peek through bitcasts of the same vector width.
6679 if (Opcode == ISD::BITCAST) {
6680 SDValue Src = Op.getOperand(0);
6681 EVT SrcVT = Src.getValueType();
6682 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
6683 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
6684 return SDValue();
6685 }
6686
6687 // Actual nodes that may contain scalar elements
6688
6689 // For insert_vector_elt - either return the index matching scalar or recurse
6690 // into the base vector.
6691 if (Opcode == ISD::INSERT_VECTOR_ELT &&
6692 isa<ConstantSDNode>(Op.getOperand(2))) {
6693 if (Op.getConstantOperandAPInt(2) == Index)
6694 return Op.getOperand(1);
6695 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
6696 }
6697
6698 if (Opcode == ISD::SCALAR_TO_VECTOR)
6699 return (Index == 0) ? Op.getOperand(0)
6700 : DAG.getUNDEF(VT.getVectorElementType());
6701
6702 if (Opcode == ISD::BUILD_VECTOR)
6703 return Op.getOperand(Index);
6704
6705 return SDValue();
6706}
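// E.g. for Op = vector_shuffle<2, 5, 0, 1> (build_vector a, b, c, d),
// (build_vector e, f, g, h), getShuffleScalarElt(Op, 1, ...) follows mask
// element 5 into the second operand and returns the scalar 'f'.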
6707
6708// Use PINSRB/PINSRW/PINSRD to create a build vector.
6709 static SDValue LowerBuildVectorAsInsert(SDValue Op, const SDLoc &DL,
6710                                         const APInt &NonZeroMask,
6711 unsigned NumNonZero, unsigned NumZero,
6712 SelectionDAG &DAG,
6713 const X86Subtarget &Subtarget) {
6714 MVT VT = Op.getSimpleValueType();
6715 unsigned NumElts = VT.getVectorNumElements();
6716 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6717 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6718 "Illegal vector insertion");
6719
6720 SDValue V;
6721 bool First = true;
6722
6723 for (unsigned i = 0; i < NumElts; ++i) {
6724 bool IsNonZero = NonZeroMask[i];
6725 if (!IsNonZero)
6726 continue;
6727
6728 // If the build vector contains zeros or our first insertion is not the
6729     // first index, then insert into a zero vector to break any register
6730     // dependency; else use SCALAR_TO_VECTOR.
6731 if (First) {
6732 First = false;
6733 if (NumZero || 0 != i)
6734 V = getZeroVector(VT, Subtarget, DAG, DL);
6735 else {
6736 assert(0 == i && "Expected insertion into zero-index");
6737 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6738 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6739 V = DAG.getBitcast(VT, V);
6740 continue;
6741 }
6742 }
6743 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, V, Op.getOperand(i),
6744 DAG.getIntPtrConstant(i, DL));
6745 }
6746
6747 return V;
6748}
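// E.g. a v8i16 build_vector whose only non-zero operand is element 3 becomes a
// zero vector followed by a single INSERT_VECTOR_ELT (PINSRW) at index 3.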
6749
6750/// Custom lower build_vector of v16i8.
6751 static SDValue LowerBuildVectorv16i8(SDValue Op, const SDLoc &DL,
6752                                      const APInt &NonZeroMask,
6753 unsigned NumNonZero, unsigned NumZero,
6754 SelectionDAG &DAG,
6755 const X86Subtarget &Subtarget) {
6756 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6757 return SDValue();
6758
6759 // SSE4.1 - use PINSRB to insert each byte directly.
6760 if (Subtarget.hasSSE41())
6761 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero,
6762 DAG, Subtarget);
6763
6764 SDValue V;
6765
6766 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6767   // If both of the two lowest 16-bit halves contain a non-zero byte, then convert to MOVD.
6768 if (!NonZeroMask.extractBits(2, 0).isZero() &&
6769 !NonZeroMask.extractBits(2, 2).isZero()) {
6770 for (unsigned I = 0; I != 4; ++I) {
6771 if (!NonZeroMask[I])
6772 continue;
6773 SDValue Elt = DAG.getZExtOrTrunc(Op.getOperand(I), DL, MVT::i32);
6774 if (I != 0)
6775 Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt,
6776 DAG.getConstant(I * 8, DL, MVT::i8));
6777 V = V ? DAG.getNode(ISD::OR, DL, MVT::i32, V, Elt) : Elt;
6778 }
6779 assert(V && "Failed to fold v16i8 vector to zero");
6780 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, V);
6781 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, V);
6782 V = DAG.getBitcast(MVT::v8i16, V);
6783 }
6784 for (unsigned i = V ? 4 : 0; i < 16; i += 2) {
6785 bool ThisIsNonZero = NonZeroMask[i];
6786 bool NextIsNonZero = NonZeroMask[i + 1];
6787 if (!ThisIsNonZero && !NextIsNonZero)
6788 continue;
6789
6790 SDValue Elt;
6791 if (ThisIsNonZero) {
6792 if (NumZero || NextIsNonZero)
6793 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6794 else
6795 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), DL, MVT::i32);
6796 }
6797
6798 if (NextIsNonZero) {
6799 SDValue NextElt = Op.getOperand(i + 1);
6800 if (i == 0 && NumZero)
6801 NextElt = DAG.getZExtOrTrunc(NextElt, DL, MVT::i32);
6802 else
6803 NextElt = DAG.getAnyExtOrTrunc(NextElt, DL, MVT::i32);
6804 NextElt = DAG.getNode(ISD::SHL, DL, MVT::i32, NextElt,
6805 DAG.getConstant(8, DL, MVT::i8));
6806 if (ThisIsNonZero)
6807 Elt = DAG.getNode(ISD::OR, DL, MVT::i32, NextElt, Elt);
6808 else
6809 Elt = NextElt;
6810 }
6811
6812 // If our first insertion is not the first index or zeros are needed, then
6813 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
6814 // elements undefined).
6815 if (!V) {
6816 if (i != 0 || NumZero)
6817 V = getZeroVector(MVT::v8i16, Subtarget, DAG, DL);
6818 else {
6819 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, Elt);
6820 V = DAG.getBitcast(MVT::v8i16, V);
6821 continue;
6822 }
6823 }
6824 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
6825 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, V, Elt,
6826 DAG.getIntPtrConstant(i / 2, DL));
6827 }
6828
6829 return DAG.getBitcast(MVT::v16i8, V);
6830}
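// Pre-SSE4.1 example: for <b0, b1, 0, ..., 0> the pair (b0 | b1 << 8) is built
// in a GPR and inserted into a zero v8i16 vector as element 0, then the result
// is bitcast back to v16i8.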
6831
6832/// Custom lower build_vector of v8i16.
6833 static SDValue LowerBuildVectorv8i16(SDValue Op, const SDLoc &DL,
6834                                      const APInt &NonZeroMask,
6835 unsigned NumNonZero, unsigned NumZero,
6836 SelectionDAG &DAG,
6837 const X86Subtarget &Subtarget) {
6838 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6839 return SDValue();
6840
6841   // Use PINSRW to insert each 16-bit element directly.
6842 return LowerBuildVectorAsInsert(Op, DL, NonZeroMask, NumNonZero, NumZero, DAG,
6843 Subtarget);
6844}
6845
6846/// Custom lower build_vector of v4i32 or v4f32.
6848 SelectionDAG &DAG,
6849 const X86Subtarget &Subtarget) {
6850 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
6851 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
6852 // Because we're creating a less complicated build vector here, we may enable
6853 // further folding of the MOVDDUP via shuffle transforms.
6854 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
6855 Op.getOperand(0) == Op.getOperand(2) &&
6856 Op.getOperand(1) == Op.getOperand(3) &&
6857 Op.getOperand(0) != Op.getOperand(1)) {
6858 MVT VT = Op.getSimpleValueType();
6859 MVT EltVT = VT.getVectorElementType();
6860 // Create a new build vector with the first 2 elements followed by undef
6861 // padding, bitcast to v2f64, duplicate, and bitcast back.
6862 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
6863 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
6864 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
6865 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
6866 return DAG.getBitcast(VT, Dup);
6867 }
6868
6869 // Find all zeroable elements.
6870 std::bitset<4> Zeroable, Undefs;
6871 for (int i = 0; i < 4; ++i) {
6872 SDValue Elt = Op.getOperand(i);
6873 Undefs[i] = Elt.isUndef();
6874 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6875 }
6876 assert(Zeroable.size() - Zeroable.count() > 1 &&
6877 "We expect at least two non-zero elements!");
6878
6879 // We only know how to deal with build_vector nodes where elements are either
6880 // zeroable or extract_vector_elt with constant index.
6881 SDValue FirstNonZero;
6882 unsigned FirstNonZeroIdx;
6883 for (unsigned i = 0; i < 4; ++i) {
6884 if (Zeroable[i])
6885 continue;
6886 SDValue Elt = Op.getOperand(i);
6887 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6888 !isa<ConstantSDNode>(Elt.getOperand(1)))
6889 return SDValue();
6890 // Make sure that this node is extracting from a 128-bit vector.
6891 MVT VT = Elt.getOperand(0).getSimpleValueType();
6892 if (!VT.is128BitVector())
6893 return SDValue();
6894 if (!FirstNonZero.getNode()) {
6895 FirstNonZero = Elt;
6896 FirstNonZeroIdx = i;
6897 }
6898 }
6899
6900 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6901 SDValue V1 = FirstNonZero.getOperand(0);
6902 MVT VT = V1.getSimpleValueType();
6903
6904 // See if this build_vector can be lowered as a blend with zero.
6905 SDValue Elt;
6906 unsigned EltMaskIdx, EltIdx;
6907 int Mask[4];
6908 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6909 if (Zeroable[EltIdx]) {
6910 // The zero vector will be on the right hand side.
6911 Mask[EltIdx] = EltIdx+4;
6912 continue;
6913 }
6914
6915 Elt = Op->getOperand(EltIdx);
6916 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6917 EltMaskIdx = Elt.getConstantOperandVal(1);
6918 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6919 break;
6920 Mask[EltIdx] = EltIdx;
6921 }
6922
6923 if (EltIdx == 4) {
6924 // Let the shuffle legalizer deal with blend operations.
6925 SDValue VZeroOrUndef = (Zeroable == Undefs)
6926 ? DAG.getUNDEF(VT)
6927 : getZeroVector(VT, Subtarget, DAG, DL);
6928 if (V1.getSimpleValueType() != VT)
6929 V1 = DAG.getBitcast(VT, V1);
6930 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
6931 }
6932
6933 // See if we can lower this build_vector to a INSERTPS.
6934 if (!Subtarget.hasSSE41())
6935 return SDValue();
6936
6937 SDValue V2 = Elt.getOperand(0);
6938 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6939 V1 = SDValue();
6940
6941 bool CanFold = true;
6942 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6943 if (Zeroable[i])
6944 continue;
6945
6946 SDValue Current = Op->getOperand(i);
6947 SDValue SrcVector = Current->getOperand(0);
6948 if (!V1.getNode())
6949 V1 = SrcVector;
6950 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
6951 }
6952
6953 if (!CanFold)
6954 return SDValue();
6955
6956 assert(V1.getNode() && "Expected at least two non-zero elements!");
6957 if (V1.getSimpleValueType() != MVT::v4f32)
6958 V1 = DAG.getBitcast(MVT::v4f32, V1);
6959 if (V2.getSimpleValueType() != MVT::v4f32)
6960 V2 = DAG.getBitcast(MVT::v4f32, V2);
6961
6962 // Ok, we can emit an INSERTPS instruction.
6963 unsigned ZMask = Zeroable.to_ulong();
6964
6965 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6966 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6967 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6968 DAG.getIntPtrConstant(InsertPSMask, DL, true));
6969 return DAG.getBitcast(VT, Result);
6970}
6971
6972/// Return a vector logical shift node.
6973static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6974 SelectionDAG &DAG, const TargetLowering &TLI,
6975 const SDLoc &dl) {
6976 assert(VT.is128BitVector() && "Unknown type for VShift");
6977 MVT ShVT = MVT::v16i8;
6978 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6979 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6980 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6981 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
6982 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6983}
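// E.g. getVShift(/*isLeft=*/true, MVT::v4i32, Src, 32, ...) bitcasts Src to
// v16i8, emits a VSHLDQ (whole-vector byte shift) of 4 bytes, and bitcasts the
// result back to v4i32.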
6984
6985 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6986                                       SelectionDAG &DAG) {
6987
6988 // Check if the scalar load can be widened into a vector load. And if
6989 // the address is "base + cst" see if the cst can be "absorbed" into
6990 // the shuffle mask.
6991 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6992 SDValue Ptr = LD->getBasePtr();
6993 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
6994 return SDValue();
6995 EVT PVT = LD->getValueType(0);
6996 if (PVT != MVT::i32 && PVT != MVT::f32)
6997 return SDValue();
6998
6999 int FI = -1;
7000 int64_t Offset = 0;
7001 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
7002 FI = FINode->getIndex();
7003 Offset = 0;
7004 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
7005 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
7006 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
7007 Offset = Ptr.getConstantOperandVal(1);
7008 Ptr = Ptr.getOperand(0);
7009 } else {
7010 return SDValue();
7011 }
7012
7013 // FIXME: 256-bit vector instructions don't require a strict alignment,
7014 // improve this code to support it better.
7015 Align RequiredAlign(VT.getSizeInBits() / 8);
7016 SDValue Chain = LD->getChain();
7017 // Make sure the stack object alignment is at least 16 or 32.
7018     MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7019     MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
7020 if (!InferredAlign || *InferredAlign < RequiredAlign) {
7021 if (MFI.isFixedObjectIndex(FI)) {
7022 // Can't change the alignment. FIXME: It's possible to compute
7023 // the exact stack offset and reference FI + adjust offset instead.
7024 // If someone *really* cares about this. That's the way to implement it.
7025 return SDValue();
7026 } else {
7027 MFI.setObjectAlignment(FI, RequiredAlign);
7028 }
7029 }
7030
7031     // (Offset % 16 or 32) must be a multiple of 4. The address is then
7032 // Ptr + (Offset & ~15).
7033 if (Offset < 0)
7034 return SDValue();
7035 if ((Offset % RequiredAlign.value()) & 3)
7036 return SDValue();
7037 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
7038 if (StartOffset) {
7039 SDLoc DL(Ptr);
7040 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7041 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
7042 }
7043
7044 int EltNo = (Offset - StartOffset) >> 2;
7045 unsigned NumElems = VT.getVectorNumElements();
7046
7047 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
7048 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
7049 LD->getPointerInfo().getWithOffset(StartOffset));
7050
7051 SmallVector<int, 8> Mask(NumElems, EltNo);
7052
7053 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
7054 }
7055
7056 return SDValue();
7057}
7058
7059 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7060static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7061 if (ISD::isNON_EXTLoad(Elt.getNode())) {
7062 auto *BaseLd = cast<LoadSDNode>(Elt);
7063 if (!BaseLd->isSimple())
7064 return false;
7065 Ld = BaseLd;
7066 ByteOffset = 0;
7067 return true;
7068 }
7069
7070 switch (Elt.getOpcode()) {
7071 case ISD::BITCAST:
7072 case ISD::TRUNCATE:
7073   case ISD::SCALAR_TO_VECTOR:
7074     return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
7075 case ISD::SRL:
7076 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7077 uint64_t Amt = AmtC->getZExtValue();
7078 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
7079 ByteOffset += Amt / 8;
7080 return true;
7081 }
7082 }
7083 break;
7084   case ISD::EXTRACT_VECTOR_ELT:
7085     if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
7086 SDValue Src = Elt.getOperand(0);
7087 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
7088 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
7089 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
7090 findEltLoadSrc(Src, Ld, ByteOffset)) {
7091 uint64_t Idx = IdxC->getZExtValue();
7092 ByteOffset += Idx * (SrcSizeInBits / 8);
7093 return true;
7094 }
7095 }
7096 break;
7097 }
7098
7099 return false;
7100}
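// E.g. for Elt = (srl (load i64 *p), 32), findEltLoadSrc returns the i64 load
// with ByteOffset = 4: on little-endian x86 the element's value starts at the
// byte at p+4.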
7101
7102/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
7103/// elements can be replaced by a single large load which has the same value as
7104/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
7105///
7106/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
7107 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
7108                                         const SDLoc &DL, SelectionDAG &DAG,
7109 const X86Subtarget &Subtarget,
7110 bool IsAfterLegalize) {
7111 if ((VT.getScalarSizeInBits() % 8) != 0)
7112 return SDValue();
7113
7114 unsigned NumElems = Elts.size();
7115
7116 int LastLoadedElt = -1;
7117 APInt LoadMask = APInt::getZero(NumElems);
7118 APInt ZeroMask = APInt::getZero(NumElems);
7119 APInt UndefMask = APInt::getZero(NumElems);
7120
7121 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7122 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
7123
7124 // For each element in the initializer, see if we've found a load, zero or an
7125 // undef.
7126 for (unsigned i = 0; i < NumElems; ++i) {
7127 SDValue Elt = peekThroughBitcasts(Elts[i]);
7128 if (!Elt.getNode())
7129 return SDValue();
7130 if (Elt.isUndef()) {
7131 UndefMask.setBit(i);
7132 continue;
7133 }
7134     if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
7135       ZeroMask.setBit(i);
7136 continue;
7137 }
7138
7139 // Each loaded element must be the correct fractional portion of the
7140 // requested vector load.
7141 unsigned EltSizeInBits = Elt.getValueSizeInBits();
7142 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
7143 return SDValue();
7144
7145 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
7146 return SDValue();
7147 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
7148 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
7149 return SDValue();
7150
7151 LoadMask.setBit(i);
7152 LastLoadedElt = i;
7153 }
7154 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
7155 NumElems &&
7156 "Incomplete element masks");
7157
7158 // Handle Special Cases - all undef or undef/zero.
7159 if (UndefMask.popcount() == NumElems)
7160 return DAG.getUNDEF(VT);
7161 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
7162 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
7163 : DAG.getConstantFP(0.0, DL, VT);
7164
7165 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7166 int FirstLoadedElt = LoadMask.countr_zero();
7167 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
7168 EVT EltBaseVT = EltBase.getValueType();
7169 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
7170 "Register/Memory size mismatch");
7171 LoadSDNode *LDBase = Loads[FirstLoadedElt];
7172 assert(LDBase && "Did not find base load for merging consecutive loads");
7173 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
7174 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
7175 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
7176 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
7177 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
7178
7179 // TODO: Support offsetting the base load.
7180 if (ByteOffsets[FirstLoadedElt] != 0)
7181 return SDValue();
7182
7183 // Check to see if the element's load is consecutive to the base load
7184 // or offset from a previous (already checked) load.
7185 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7186 LoadSDNode *Ld = Loads[EltIdx];
7187 int64_t ByteOffset = ByteOffsets[EltIdx];
7188 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
7189 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
7190 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
7191 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
7192 }
7193 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7194 EltIdx - FirstLoadedElt);
7195 };
7196
7197 // Consecutive loads can contain UNDEFS but not ZERO elements.
7198   // Consecutive loads with UNDEF and ZERO elements require an
7199   // additional shuffle stage to clear the ZERO elements.
7200 bool IsConsecutiveLoad = true;
7201 bool IsConsecutiveLoadWithZeros = true;
7202 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
7203 if (LoadMask[i]) {
7204 if (!CheckConsecutiveLoad(LDBase, i)) {
7205 IsConsecutiveLoad = false;
7206 IsConsecutiveLoadWithZeros = false;
7207 break;
7208 }
7209 } else if (ZeroMask[i]) {
7210 IsConsecutiveLoad = false;
7211 }
7212 }
7213
7214 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7215 auto MMOFlags = LDBase->getMemOperand()->getFlags();
7216 assert(LDBase->isSimple() &&
7217 "Cannot merge volatile or atomic loads.");
7218 SDValue NewLd =
7219 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
7220 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
7221 MMOFlags);
7222 for (auto *LD : Loads)
7223 if (LD)
7224 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
7225 return NewLd;
7226 };
7227
7228 // Check if the base load is entirely dereferenceable.
7229 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
7230 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
7231
7232 // LOAD - all consecutive load/undefs (must start/end with a load or be
7233 // entirely dereferenceable). If we have found an entire vector of loads and
7234 // undefs, then return a large load of the entire vector width starting at the
7235 // base pointer. If the vector contains zeros, then attempt to shuffle those
7236 // elements.
7237 if (FirstLoadedElt == 0 &&
7238 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
7239 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
7240 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
7241 return SDValue();
7242
7243 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
7244 // will lower to regular temporal loads and use the cache.
7245 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
7246 VT.is256BitVector() && !Subtarget.hasInt256())
7247 return SDValue();
7248
7249 if (NumElems == 1)
7250 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
7251
7252 if (!ZeroMask)
7253 return CreateLoad(VT, LDBase);
7254
7255 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
7256 // vector and a zero vector to clear out the zero elements.
7257 if (!IsAfterLegalize && VT.isVector()) {
7258 unsigned NumMaskElts = VT.getVectorNumElements();
7259 if ((NumMaskElts % NumElems) == 0) {
7260 unsigned Scale = NumMaskElts / NumElems;
7261 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
7262 for (unsigned i = 0; i < NumElems; ++i) {
7263 if (UndefMask[i])
7264 continue;
7265 int Offset = ZeroMask[i] ? NumMaskElts : 0;
7266 for (unsigned j = 0; j != Scale; ++j)
7267 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
7268 }
7269 SDValue V = CreateLoad(VT, LDBase);
7270 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
7271 : DAG.getConstantFP(0.0, DL, VT);
7272 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
7273 }
7274 }
7275 }
7276
7277 // If the upper half of a ymm/zmm load is undef then just load the lower half.
7278 if (VT.is256BitVector() || VT.is512BitVector()) {
7279 unsigned HalfNumElems = NumElems / 2;
7280 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
7281 EVT HalfVT =
7282 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
7283 SDValue HalfLD =
7284 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
7285 DAG, Subtarget, IsAfterLegalize);
7286 if (HalfLD)
7287 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
7288 HalfLD, DAG.getIntPtrConstant(0, DL));
7289 }
7290 }
7291
7292 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
7293 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
7294 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
7295 LoadSizeInBits == 64) &&
7296 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
7297 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
7298 : MVT::getIntegerVT(LoadSizeInBits);
7299 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
7300 // Allow v4f32 on SSE1 only targets.
7301 // FIXME: Add more isel patterns so we can just use VT directly.
7302 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
7303 VecVT = MVT::v4f32;
7304 if (TLI.isTypeLegal(VecVT)) {
7305 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
7306 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
7307 SDValue ResNode = DAG.getMemIntrinsicNode(
7308 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
7309           LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
7310       for (auto *LD : Loads)
7311 if (LD)
7312 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
7313 return DAG.getBitcast(VT, ResNode);
7314 }
7315 }
7316
7317 // BROADCAST - match the smallest possible repetition pattern, load that
7318 // scalar/subvector element and then broadcast to the entire vector.
7319 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
7320 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
7321 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
7322 unsigned RepeatSize = SubElems * BaseSizeInBits;
7323 unsigned ScalarSize = std::min(RepeatSize, 64u);
7324 if (!Subtarget.hasAVX2() && ScalarSize < 32)
7325 continue;
7326
7327 // Don't attempt a 1:N subvector broadcast - it should be caught by
7328 // combineConcatVectorOps, else it will cause infinite loops.
7329 if (RepeatSize > ScalarSize && SubElems == 1)
7330 continue;
7331
7332 bool Match = true;
7333 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
7334 for (unsigned i = 0; i != NumElems && Match; ++i) {
7335 if (!LoadMask[i])
7336 continue;
7337 SDValue Elt = peekThroughBitcasts(Elts[i]);
7338 if (RepeatedLoads[i % SubElems].isUndef())
7339 RepeatedLoads[i % SubElems] = Elt;
7340 else
7341 Match &= (RepeatedLoads[i % SubElems] == Elt);
7342 }
7343
7344 // We must have loads at both ends of the repetition.
7345 Match &= !RepeatedLoads.front().isUndef();
7346 Match &= !RepeatedLoads.back().isUndef();
7347 if (!Match)
7348 continue;
7349
7350 EVT RepeatVT =
7351 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
7352 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
7353 : EVT::getFloatingPointVT(ScalarSize);
7354 if (RepeatSize > ScalarSize)
7355 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
7356 RepeatSize / ScalarSize);
7357 EVT BroadcastVT =
7358 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
7359 VT.getSizeInBits() / ScalarSize);
7360 if (TLI.isTypeLegal(BroadcastVT)) {
7361 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
7362 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
7363 SDValue Broadcast = RepeatLoad;
7364 if (RepeatSize > ScalarSize) {
7365 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
7366 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
7367 } else {
7368 if (!Subtarget.hasAVX2() &&
7369 !X86::mayFoldLoadIntoBroadcastFromMem(
7370 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
7371 Subtarget,
7372 /*AssumeSingleUse=*/true))
7373 return SDValue();
7374 Broadcast =
7375 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
7376 }
7377 return DAG.getBitcast(VT, Broadcast);
7378 }
7379 }
7380 }
7381 }
7382
7383 return SDValue();
7384}
7385
7386// Combine a vector op (shuffles etc.) that is equal to build_vector load1,
7387// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
7388// are consecutive, non-overlapping, and in the right order.
7389static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
7390 SelectionDAG &DAG,
7391 const X86Subtarget &Subtarget,
7392 bool IsAfterLegalize) {
7393 SmallVector<SDValue, 64> Elts;
7394 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
7395 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
7396 Elts.push_back(Elt);
7397 continue;
7398 }
7399 return SDValue();
7400 }
7401 assert(Elts.size() == VT.getVectorNumElements());
7402 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
7403 IsAfterLegalize);
7404}
7405
7406static Constant *getConstantVector(MVT VT, ArrayRef<APInt> Bits,
7407 const APInt &Undefs, LLVMContext &C) {
7408 unsigned ScalarSize = VT.getScalarSizeInBits();
7409 Type *Ty = EVT(VT.getScalarType()).getTypeForEVT(C);
7410
7411 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7412 if (VT.isFloatingPoint()) {
7413 if (ScalarSize == 16)
7414 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7415 if (ScalarSize == 32)
7416 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7417 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7418 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7419 }
7420 return Constant::getIntegerValue(Ty, Val);
7421 };
7422
7423 SmallVector<Constant *, 32> ConstantVec;
7424 for (unsigned I = 0, E = Bits.size(); I != E; ++I)
7425 ConstantVec.push_back(Undefs[I] ? UndefValue::get(Ty)
7426 : getConstantScalar(Bits[I]));
7427
7428 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7429}
7430
7431static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
7432 unsigned SplatBitSize, LLVMContext &C) {
7433 unsigned ScalarSize = VT.getScalarSizeInBits();
7434
7435 auto getConstantScalar = [&](const APInt &Val) -> Constant * {
7436 if (VT.isFloatingPoint()) {
7437 if (ScalarSize == 16)
7438 return ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
7439 if (ScalarSize == 32)
7440 return ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
7441 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
7442 return ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
7443 }
7444 return Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
7445 };
7446
7447 if (ScalarSize == SplatBitSize)
7448 return getConstantScalar(SplatValue);
7449
7450 unsigned NumElm = SplatBitSize / ScalarSize;
7451 SmallVector<Constant *, 32> ConstantVec;
7452 for (unsigned I = 0; I != NumElm; ++I) {
7453 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * I);
7454 ConstantVec.push_back(getConstantScalar(Val));
7455 }
7456 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
7457}
7458
7459static bool isFoldableUseOfShuffle(SDNode *N) {
7460 for (auto *U : N->users()) {
7461 unsigned Opc = U->getOpcode();
7462 // VPERMV/VPERMV3 shuffles can never fold their index operands.
7463 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
7464 return false;
7465 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
7466 return false;
7467 if (isTargetShuffle(Opc))
7468 return true;
7469 if (Opc == ISD::BITCAST) // Ignore bitcasts
7470 return isFoldableUseOfShuffle(U);
7471 if (N->hasOneUse()) {
7472 // TODO: There may be some general way to know if an SDNode can
7473 // be folded. For now we only know whether an MI is foldable.
7474 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
7475 return false;
7476 return true;
7477 }
7478 }
7479 return false;
7480}
7481
7482/// Attempt to use the vbroadcast instruction to generate a splat value
7483/// from a splat BUILD_VECTOR which uses:
7484/// a. A single scalar load, or a constant.
7485/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
7486///
7487/// The VBROADCAST node is returned when a pattern is found,
7488/// or SDValue() otherwise.
7489static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7490 const SDLoc &dl,
7491 const X86Subtarget &Subtarget,
7492 SelectionDAG &DAG) {
7493 // VBROADCAST requires AVX.
7494 // TODO: Splats could be generated for non-AVX CPUs using SSE
7495 // instructions, but there's less potential gain for only 128-bit vectors.
7496 if (!Subtarget.hasAVX())
7497 return SDValue();
7498
7499 MVT VT = BVOp->getSimpleValueType(0);
7500 unsigned NumElts = VT.getVectorNumElements();
7501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7502 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7503 "Unsupported vector type for broadcast.");
7504
7505 // See if the build vector is a repeating sequence of scalars (inc. splat).
7506 SDValue Ld;
7507 BitVector UndefElements;
7508 SmallVector<SDValue, 16> Sequence;
7509 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
7510 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
7511 if (Sequence.size() == 1)
7512 Ld = Sequence[0];
7513 }
7514
7515 // Attempt to use VBROADCASTM
7516 // From this pattern:
7517 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7518 // b. t1 = (build_vector t0 t0)
7519 //
7520 // Create (VBROADCASTM v2i1 X)
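// e.g. for a v8i64 result with SeqLen == 1 and a v8i1 mask operand, this can
// produce (VBROADCASTM v8i1 X), i.e. vpbroadcastmb2q.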
7521 if (!Sequence.empty() && Subtarget.hasCDI()) {
7522 // If not a splat, are the upper sequence values zeroable?
7523 unsigned SeqLen = Sequence.size();
7524 bool UpperZeroOrUndef =
7525 SeqLen == 1 ||
7526 llvm::all_of(ArrayRef(Sequence).drop_front(),
7527 [](SDValue V) { return !V || isNullConstantOrUndef(V); });
7528 SDValue Op0 = Sequence[0];
7529 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
7530 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
7531 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
7532 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
7533 ? Op0.getOperand(0)
7534 : Op0.getOperand(0).getOperand(0);
7535 MVT MaskVT = BOperand.getSimpleValueType();
7536 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
7537 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7538 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7539 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
7540 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
7541 unsigned Scale = 512 / VT.getSizeInBits();
7542 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
7543 }
7544 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
7545 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
7546 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
7547 return DAG.getBitcast(VT, Bcst);
7548 }
7549 }
7550 }
7551
7552 unsigned NumUndefElts = UndefElements.count();
7553 if (!Ld || (NumElts - NumUndefElts) <= 1) {
7554 APInt SplatValue, Undef;
7555 unsigned SplatBitSize;
7556 bool HasUndef;
7557 // Check if this is a repeated constant pattern suitable for broadcasting.
7558 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7559 SplatBitSize > VT.getScalarSizeInBits() &&
7560 SplatBitSize < VT.getSizeInBits()) {
7561 // Avoid replacing with broadcast when it's a use of a shuffle
7562 // instruction to preserve the present custom lowering of shuffles.
7563 if (isFoldableUseOfShuffle(BVOp))
7564 return SDValue();
7565 // Replace the BUILD_VECTOR with a broadcast of the repeated constants.
7566 LLVMContext *Ctx = DAG.getContext();
7567 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7568 if (SplatBitSize == 32 || SplatBitSize == 64 ||
7569 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
7570 // Load the constant scalar/subvector and broadcast it.
7571 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7572 Constant *C = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7573 SDValue CP = DAG.getConstantPool(C, PVT);
7574 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7575
7576 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7577 SDVTList Tys = DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
7578 SDValue Ops[] = {DAG.getEntryNode(), CP};
7579 MachinePointerInfo MPI =
7580 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7581 SDValue Brdcst =
7582 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7583 MPI, Alignment, MachineMemOperand::MOLoad);
7584 return DAG.getBitcast(VT, Brdcst);
7585 }
7586 if (SplatBitSize > 64) {
7587 // Load the vector of constants and broadcast it.
7588 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize, *Ctx);
7589 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7590 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7591 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
7592 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
7593 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7594 SDValue Ops[] = {DAG.getEntryNode(), VCP};
7595 MachinePointerInfo MPI =
7596 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7597 return DAG.getMemIntrinsicNode(X86ISD::SUBV_BROADCAST_LOAD, dl, Tys,
7598 Ops, VVT, MPI, Alignment,
7599 MachineMemOperand::MOLoad);
7600 }
7601 }
7602
7603 // If we are moving a scalar into a vector (Ld must be set and all elements
7604 // but 1 are undef) and that operation is not obviously supported by
7605 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
7606 // That's better than general shuffling and may eliminate a load to GPR and
7607 // move from scalar to vector register.
7608 if (!Ld || NumElts - NumUndefElts != 1)
7609 return SDValue();
7610 unsigned ScalarSize = Ld.getValueSizeInBits();
7611 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
7612 return SDValue();
7613 }
7614
7615 bool ConstSplatVal =
7616 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7617 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7618
7619 // TODO: Handle broadcasts of non-constant sequences.
7620
7621 // Make sure that all of the users of a non-constant load are from the
7622 // BUILD_VECTOR node.
7623 // FIXME: Is the use count needed for non-constant, non-load case?
7624 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
7625 return SDValue();
7626
7627 unsigned ScalarSize = Ld.getValueSizeInBits();
7628 bool IsGE256 = (VT.getSizeInBits() >= 256);
7629
7630 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7631 // instruction to save 8 or more bytes of constant pool data.
7632 // TODO: If multiple splats are generated to load the same constant,
7633 // it may be detrimental to overall size. There needs to be a way to detect
7634 // that condition to know if this is truly a size win.
7635 bool OptForSize = DAG.shouldOptForSize();
7636
7637 // Handle broadcasting a single constant scalar from the constant pool
7638 // into a vector.
7639 // On Sandybridge (no AVX2), it is still better to load a constant vector
7640 // from the constant pool and not to broadcast it from a scalar.
7641 // But override that restriction when optimizing for size.
7642 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7643 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7644 EVT CVT = Ld.getValueType();
7645 assert(!CVT.isVector() && "Must not broadcast a vector type");
7646
7647 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
7648 // For size optimization, also splat v2f64 and v2i64, and for size opt
7649 // with AVX2, also splat i8 and i16.
7650 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7651 if (ScalarSize == 32 ||
7652 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
7653 (CVT == MVT::f16 && Subtarget.hasAVX2()) ||
7654 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7655 const Constant *C = nullptr;
7656 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7657 C = CI->getConstantIntValue();
7658 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7659 C = CF->getConstantFPValue();
7660
7661 assert(C && "Invalid constant type");
7662
7663 SDValue CP =
7664 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7665 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
7666
7667 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7668 SDValue Ops[] = {DAG.getEntryNode(), CP};
7669 MachinePointerInfo MPI =
7670 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
7671 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
7672 MPI, Alignment, MachineMemOperand::MOLoad);
7673 }
7674 }
7675
7676 // Handle AVX2 in-register broadcasts.
7677 if (!IsLoad && Subtarget.hasInt256() &&
7678 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7679 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7680
7681 // The scalar source must be a normal load.
7682 if (!IsLoad)
7683 return SDValue();
7684
7685 // Make sure the non-chain result is only used by this build vector.
7686 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
7687 return SDValue();
7688
7689 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7690 (Subtarget.hasVLX() && ScalarSize == 64)) {
7691 auto *LN = cast<LoadSDNode>(Ld);
7692 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7693 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7694 SDValue BCast =
7695 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7696 LN->getMemoryVT(), LN->getMemOperand());
7697 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7698 return BCast;
7699 }
7700
7701 // The integer check is needed for the 64-bit element into 128-bit vector
7702 // case, so it doesn't match double, since there is no vbroadcastsd xmm.
7703 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
7704 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
7705 auto *LN = cast<LoadSDNode>(Ld);
7706 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
7707 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
7708 SDValue BCast =
7709 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
7710 LN->getMemoryVT(), LN->getMemOperand());
7711 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
7712 return BCast;
7713 }
7714
7715 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
7716 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7717
7718 // Unsupported broadcast.
7719 return SDValue();
7720}
7721
7722/// For an EXTRACT_VECTOR_ELT with a constant index return the real
7723/// underlying vector and index.
7724///
7725/// Modifies \p ExtractedFromVec to the real vector and returns the real
7726/// index.
7727static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7728 SDValue ExtIdx) {
7729 int Idx = ExtIdx->getAsZExtVal();
7730 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7731 return Idx;
7732
7733 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7734 // lowered this:
7735 // (extract_vector_elt (v8f32 %1), Constant<6>)
7736 // to:
7737 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7738 // (extract_subvector (v8f32 %0), Constant<4>),
7739 // undef)
7740 // Constant<0>)
7741 // In this case the vector is the extract_subvector expression and the index
7742 // is 2, as specified by the shuffle.
7743 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7744 SDValue ShuffleVec = SVOp->getOperand(0);
7745 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7746 assert(ShuffleVecVT.getVectorElementType() ==
7747 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7748
7749 int ShuffleIdx = SVOp->getMaskElt(Idx);
7750 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7751 ExtractedFromVec = ShuffleVec;
7752 return ShuffleIdx;
7753 }
7754 return Idx;
7755}
7756
7757static SDValue buildFromShuffleMostly(SDValue Op, const SDLoc &DL,
7758 SelectionDAG &DAG) {
7759 MVT VT = Op.getSimpleValueType();
7760
7761 // Skip if insert_vec_elt is not supported.
7762 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7763 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7764 return SDValue();
7765
7766 unsigned NumElems = Op.getNumOperands();
7767 SDValue VecIn1;
7768 SDValue VecIn2;
7769 SmallVector<unsigned, 4> InsertIndices;
7770 SmallVector<int, 8> Mask(NumElems, -1);
7771
7772 for (unsigned i = 0; i != NumElems; ++i) {
7773 unsigned Opc = Op.getOperand(i).getOpcode();
7774
7775 if (Opc == ISD::UNDEF)
7776 continue;
7777
7778 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7779 // Quit if more than 1 element needs inserting.
7780 if (InsertIndices.size() > 1)
7781 return SDValue();
7782
7783 InsertIndices.push_back(i);
7784 continue;
7785 }
7786
7787 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7788 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7789
7790 // Quit if non-constant index.
7791 if (!isa<ConstantSDNode>(ExtIdx))
7792 return SDValue();
7793 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7794
7795 // Quit if extracted from vector of different type.
7796 if (ExtractedFromVec.getValueType() != VT)
7797 return SDValue();
7798
7799 if (!VecIn1.getNode())
7800 VecIn1 = ExtractedFromVec;
7801 else if (VecIn1 != ExtractedFromVec) {
7802 if (!VecIn2.getNode())
7803 VecIn2 = ExtractedFromVec;
7804 else if (VecIn2 != ExtractedFromVec)
7805 // Quit if there are more than 2 vectors to shuffle.
7806 return SDValue();
7807 }
7808
7809 if (ExtractedFromVec == VecIn1)
7810 Mask[i] = Idx;
7811 else if (ExtractedFromVec == VecIn2)
7812 Mask[i] = Idx + NumElems;
7813 }
7814
7815 if (!VecIn1.getNode())
7816 return SDValue();
7817
7818 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7819 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7820
7821 for (unsigned Idx : InsertIndices)
7822 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7823 DAG.getIntPtrConstant(Idx, DL));
7824
7825 return NV;
7826}
7827
7828// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
7829static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
7830 const X86Subtarget &Subtarget) {
7831 MVT VT = Op.getSimpleValueType();
7832 MVT IVT =
7833 VT.changeVectorElementType(Subtarget.hasFP16() ? MVT::f16 : MVT::i16);
7834 SmallVector<SDValue, 16> NewOps;
7835 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
7836 NewOps.push_back(DAG.getBitcast(Subtarget.hasFP16() ? MVT::f16 : MVT::i16,
7837 Op.getOperand(I)));
7838 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
7839 return DAG.getBitcast(VT, Res);
7840}
7841
7842// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7843static SDValue LowerBUILD_VECTORvXi1(SDValue Op, const SDLoc &dl,
7844 SelectionDAG &DAG,
7845 const X86Subtarget &Subtarget) {
7846
7847 MVT VT = Op.getSimpleValueType();
7848 assert((VT.getVectorElementType() == MVT::i1) &&
7849 "Unexpected type in LowerBUILD_VECTORvXi1!");
7850 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
7851 ISD::isBuildVectorAllOnes(Op.getNode()))
7852 return Op;
7853
7854 uint64_t Immediate = 0;
7855 SmallVector<unsigned, 16> NonConstIdx;
7856 bool IsSplat = true;
7857 bool HasConstElts = false;
7858 int SplatIdx = -1;
7859 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7860 SDValue In = Op.getOperand(idx);
7861 if (In.isUndef())
7862 continue;
7863 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
7864 Immediate |= (InC->getZExtValue() & 0x1) << idx;
7865 HasConstElts = true;
7866 } else {
7867 NonConstIdx.push_back(idx);
7868 }
7869 if (SplatIdx < 0)
7870 SplatIdx = idx;
7871 else if (In != Op.getOperand(SplatIdx))
7872 IsSplat = false;
7873 }
7874
7875 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7876 if (IsSplat) {
7877 // The build_vector allows the scalar element to be larger than the vector
7878 // element type. We need to mask it to use as a condition unless we know
7879 // the upper bits are zero.
7880 // FIXME: Use computeKnownBits instead of checking specific opcode?
7881 SDValue Cond = Op.getOperand(SplatIdx);
7882 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
7883 if (Cond.getOpcode() != ISD::SETCC)
7884 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
7885 DAG.getConstant(1, dl, MVT::i8));
7886
7887 // Perform the select in the scalar domain so we can use cmov.
7888 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7889 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
7890 DAG.getAllOnesConstant(dl, MVT::i32),
7891 DAG.getConstant(0, dl, MVT::i32));
7892 Select = DAG.getBitcast(MVT::v32i1, Select);
7893 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
7894 } else {
7895 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7896 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
7897 DAG.getAllOnesConstant(dl, ImmVT),
7898 DAG.getConstant(0, dl, ImmVT));
7899 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7900 Select = DAG.getBitcast(VecVT, Select);
7901 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
7902 DAG.getIntPtrConstant(0, dl));
7903 }
7904 }
7905
7906 // Insert the non-constant elements one by one.
7907 SDValue DstVec;
7908 if (HasConstElts) {
7909 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7910 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
7911 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
7912 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
7913 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
7914 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
7915 } else {
7916 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
7917 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
7918 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
7919 DstVec = DAG.getBitcast(VecVT, Imm);
7920 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
7921 DAG.getIntPtrConstant(0, dl));
7922 }
7923 } else
7924 DstVec = DAG.getUNDEF(VT);
7925
7926 for (unsigned InsertIdx : NonConstIdx) {
7927 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7928 Op.getOperand(InsertIdx),
7929 DAG.getIntPtrConstant(InsertIdx, dl));
7930 }
7931 return DstVec;
7932}
7933
7934LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
7935 switch (Opcode) {
7936 case X86ISD::PACKSS:
7937 case X86ISD::PACKUS:
7938 case X86ISD::FHADD:
7939 case X86ISD::FHSUB:
7940 case X86ISD::HADD:
7941 case X86ISD::HSUB:
7942 return true;
7943 }
7944 return false;
7945}
7946
7947/// This is a helper function of LowerToHorizontalOp().
7948/// This function checks whether the input build_vector \p N implements a
7949/// 128-bit partial horizontal operation on a 256-bit vector; that operation
7950/// may not match the layout of an x86 256-bit horizontal instruction.
7951/// In other words, if this returns true, then some extraction/insertion will
7952/// be required to produce a valid horizontal instruction.
7953///
7954/// Parameter \p Opcode defines the kind of horizontal operation to match.
7955/// For example, if \p Opcode is equal to ISD::ADD, then this function
7956/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7957/// is equal to ISD::SUB, then this function checks if this is a horizontal
7958/// arithmetic sub.
7959///
7960/// This function only analyzes elements of \p N whose indices are
7961/// in range [BaseIdx, LastIdx).
7962///
7963/// TODO: This function was originally used to match both real and fake partial
7964/// horizontal operations, but the index-matching logic is incorrect for that.
7965/// See the corrected implementation in isHopBuildVector(). Can we reduce this
7966/// code because it is only used for partial h-op matching now?
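// For example, with Opcode == ISD::ADD, BaseIdx == 0 and LastIdx == 4, the
// analyzed elements of a matching build_vector look roughly like:
//   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
//   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
//   (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
//   (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
// with A returned in V0 and B returned in V1.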
7967static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
7968 const SDLoc &DL, SelectionDAG &DAG,
7969 unsigned BaseIdx, unsigned LastIdx,
7970 SDValue &V0, SDValue &V1) {
7971 EVT VT = N->getValueType(0);
7972 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
7973 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7974 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7975 "Invalid Vector in input!");
7976
7977 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7978 bool CanFold = true;
7979 unsigned ExpectedVExtractIdx = BaseIdx;
7980 unsigned NumElts = LastIdx - BaseIdx;
7981 V0 = DAG.getUNDEF(VT);
7982 V1 = DAG.getUNDEF(VT);
7983
7984 // Check if N implements a horizontal binop.
7985 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7986 SDValue Op = N->getOperand(i + BaseIdx);
7987
7988 // Skip UNDEFs.
7989 if (Op->isUndef()) {
7990 // Update the expected vector extract index.
7991 if (i * 2 == NumElts)
7992 ExpectedVExtractIdx = BaseIdx;
7993 ExpectedVExtractIdx += 2;
7994 continue;
7995 }
7996
7997 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7998
7999 if (!CanFold)
8000 break;
8001
8002 SDValue Op0 = Op.getOperand(0);
8003 SDValue Op1 = Op.getOperand(1);
8004
8005 // Try to match the following pattern:
8006 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
8007 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8008 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
8009 Op0.getOperand(0) == Op1.getOperand(0) &&
8010 isa<ConstantSDNode>(Op0.getOperand(1)) &&
8011 isa<ConstantSDNode>(Op1.getOperand(1)));
8012 if (!CanFold)
8013 break;
8014
8015 unsigned I0 = Op0.getConstantOperandVal(1);
8016 unsigned I1 = Op1.getConstantOperandVal(1);
8017
8018 if (i * 2 < NumElts) {
8019 if (V0.isUndef()) {
8020 V0 = Op0.getOperand(0);
8021 if (V0.getValueType() != VT)
8022 return false;
8023 }
8024 } else {
8025 if (V1.isUndef()) {
8026 V1 = Op0.getOperand(0);
8027 if (V1.getValueType() != VT)
8028 return false;
8029 }
8030 if (i * 2 == NumElts)
8031 ExpectedVExtractIdx = BaseIdx;
8032 }
8033
8034 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
8035 if (I0 == ExpectedVExtractIdx)
8036 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
8037 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
8038 // Try to match the following dag sequence:
8039 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
8040 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
8041 } else
8042 CanFold = false;
8043
8044 ExpectedVExtractIdx += 2;
8045 }
8046
8047 return CanFold;
8048}
8049
8050/// Emit a sequence of two 128-bit horizontal add/sub followed by
8051/// a concat_vector.
8052///
8053/// This is a helper function of LowerToHorizontalOp().
8054/// This function expects two 256-bit vectors called V0 and V1.
8055/// At first, each vector is split into two separate 128-bit vectors.
8056/// Then, the resulting 128-bit vectors are used to implement two
8057/// horizontal binary operations.
8058///
8059/// The kind of horizontal binary operation is defined by \p X86Opcode.
8060///
8061/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs to
8062/// the two new horizontal binops.
8063/// When Mode is set, the first horizontal binop dag node would take as input
8064/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
8065/// horizontal binop dag node would take as input the lower 128-bit of V1
8066/// and the upper 128-bit of V1.
8067/// Example:
8068/// HADD V0_LO, V0_HI
8069/// HADD V1_LO, V1_HI
8070///
8071/// Otherwise, the first horizontal binop dag node takes as input the lower
8072/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
8073/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
8074/// Example:
8075/// HADD V0_LO, V1_LO
8076/// HADD V0_HI, V1_HI
8077///
8078/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
8079/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
8080/// the upper 128-bits of the result.
8081static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
8082 const SDLoc &DL, SelectionDAG &DAG,
8083 unsigned X86Opcode, bool Mode,
8084 bool isUndefLO, bool isUndefHI) {
8085 MVT VT = V0.getSimpleValueType();
8086 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
8087 "Invalid nodes in input!");
8088
8089 unsigned NumElts = VT.getVectorNumElements();
8090 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
8091 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
8092 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
8093 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
8094 MVT NewVT = V0_LO.getSimpleValueType();
8095
8096 SDValue LO = DAG.getUNDEF(NewVT);
8097 SDValue HI = DAG.getUNDEF(NewVT);
8098
8099 if (Mode) {
8100 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8101 if (!isUndefLO && !V0->isUndef())
8102 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
8103 if (!isUndefHI && !V1->isUndef())
8104 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
8105 } else {
8106 // Don't emit a horizontal binop if the result is expected to be UNDEF.
8107 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
8108 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
8109
8110 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
8111 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
8112 }
8113
8114 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
8115}
8116
8117/// Returns true iff \p BV builds a vector with the result equivalent to
8118/// the result of ADDSUB/SUBADD operation.
8119/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
8120/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
8121/// \p Opnd0 and \p Opnd1.
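// e.g. a v4f32 build_vector of
//   (fsub a0, b0), (fadd a1, b1), (fsub a2, b2), (fadd a3, b3)
// is recognized here as ADDSUB(a, b); the opposite parity (fadd on the even
// lanes) is reported as SUBADD via \p IsSubAdd.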
8122static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
8123 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8124 SDValue &Opnd0, SDValue &Opnd1,
8125 unsigned &NumExtracts,
8126 bool &IsSubAdd) {
8127
8128 MVT VT = BV->getSimpleValueType(0);
8129 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
8130 return false;
8131
8132 unsigned NumElts = VT.getVectorNumElements();
8133 SDValue InVec0 = DAG.getUNDEF(VT);
8134 SDValue InVec1 = DAG.getUNDEF(VT);
8135
8136 NumExtracts = 0;
8137
8138 // Odd-numbered elements in the input build vector are obtained from
8139 // adding/subtracting two integer/float elements.
8140 // Even-numbered elements in the input build vector are obtained from
8141 // subtracting/adding two integer/float elements.
8142 unsigned Opc[2] = {0, 0};
8143 for (unsigned i = 0, e = NumElts; i != e; ++i) {
8144 SDValue Op = BV->getOperand(i);
8145
8146 // Skip 'undef' values.
8147 unsigned Opcode = Op.getOpcode();
8148 if (Opcode == ISD::UNDEF)
8149 continue;
8150
8151 // Early exit if we found an unexpected opcode.
8152 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
8153 return false;
8154
8155 SDValue Op0 = Op.getOperand(0);
8156 SDValue Op1 = Op.getOperand(1);
8157
8158 // Try to match the following pattern:
8159 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
8160 // Early exit if we cannot match that sequence.
8161 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8162 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8163 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8164 Op0.getOperand(1) != Op1.getOperand(1))
8165 return false;
8166
8167 unsigned I0 = Op0.getConstantOperandVal(1);
8168 if (I0 != i)
8169 return false;
8170
8171 // We found a valid add/sub node; make sure it's the same opcode as the
8172 // previous elements for this parity.
8173 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
8174 return false;
8175 Opc[i % 2] = Opcode;
8176
8177 // Update InVec0 and InVec1.
8178 if (InVec0.isUndef()) {
8179 InVec0 = Op0.getOperand(0);
8180 if (InVec0.getSimpleValueType() != VT)
8181 return false;
8182 }
8183 if (InVec1.isUndef()) {
8184 InVec1 = Op1.getOperand(0);
8185 if (InVec1.getSimpleValueType() != VT)
8186 return false;
8187 }
8188
8189 // Make sure that the operands of each add/sub node always
8190 // come from the same pair of vectors.
8191 if (InVec0 != Op0.getOperand(0)) {
8192 if (Opcode == ISD::FSUB)
8193 return false;
8194
8195 // FADD is commutable. Try to commute the operands
8196 // and then test again.
8197 std::swap(Op0, Op1);
8198 if (InVec0 != Op0.getOperand(0))
8199 return false;
8200 }
8201
8202 if (InVec1 != Op1.getOperand(0))
8203 return false;
8204
8205 // Increment the number of extractions done.
8206 ++NumExtracts;
8207 }
8208
8209 // Ensure we have found an opcode for both parities and that they are
8210 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
8211 // inputs are undef.
8212 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
8213 InVec0.isUndef() || InVec1.isUndef())
8214 return false;
8215
8216 IsSubAdd = Opc[0] == ISD::FADD;
8217
8218 Opnd0 = InVec0;
8219 Opnd1 = InVec1;
8220 return true;
8221}
8222
8223/// Returns true if it is possible to fold MUL and an idiom that has already been
8224/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
8225/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
8226/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
8227///
8228/// Prior to calling this function it should be known that there is some
8229/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
8230/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
8231/// before replacement of such SDNode with ADDSUB operation. Thus the number
8232/// of \p Opnd0 uses is expected to be equal to 2.
8233/// For example, this function may be called for the following IR:
8234/// %AB = fmul fast <2 x double> %A, %B
8235/// %Sub = fsub fast <2 x double> %AB, %C
8236/// %Add = fadd fast <2 x double> %AB, %C
8237/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
8238/// <2 x i32> <i32 0, i32 3>
8239/// There is a def for %Addsub here, which potentially can be replaced by
8240/// X86ISD::ADDSUB operation:
8241/// %Addsub = X86ISD::ADDSUB %AB, %C
8242/// and such ADDSUB can further be replaced with FMADDSUB:
8243/// %Addsub = FMADDSUB %A, %B, %C.
8244///
8245/// The main reason why this method is called before the replacement of the
8246/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
8247/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
8248/// FMADDSUB is.
8249static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
8250 SelectionDAG &DAG,
8251 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
8252 unsigned ExpectedUses) {
8253 if (Opnd0.getOpcode() != ISD::FMUL ||
8254 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
8255 return false;
8256
8257 // FIXME: These checks must match the similar ones in
8258 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
8259 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
8260 // or MUL + ADDSUB to FMADDSUB.
8261 const TargetOptions &Options = DAG.getTarget().Options;
8262 bool AllowFusion =
8263 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
8264 if (!AllowFusion)
8265 return false;
8266
8267 Opnd2 = Opnd1;
8268 Opnd1 = Opnd0.getOperand(1);
8269 Opnd0 = Opnd0.getOperand(0);
8270
8271 return true;
8272}
8273
8274/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
8275/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
8276/// X86ISD::FMSUBADD node.
8277static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
8278 const SDLoc &DL,
8279 const X86Subtarget &Subtarget,
8280 SelectionDAG &DAG) {
8281 SDValue Opnd0, Opnd1;
8282 unsigned NumExtracts;
8283 bool IsSubAdd;
8284 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
8285 IsSubAdd))
8286 return SDValue();
8287
8288 MVT VT = BV->getSimpleValueType(0);
8289
8290 // Try to generate X86ISD::FMADDSUB node here.
8291 SDValue Opnd2;
8292 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
8293 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
8294 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
8295 }
8296
8297 // We only support ADDSUB.
8298 if (IsSubAdd)
8299 return SDValue();
8300
8301 // There are no known X86 targets with 512-bit ADDSUB instructions!
8302 // Convert to blend(fsub,fadd).
8303 if (VT.is512BitVector()) {
8304 SmallVector<int> Mask;
8305 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
8306 Mask.push_back(I);
8307 Mask.push_back(I + E + 1);
8308 }
8309 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
8310 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
8311 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
8312 }
8313
8314 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
8315}
8316
8317static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
8318 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
8319 // Initialize outputs to known values.
8320 MVT VT = BV->getSimpleValueType(0);
8321 HOpcode = ISD::DELETED_NODE;
8322 V0 = DAG.getUNDEF(VT);
8323 V1 = DAG.getUNDEF(VT);
8324
8325 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
8326 // half of the result is calculated independently from the 128-bit halves of
8327 // the inputs, so that makes the index-checking logic below more complicated.
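// e.g. for v8i32, (HADD A, B) produces
//   <a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7>
// so the expected extract indices depend on which 64-bit chunk of each
// 128-bit half of the result is being built.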
8328 unsigned NumElts = VT.getVectorNumElements();
8329 unsigned GenericOpcode = ISD::DELETED_NODE;
8330 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
8331 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
8332 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
8333 for (unsigned i = 0; i != Num128BitChunks; ++i) {
8334 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
8335 // Ignore undef elements.
8336 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
8337 if (Op.isUndef())
8338 continue;
8339
8340 // If there's an opcode mismatch, we're done.
8341 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
8342 return false;
8343
8344 // Initialize horizontal opcode.
8345 if (HOpcode == ISD::DELETED_NODE) {
8346 GenericOpcode = Op.getOpcode();
8347 switch (GenericOpcode) {
8348 // clang-format off
8349 case ISD::ADD: HOpcode = X86ISD::HADD; break;
8350 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
8351 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
8352 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
8353 default: return false;
8354 // clang-format on
8355 }
8356 }
8357
8358 SDValue Op0 = Op.getOperand(0);
8359 SDValue Op1 = Op.getOperand(1);
8360 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8361 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8362 Op0.getOperand(0) != Op1.getOperand(0) ||
8363 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
8364 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
8365 return false;
8366
8367 // The source vector is chosen based on which 64-bit half of the
8368 // destination vector is being calculated.
8369 if (j < NumEltsIn64Bits) {
8370 if (V0.isUndef())
8371 V0 = Op0.getOperand(0);
8372 } else {
8373 if (V1.isUndef())
8374 V1 = Op0.getOperand(0);
8375 }
8376
8377 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
8378 if (SourceVec != Op0.getOperand(0))
8379 return false;
8380
8381 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
8382 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
8383 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
8384 unsigned ExpectedIndex = i * NumEltsIn128Bits +
8385 (j % NumEltsIn64Bits) * 2;
8386 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
8387 continue;
8388
8389 // If this is not a commutative op, this does not match.
8390 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
8391 return false;
8392
8393 // Addition is commutative, so try swapping the extract indexes.
8394 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
8395 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
8396 continue;
8397
8398 // Extract indexes do not match horizontal requirement.
8399 return false;
8400 }
8401 }
8402 // We matched. Opcode and operands are returned by reference as arguments.
8403 return true;
8404}
8405
8406static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
8407 const SDLoc &DL, SelectionDAG &DAG,
8408 unsigned HOpcode, SDValue V0, SDValue V1) {
8409 // If either input vector is not the same size as the build vector,
8410 // extract/insert the low bits to the correct size.
8411 // This is free (examples: zmm --> xmm, xmm --> ymm).
8412 MVT VT = BV->getSimpleValueType(0);
8413 unsigned Width = VT.getSizeInBits();
8414 if (V0.getValueSizeInBits() > Width)
8415 V0 = extractSubVector(V0, 0, DAG, DL, Width);
8416 else if (V0.getValueSizeInBits() < Width)
8417 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, DL, Width);
8418
8419 if (V1.getValueSizeInBits() > Width)
8420 V1 = extractSubVector(V1, 0, DAG, DL, Width);
8421 else if (V1.getValueSizeInBits() < Width)
8422 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, DL, Width);
8423
8424 unsigned NumElts = VT.getVectorNumElements();
8425 APInt DemandedElts = APInt::getAllOnes(NumElts);
8426 for (unsigned i = 0; i != NumElts; ++i)
8427 if (BV->getOperand(i).isUndef())
8428 DemandedElts.clearBit(i);
8429
8430 // If we don't need the upper xmm, then perform as a xmm hop.
8431 unsigned HalfNumElts = NumElts / 2;
8432 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
8433 MVT HalfVT = VT.getHalfNumVectorElementsVT();
8434 V0 = extractSubVector(V0, 0, DAG, DL, 128);
8435 V1 = extractSubVector(V1, 0, DAG, DL, 128);
8436 SDValue Half = DAG.getNode(HOpcode, DL, HalfVT, V0, V1);
8437 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, DL, 256);
8438 }
8439
8440 return DAG.getNode(HOpcode, DL, VT, V0, V1);
8441}
8442
8443/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
8444static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const SDLoc &DL,
8445 const X86Subtarget &Subtarget,
8446 SelectionDAG &DAG) {
8447 // We need at least 2 non-undef elements to make this worthwhile by default.
8448 unsigned NumNonUndefs =
8449 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
8450 if (NumNonUndefs < 2)
8451 return SDValue();
8452
8453 // There are 4 sets of horizontal math operations distinguished by type:
8454 // int/FP at 128-bit/256-bit. Each type was introduced with a different
8455 // subtarget feature. Try to match those "native" patterns first.
8456 MVT VT = BV->getSimpleValueType(0);
8457 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
8458 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
8459 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
8460 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
8461 unsigned HOpcode;
8462 SDValue V0, V1;
8463 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
8464 return getHopForBuildVector(BV, DL, DAG, HOpcode, V0, V1);
8465 }
8466
8467 // Try harder to match 256-bit ops by using extract/concat.
8468 if (!Subtarget.hasAVX() || !VT.is256BitVector())
8469 return SDValue();
8470
8471 // Count the number of UNDEF operands in the input build_vector.
8472 unsigned NumElts = VT.getVectorNumElements();
8473 unsigned Half = NumElts / 2;
8474 unsigned NumUndefsLO = 0;
8475 unsigned NumUndefsHI = 0;
8476 for (unsigned i = 0, e = Half; i != e; ++i)
8477 if (BV->getOperand(i)->isUndef())
8478 NumUndefsLO++;
8479
8480 for (unsigned i = Half, e = NumElts; i != e; ++i)
8481 if (BV->getOperand(i)->isUndef())
8482 NumUndefsHI++;
8483
8484 SDValue InVec0, InVec1;
8485 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
8486 SDValue InVec2, InVec3;
8487 unsigned X86Opcode;
8488 bool CanFold = true;
8489
8490 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, Half, InVec0, InVec1) &&
8491 isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, Half, NumElts, InVec2,
8492 InVec3) &&
8493 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8494 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8495 X86Opcode = X86ISD::HADD;
8496 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, Half, InVec0,
8497 InVec1) &&
8498 isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, Half, NumElts, InVec2,
8499 InVec3) &&
8500 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
8501 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
8502 X86Opcode = X86ISD::HSUB;
8503 else
8504 CanFold = false;
8505
8506 if (CanFold) {
8507 // Do not try to expand this build_vector into a pair of horizontal
8508 // add/sub if we can emit a pair of scalar add/sub.
8509 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8510 return SDValue();
8511
8512 // Convert this build_vector into a pair of horizontal binops followed by
8513 // a concat vector. We must adjust the outputs from the partial horizontal
8514 // matching calls above to account for undefined vector halves.
8515 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
8516 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
8517 assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
8518 bool isUndefLO = NumUndefsLO == Half;
8519 bool isUndefHI = NumUndefsHI == Half;
8520 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
8521 isUndefHI);
8522 }
8523 }
8524
8525 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
8526 VT == MVT::v16i16) {
8527 unsigned X86Opcode;
8528 if (isHorizontalBinOpPart(BV, ISD::ADD, DL, DAG, 0, NumElts, InVec0,
8529 InVec1))
8530 X86Opcode = X86ISD::HADD;
8531 else if (isHorizontalBinOpPart(BV, ISD::SUB, DL, DAG, 0, NumElts, InVec0,
8532 InVec1))
8533 X86Opcode = X86ISD::HSUB;
8534 else if (isHorizontalBinOpPart(BV, ISD::FADD, DL, DAG, 0, NumElts, InVec0,
8535 InVec1))
8536 X86Opcode = X86ISD::FHADD;
8537 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DL, DAG, 0, NumElts, InVec0,
8538 InVec1))
8539 X86Opcode = X86ISD::FHSUB;
8540 else
8541 return SDValue();
8542
8543 // Don't try to expand this build_vector into a pair of horizontal add/sub
8544 // if we can simply emit a pair of scalar add/sub.
8545 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
8546 return SDValue();
8547
8548 // Convert this build_vector into two horizontal add/sub followed by
8549 // a concat vector.
8550 bool isUndefLO = NumUndefsLO == Half;
8551 bool isUndefHI = NumUndefsHI == Half;
8552 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
8553 isUndefLO, isUndefHI);
8554 }
8555
8556 return SDValue();
8557}
8558
8559static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
8560 SelectionDAG &DAG);
8561
8562/// If a BUILD_VECTOR's source elements all apply the same bit operation and
8563/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
8564/// just apply the bit operation to the vectors.
8565/// NOTE: It's not in our interest to start making a general purpose vectorizer
8566/// from this, but enough scalar bit operations are created by the later
8567/// legalization + scalarization stages to need basic support.
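// For instance, (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
// can be lowered as (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)).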
8568static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, const SDLoc &DL,
8569 const X86Subtarget &Subtarget,
8570 SelectionDAG &DAG) {
8571 MVT VT = Op->getSimpleValueType(0);
8572 unsigned NumElems = VT.getVectorNumElements();
8573 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
8574
8575 // Check that all elements have the same opcode.
8576 // TODO: Should we allow UNDEFS and if so how many?
8577 unsigned Opcode = Op->getOperand(0).getOpcode();
8578 for (unsigned i = 1; i < NumElems; ++i)
8579 if (Opcode != Op->getOperand(i).getOpcode())
8580 return SDValue();
8581
8582 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
8583 bool IsShift = false;
8584 switch (Opcode) {
8585 default:
8586 return SDValue();
8587 case ISD::SHL:
8588 case ISD::SRL:
8589 case ISD::SRA:
8590 IsShift = true;
8591 break;
8592 case ISD::AND:
8593 case ISD::XOR:
8594 case ISD::OR:
8595 // Don't do this if the buildvector is a splat - we'd replace one
8596 // constant with an entire vector.
8597 if (Op->getSplatValue())
8598 return SDValue();
8599 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
8600 return SDValue();
8601 break;
8602 }
8603
8604 SmallVector<SDValue, 4> LHSElts, RHSElts;
8605 for (SDValue Elt : Op->ops()) {
8606 SDValue LHS = Elt.getOperand(0);
8607 SDValue RHS = Elt.getOperand(1);
8608
8609 // We expect the canonicalized RHS operand to be the constant.
8610 if (!isa<ConstantSDNode>(RHS))
8611 return SDValue();
8612
8613 // Extend shift amounts.
8614 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
8615 if (!IsShift)
8616 return SDValue();
8617 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
8618 }
8619
8620 LHSElts.push_back(LHS);
8621 RHSElts.push_back(RHS);
8622 }
8623
8624 // Limit to shifts by uniform immediates.
8625 // TODO: Only accept vXi8/vXi64 special cases?
8626 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
8627 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
8628 return SDValue();
8629
8630 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
8631 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
8632 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
8633
8634 if (!IsShift)
8635 return Res;
8636
8637 // Immediately lower the shift to ensure the constant build vector doesn't
8638 // get converted to a constant pool before the shift is lowered.
8639 return LowerShift(Res, Subtarget, DAG);
8640}
8641
8642/// Create a vector constant without a load. SSE/AVX provide the bare minimum
8643/// functionality to do this, so it's all zeros, all ones, or some derivation
8644/// that is cheap to calculate.
8645static SDValue materializeVectorConstant(SDValue Op, const SDLoc &DL,
8646 SelectionDAG &DAG,
8647 const X86Subtarget &Subtarget) {
8648 MVT VT = Op.getSimpleValueType();
8649
8650 // Vectors containing all zeros can be matched by pxor and xorps.
8651 if (ISD::isBuildVectorAllZeros(Op.getNode()))
8652 return Op;
8653
8654 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
8655 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
8656 // vpcmpeqd on 256-bit vectors.
8657 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
8658 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
8659 return Op;
8660
8661 return getOnesVector(VT, DAG, DL);
8662 }
8663
8664 return SDValue();
8665}
8666
8667/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
8668/// from a vector of source values and a vector of extraction indices.
8669/// The vectors might be manipulated to match the type of the permute op.
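// For example, a v16i8 source with SSSE3 is expected to become a single
// X86ISD::PSHUFB of SrcVec using IndicesVec as the byte shuffle mask.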
8670static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8671 const SDLoc &DL, SelectionDAG &DAG,
8672 const X86Subtarget &Subtarget) {
8673 MVT ShuffleVT = VT;
8674 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8675 unsigned NumElts = VT.getVectorNumElements();
8676 unsigned SizeInBits = VT.getSizeInBits();
8677
8678 // Adjust IndicesVec to match VT size.
8679 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8680 "Illegal variable permute mask size");
8681 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
8682 // Narrow/widen the indices vector to the correct size.
8683 if (IndicesVec.getValueSizeInBits() > SizeInBits)
8684 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8685 NumElts * VT.getScalarSizeInBits());
8686 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
8687 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
8688 SDLoc(IndicesVec), SizeInBits);
8689 // Zero-extend the index elements within the vector.
8690 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8691 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
8692 IndicesVT, IndicesVec);
8693 }
8694 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8695
8696 // Handle a SrcVec whose size doesn't match the VT size.
8697 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8698 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8699 // Handle larger SrcVec by treating it as a larger permute.
8700 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8701 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8702 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8703 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8704 Subtarget, DAG, SDLoc(IndicesVec));
8705 SDValue NewSrcVec =
8706 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8707 if (NewSrcVec)
8708 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
8709 return SDValue();
8710 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8711 // Widen smaller SrcVec to match VT.
8712 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8713 } else
8714 return SDValue();
8715 }
8716
8717 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8718 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8719 EVT SrcVT = Idx.getValueType();
8720 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8721 uint64_t IndexScale = 0;
8722 uint64_t IndexOffset = 0;
8723
8724 // If we're scaling a smaller permute op, then we need to repeat the
8725 // indices, scaling and offsetting them as well.
8726 // e.g. v4i32 -> v16i8 (Scale = 4)
8727 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8728 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8729 for (uint64_t i = 0; i != Scale; ++i) {
8730 IndexScale |= Scale << (i * NumDstBits);
8731 IndexOffset |= i << (i * NumDstBits);
8732 }
8733
8734 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8735 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8736 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8737 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8738 return Idx;
8739 };
8740
8741 unsigned Opcode = 0;
8742 switch (VT.SimpleTy) {
8743 default:
8744 break;
8745 case MVT::v16i8:
8746 if (Subtarget.hasSSSE3())
8747 Opcode = X86ISD::PSHUFB;
8748 break;
8749 case MVT::v8i16:
8750 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8751 Opcode = X86ISD::VPERMV;
8752 else if (Subtarget.hasSSSE3()) {
8753 Opcode = X86ISD::PSHUFB;
8754 ShuffleVT = MVT::v16i8;
8755 }
8756 break;
8757 case MVT::v4f32:
8758 case MVT::v4i32:
8759 if (Subtarget.hasAVX()) {
8760 Opcode = X86ISD::VPERMILPV;
8761 ShuffleVT = MVT::v4f32;
8762 } else if (Subtarget.hasSSSE3()) {
8763 Opcode = X86ISD::PSHUFB;
8764 ShuffleVT = MVT::v16i8;
8765 }
8766 break;
8767 case MVT::v2f64:
8768 case MVT::v2i64:
8769 if (Subtarget.hasAVX()) {
8770 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8771 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8772 Opcode = X86ISD::VPERMILPV;
8773 ShuffleVT = MVT::v2f64;
8774 } else if (Subtarget.hasSSE41()) {
8775 // SSE41 can compare v2i64 - select between indices 0 and 1.
8776 return DAG.getSelectCC(
8777 DL, IndicesVec,
8778 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8779 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8780 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8781 ISD::CondCode::SETEQ);
8782 }
8783 break;
8784 case MVT::v32i8:
8785 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8786 Opcode = X86ISD::VPERMV;
8787 else if (Subtarget.hasXOP()) {
8788 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8789 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8790 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8791 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8792 return DAG.getNode(
8793 ISD::CONCAT_VECTORS, DL, MVT::v32i8,
8794 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8795 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8796 } else if (Subtarget.hasAVX()) {
8797 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8798 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8799 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8800 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8801 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8802 ArrayRef<SDValue> Ops) {
8803 // Permute Lo and Hi and then select based on index range.
8804 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8805 // care about bit[7] as it's just an index vector.
8806 SDValue Idx = Ops[2];
8807 EVT VT = Idx.getValueType();
8808 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8809 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8810 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8811 ISD::SETGT);
8812 };
8813 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8814 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8815 PSHUFBBuilder);
8816 }
8817 break;
8818 case MVT::v16i16:
8819 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8820 Opcode = X86ISD::VPERMV;
8821 else if (Subtarget.hasAVX()) {
8822 // Scale to v32i8 and perform as v32i8.
8823 IndicesVec = ScaleIndices(IndicesVec, 2);
8824 return DAG.getBitcast(
8825 VT, createVariablePermute(
8826 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8827 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8828 }
8829 break;
8830 case MVT::v8f32:
8831 case MVT::v8i32:
8832 if (Subtarget.hasAVX2())
8833 Opcode = X86ISD::VPERMV;
8834 else if (Subtarget.hasAVX()) {
8835 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8836 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8837 {0, 1, 2, 3, 0, 1, 2, 3});
8838 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8839 {4, 5, 6, 7, 4, 5, 6, 7});
8840 if (Subtarget.hasXOP())
8841 return DAG.getBitcast(
8842 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
8843 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8844 // Permute Lo and Hi and then select based on index range.
8845 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8846 SDValue Res = DAG.getSelectCC(
8847 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8848 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8849 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8850 ISD::SETGT);
8851 return DAG.getBitcast(VT, Res);
8852 }
8853 break;
8854 case MVT::v4i64:
8855 case MVT::v4f64:
8856 if (Subtarget.hasAVX512()) {
8857 if (!Subtarget.hasVLX()) {
8858 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8859 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8860 SDLoc(SrcVec));
8861 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8862 DAG, SDLoc(IndicesVec));
8863 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8864 DAG, Subtarget);
8865 return extract256BitVector(Res, 0, DAG, DL);
8866 }
8867 Opcode = X86ISD::VPERMV;
8868 } else if (Subtarget.hasAVX()) {
8869 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8870 SDValue LoLo =
8871 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8872 SDValue HiHi =
8873 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8874 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8875 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8876 if (Subtarget.hasXOP())
8877 return DAG.getBitcast(
8878 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
8879 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
8880 // Permute Lo and Hi and then select based on index range.
8881 // This works as VPERMILPD only uses index bit[1] to permute elements.
8882 SDValue Res = DAG.getSelectCC(
8883 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8884 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8885 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8886 ISD::SETGT);
8887 return DAG.getBitcast(VT, Res);
8888 }
8889 break;
8890 case MVT::v64i8:
8891 if (Subtarget.hasVBMI())
8892 Opcode = X86ISD::VPERMV;
8893 break;
8894 case MVT::v32i16:
8895 if (Subtarget.hasBWI())
8896 Opcode = X86ISD::VPERMV;
8897 break;
8898 case MVT::v16f32:
8899 case MVT::v16i32:
8900 case MVT::v8f64:
8901 case MVT::v8i64:
8902 if (Subtarget.hasAVX512())
8903 Opcode = X86ISD::VPERMV;
8904 break;
8905 }
8906 if (!Opcode)
8907 return SDValue();
8908
8909 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8910 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8911 "Illegal variable permute shuffle type");
8912
8913 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8914 if (Scale > 1)
8915 IndicesVec = ScaleIndices(IndicesVec, Scale);
8916
8917 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8918 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8919
8920 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8921 SDValue Res = Opcode == X86ISD::VPERMV
8922 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8923 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8924 return DAG.getBitcast(VT, Res);
8925}
8926
8927// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8928 // recognized as a permutation of a vector by indices in a non-constant vector.
8929// (build_vector (extract_elt V, (extract_elt I, 0)),
8930// (extract_elt V, (extract_elt I, 1)),
8931// ...
8932// ->
8933// (vpermv I, V)
8934//
8935// TODO: Handle undefs
8936// TODO: Utilize pshufb and zero mask blending to support more efficient
8937// construction of vectors with constant-0 elements.
8938static SDValue
8939 LowerBUILD_VECTORAsVariablePermute(SDValue V, const SDLoc &DL,
8940 SelectionDAG &DAG,
8941 const X86Subtarget &Subtarget) {
8942 SDValue SrcVec, IndicesVec;
8943 // Check for a match of the permute source vector and permute index elements.
8944 // This is done by checking that the i-th build_vector operand is of the form:
8945 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8946 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8947 SDValue Op = V.getOperand(Idx);
8948 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8949 return SDValue();
8950
8951 // If this is the first extract encountered in V, set the source vector,
8952 // otherwise verify the extract is from the previously defined source
8953 // vector.
8954 if (!SrcVec)
8955 SrcVec = Op.getOperand(0);
8956 else if (SrcVec != Op.getOperand(0))
8957 return SDValue();
8958 SDValue ExtractedIndex = Op->getOperand(1);
8959 // Peek through extends.
8960 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8961 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8962 ExtractedIndex = ExtractedIndex.getOperand(0);
8963 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8964 return SDValue();
8965
8966 // If this is the first extract from the index vector candidate, set the
8967 // indices vector, otherwise verify the extract is from the previously
8968 // defined indices vector.
8969 if (!IndicesVec)
8970 IndicesVec = ExtractedIndex.getOperand(0);
8971 else if (IndicesVec != ExtractedIndex.getOperand(0))
8972 return SDValue();
8973
8974 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8975 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
8976 return SDValue();
8977 }
8978
8979 MVT VT = V.getSimpleValueType();
8980 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8981}
8982
8983SDValue
8984X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8985 SDLoc dl(Op);
8986
8987 MVT VT = Op.getSimpleValueType();
8988 MVT EltVT = VT.getVectorElementType();
8989 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
8990 unsigned NumElems = Op.getNumOperands();
8991
8992 // Generate vectors for predicate vectors.
8993 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8994 return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
8995
8996 if (VT.getVectorElementType() == MVT::bf16 &&
8997 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
8998 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
8999
9000 if (SDValue VectorCst = materializeVectorConstant(Op, dl, DAG, Subtarget))
9001 return VectorCst;
9002
9003 unsigned EVTBits = EltVT.getSizeInBits();
9004 APInt UndefMask = APInt::getZero(NumElems);
9005 APInt FrozenUndefMask = APInt::getZero(NumElems);
9006 APInt ZeroMask = APInt::getZero(NumElems);
9007 APInt NonZeroMask = APInt::getZero(NumElems);
9008 bool IsAllConstants = true;
9009 bool OneUseFrozenUndefs = true;
9010 SmallSet<SDValue, 8> Values;
9011 unsigned NumConstants = NumElems;
9012 for (unsigned i = 0; i < NumElems; ++i) {
9013 SDValue Elt = Op.getOperand(i);
9014 if (Elt.isUndef()) {
9015 UndefMask.setBit(i);
9016 continue;
9017 }
9018 if (ISD::isFreezeUndef(Elt.getNode())) {
9019 OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse();
9020 FrozenUndefMask.setBit(i);
9021 continue;
9022 }
9023 Values.insert(Elt);
9024 if (!isIntOrFPConstant(Elt)) {
9025 IsAllConstants = false;
9026 NumConstants--;
9027 }
9028 if (X86::isZeroNode(Elt)) {
9029 ZeroMask.setBit(i);
9030 } else {
9031 NonZeroMask.setBit(i);
9032 }
9033 }
9034
9035 // All undef vector. Return an UNDEF.
9036 if (UndefMask.isAllOnes())
9037 return DAG.getUNDEF(VT);
9038
9039 // All undef/freeze(undef) vector. Return a FREEZE UNDEF.
9040 if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes())
9041 return DAG.getFreeze(DAG.getUNDEF(VT));
9042
9043 // All undef/freeze(undef)/zero vector. Return a zero vector.
9044 if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes())
9045 return getZeroVector(VT, Subtarget, DAG, dl);
9046
9047 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
9048 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
9049 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
9050 // and blend the FREEZE-UNDEF operands back in.
9051 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
9052 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
9053 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
9054 SmallVector<int, 16> BlendMask(NumElems, -1);
9055 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
9056 for (unsigned i = 0; i < NumElems; ++i) {
9057 if (UndefMask[i]) {
9058 BlendMask[i] = -1;
9059 continue;
9060 }
9061 BlendMask[i] = i;
9062 if (!FrozenUndefMask[i])
9063 Elts[i] = Op.getOperand(i);
9064 else
9065 BlendMask[i] += NumElems;
9066 }
9067 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
9068 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
9069 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
9070 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
9071 }
9072
9073 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
9074
9075 // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might
9076 // be better off lowering to a smaller build vector and padding with
9077 // undef/zero.
9078 if ((VT.is256BitVector() || VT.is512BitVector()) &&
9079 !isFoldableUseOfShuffle(BV)) {
9080 unsigned UpperElems = NumElems / 2;
9081 APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask;
9082 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
9083 if (NumUpperUndefsOrZeros >= UpperElems) {
9084 if (VT.is512BitVector() &&
9085 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
9086 UpperElems = NumElems - (NumElems / 4);
9087 // If freeze(undef) is in any upper elements, force to zero.
9088 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
9089 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
9090 SDValue NewBV =
9091 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
9092 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
9093 }
9094 }
9095
9096 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, dl, Subtarget, DAG))
9097 return AddSub;
9098 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, dl, Subtarget, DAG))
9099 return HorizontalOp;
9100 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, dl, Subtarget, DAG))
9101 return Broadcast;
9102 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, dl, Subtarget, DAG))
9103 return BitOp;
9104
9105 unsigned NumZero = ZeroMask.popcount();
9106 unsigned NumNonZero = NonZeroMask.popcount();
9107
9108 // If we are inserting one variable into a vector of non-zero constants, try
9109 // to avoid loading each constant element as a scalar. Load the constants as a
9110 // vector and then insert the variable scalar element. If insertion is not
9111 // supported, fall back to a shuffle to get the scalar blended with the
9112 // constants. Insertion into a zero vector is handled as a special-case
9113 // somewhere below here.
9114 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
9115 FrozenUndefMask.isZero() &&
9116 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
9117 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
9118 // Create an all-constant vector. The variable element in the old
9119 // build vector is replaced by undef in the constant vector. Save the
9120 // variable scalar element and its index for use in the insertelement.
9121 LLVMContext &Context = *DAG.getContext();
9122 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
9123 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
9124 SDValue VarElt;
9125 SDValue InsIndex;
9126 for (unsigned i = 0; i != NumElems; ++i) {
9127 SDValue Elt = Op.getOperand(i);
9128 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
9129 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
9130 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
9131 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
9132 else if (!Elt.isUndef()) {
9133 assert(!VarElt.getNode() && !InsIndex.getNode() &&
9134 "Expected one variable element in this vector");
9135 VarElt = Elt;
9136 InsIndex = DAG.getVectorIdxConstant(i, dl);
9137 }
9138 }
9139 Constant *CV = ConstantVector::get(ConstVecOps);
9140 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
9141
9142 // The constants we just created may not be legal (eg, floating point). We
9143 // must lower the vector right here because we can not guarantee that we'll
9144 // legalize it before loading it. This is also why we could not just create
9145 // a new build vector here. If the build vector contains illegal constants,
9146 // it could get split back up into a series of insert elements.
9147 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
9148 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
9149 MachinePointerInfo MPI =
9150 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9151 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
9152 unsigned InsertC = InsIndex->getAsZExtVal();
9153 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
9154 if (InsertC < NumEltsInLow128Bits)
9155 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
9156
9157 // There's no good way to insert into the high elements of a >128-bit
9158 // vector, so use shuffles to avoid an extract/insert sequence.
9159 assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
9160 assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
9161 SmallVector<int, 8> ShuffleMask;
9162 unsigned NumElts = VT.getVectorNumElements();
9163 for (unsigned i = 0; i != NumElts; ++i)
9164 ShuffleMask.push_back(i == InsertC ? NumElts : i);
9165 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
9166 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
9167 }
9168
9169 // Special case for a single non-zero, non-undef element.
9170 if (NumNonZero == 1) {
9171 unsigned Idx = NonZeroMask.countr_zero();
9172 SDValue Item = Op.getOperand(Idx);
9173
9174 // If we have a constant or non-constant insertion into the low element of
9175 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
9176 // the rest of the elements. This will be matched as movd/movq/movss/movsd
9177 // depending on what the source datatype is.
9178 if (Idx == 0) {
9179 if (NumZero == 0)
9180 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9181
9182 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
9183 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
9184 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
9185 assert((VT.is128BitVector() || VT.is256BitVector() ||
9186 VT.is512BitVector()) &&
9187 "Expected an SSE value type!");
9188 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9189 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
9190 // zero vector.
9191 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9192 }
9193
9194 // We can't directly insert an i8 or i16 into a vector, so zero extend
9195 // it to i32 first.
9196 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
9197 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
9198 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
9199 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
9200 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
9201 return DAG.getBitcast(VT, Item);
9202 }
9203 }
9204
9205 // Is it a vector logical left shift?
9206 if (NumElems == 2 && Idx == 1 &&
9207 X86::isZeroNode(Op.getOperand(0)) &&
9208 !X86::isZeroNode(Op.getOperand(1))) {
9209 unsigned NumBits = VT.getSizeInBits();
9210 return getVShift(true, VT,
9211 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
9212 VT, Op.getOperand(1)),
9213 NumBits/2, DAG, *this, dl);
9214 }
9215
9216 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
9217 return SDValue();
9218
9219 // Otherwise, if this is a vector with i32 or f32 elements, and the element
9220 // is a non-constant being inserted into an element other than the low one,
9221 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
9222 // movd/movss) to move this into the low element, then shuffle it into
9223 // place.
9224 if (EVTBits == 32) {
9225 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
9226 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
9227 }
9228 }
9229
9230 // Splat is obviously ok. Let legalizer expand it to a shuffle.
9231 if (Values.size() == 1) {
9232 if (EVTBits == 32) {
9233 // Instead of a shuffle like this:
9234 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
9235 // Check if it's possible to issue this instead.
9236 // shuffle (vload ptr), undef, <1, 1, 1, 1>
9237 unsigned Idx = NonZeroMask.countr_zero();
9238 SDValue Item = Op.getOperand(Idx);
9239 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
9240 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
9241 }
9242 return SDValue();
9243 }
9244
9245 // A vector full of immediates; various special cases are already
9246 // handled, so this is best done with a single constant-pool load.
9247 if (IsAllConstants)
9248 return SDValue();
9249
9250 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, dl, DAG, Subtarget))
9251 return V;
9252
9253 // See if we can use a vector load to get all of the elements.
9254 {
9255 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
9256 if (SDValue LD =
9257 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9258 return LD;
9259 }
9260
9261 // If this is a splat of pairs of 32-bit elements, we can use a narrower
9262 // build_vector and broadcast it.
9263 // TODO: We could probably generalize this more.
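// e.g. a v8i32 build_vector <a,b,a,b,a,b,a,b> becomes the narrower
// build_vector <a,b,u,u>, bitcast to v2i64, whose first i64 element is then
// broadcast with X86ISD::VBROADCAST and bitcast back to v8i32.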
9264 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
9265 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9266 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9267 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
9268 // Make sure all the even/odd operands match.
9269 for (unsigned i = 2; i != NumElems; ++i)
9270 if (Ops[i % 2] != Op.getOperand(i))
9271 return false;
9272 return true;
9273 };
9274 if (CanSplat(Op, NumElems, Ops)) {
9275 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
9276 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
9277 // Create a new build vector and cast to v2i64/v2f64.
9278 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
9279 DAG.getBuildVector(NarrowVT, dl, Ops));
9280 // Broadcast from v2i64/v2f64 and cast to final VT.
9281 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
9282 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
9283 NewBV));
9284 }
9285 }
9286
9287 // For AVX-length vectors, build the individual 128-bit pieces and use
9288 // shuffles to put them in place.
9289 if (VT.getSizeInBits() > 128) {
9290 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
9291
9292 // Build both the lower and upper subvector.
9293 SDValue Lower =
9294 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
9295 SDValue Upper = DAG.getBuildVector(
9296 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
9297
9298 // Recreate the wider vector with the lower and upper part.
9299 return concatSubVectors(Lower, Upper, DAG, dl);
9300 }
9301
9302 // Let legalizer expand 2-wide build_vectors.
9303 if (EVTBits == 64) {
9304 if (NumNonZero == 1) {
9305 // One half is zero or undef.
9306 unsigned Idx = NonZeroMask.countr_zero();
9307 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
9308 Op.getOperand(Idx));
9309 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
9310 }
9311 return SDValue();
9312 }
9313
9314 // If element VT is < 32 bits, convert it to inserts into a zero vector.
9315 if (EVTBits == 8 && NumElems == 16)
9316 if (SDValue V = LowerBuildVectorv16i8(Op, dl, NonZeroMask, NumNonZero,
9317 NumZero, DAG, Subtarget))
9318 return V;
9319
9320 if (EltVT == MVT::i16 && NumElems == 8)
9321 if (SDValue V = LowerBuildVectorv8i16(Op, dl, NonZeroMask, NumNonZero,
9322 NumZero, DAG, Subtarget))
9323 return V;
9324
9325 // If element VT is 32 bits and the vector has 4 elements, try to generate an INSERTPS.
9326 if (EVTBits == 32 && NumElems == 4)
9327 if (SDValue V = LowerBuildVectorv4x32(Op, dl, DAG, Subtarget))
9328 return V;
9329
9330 // If element VT is == 32 bits, turn it into a number of shuffles.
9331 if (NumElems == 4 && NumZero > 0) {
9332 SmallVector<SDValue, 8> Ops(NumElems);
9333 for (unsigned i = 0; i < 4; ++i) {
9334 bool isZero = !NonZeroMask[i];
9335 if (isZero)
9336 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
9337 else
9338 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9339 }
9340
9341 for (unsigned i = 0; i < 2; ++i) {
9342 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
9343 default: llvm_unreachable("Unexpected NonZero count");
9344 case 0:
9345 Ops[i] = Ops[i*2]; // Must be a zero vector.
9346 break;
9347 case 1:
9348 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
9349 break;
9350 case 2:
9351 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9352 break;
9353 case 3:
9354 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
9355 break;
9356 }
9357 }
9358
9359 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
9360 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
9361 int MaskVec[] = {
9362 Reverse1 ? 1 : 0,
9363 Reverse1 ? 0 : 1,
9364 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
9365 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
9366 };
9367 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
9368 }
9369
9370 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
9371
9372 // Check for a build vector from mostly shuffle plus few inserting.
9373 if (SDValue Sh = buildFromShuffleMostly(Op, dl, DAG))
9374 return Sh;
9375
9376 // For SSE 4.1, use insertps to put the high elements into the low element.
9377 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
9378 SDValue Result;
9379 if (!Op.getOperand(0).isUndef())
9380 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
9381 else
9382 Result = DAG.getUNDEF(VT);
9383
9384 for (unsigned i = 1; i < NumElems; ++i) {
9385 if (Op.getOperand(i).isUndef()) continue;
9386 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
9387 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
9388 }
9389 return Result;
9390 }
9391
9392 // Otherwise, expand into a number of unpckl*, start by extending each of
9393 // our (non-undef) elements to the full vector width with the element in the
9394 // bottom slot of the vector (which generates no code for SSE).
9395 SmallVector<SDValue, 8> Ops(NumElems);
9396 for (unsigned i = 0; i < NumElems; ++i) {
9397 if (!Op.getOperand(i).isUndef())
9398 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
9399 else
9400 Ops[i] = DAG.getUNDEF(VT);
9401 }
9402
9403 // Next, we iteratively mix elements, e.g. for v4f32:
9404 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
9405 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
9406 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
9407 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
9408 // Generate scaled UNPCKL shuffle mask.
9409 SmallVector<int, 16> Mask;
9410 for (unsigned i = 0; i != Scale; ++i)
9411 Mask.push_back(i);
9412 for (unsigned i = 0; i != Scale; ++i)
9413 Mask.push_back(NumElems+i);
9414 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
9415
9416 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
9417 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
9418 }
9419 return Ops[0];
9420}
9421
9422// 256-bit AVX can use the vinsertf128 instruction
9423// to create 256-bit vectors from two other 128-bit ones.
9424// TODO: Detect subvector broadcast here instead of DAG combine?
9425 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
9426 const X86Subtarget &Subtarget) {
9427 SDLoc dl(Op);
9428 MVT ResVT = Op.getSimpleValueType();
9429
9430 assert((ResVT.is256BitVector() ||
9431 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
9432
9433 unsigned NumOperands = Op.getNumOperands();
9434 unsigned NumFreezeUndef = 0;
9435 unsigned NumZero = 0;
9436 unsigned NumNonZero = 0;
9437 unsigned NonZeros = 0;
9438 for (unsigned i = 0; i != NumOperands; ++i) {
9439 SDValue SubVec = Op.getOperand(i);
9440 if (SubVec.isUndef())
9441 continue;
9442 if (ISD::isFreezeUndef(SubVec.getNode())) {
9443 // If the freeze(undef) has multiple uses then we must fold to zero.
9444 if (SubVec.hasOneUse())
9445 ++NumFreezeUndef;
9446 else
9447 ++NumZero;
9448 }
9449 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9450 ++NumZero;
9451 else {
9452 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9453 NonZeros |= 1 << i;
9454 ++NumNonZero;
9455 }
9456 }
9457
9458 // If we have more than 2 non-zeros, build each half separately.
9459 if (NumNonZero > 2) {
9460 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9461 ArrayRef<SDUse> Ops = Op->ops();
9462 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9463 Ops.slice(0, NumOperands/2));
9464 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9465 Ops.slice(NumOperands/2));
9466 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9467 }
9468
9469 // Otherwise, build it up through insert_subvectors.
9470 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
9471 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
9472 : DAG.getUNDEF(ResVT));
9473
9474 MVT SubVT = Op.getOperand(0).getSimpleValueType();
9475 unsigned NumSubElems = SubVT.getVectorNumElements();
9476 for (unsigned i = 0; i != NumOperands; ++i) {
9477 if ((NonZeros & (1 << i)) == 0)
9478 continue;
9479
9480 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
9481 Op.getOperand(i),
9482 DAG.getIntPtrConstant(i * NumSubElems, dl));
9483 }
9484
9485 return Vec;
9486}
9487
9488 // Lower a vXi1 CONCAT_VECTORS (k-register concatenation). Zero operands are
9489 // folded into a zero base vector, and the upper bits of the k-register are
9490 // zeroed via KSHIFTL or INSERT_SUBVECTOR into zero where needed.
9491// TODO: Merge this with LowerAVXCONCAT_VECTORS?
9492 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
9493 const X86Subtarget &Subtarget,
9494 SelectionDAG & DAG) {
9495 SDLoc dl(Op);
9496 MVT ResVT = Op.getSimpleValueType();
9497 unsigned NumOperands = Op.getNumOperands();
9498
9499 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
9500 "Unexpected number of operands in CONCAT_VECTORS");
9501
9502 uint64_t Zeros = 0;
9503 uint64_t NonZeros = 0;
9504 for (unsigned i = 0; i != NumOperands; ++i) {
9505 SDValue SubVec = Op.getOperand(i);
9506 if (SubVec.isUndef())
9507 continue;
9508 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
9509 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
9510 Zeros |= (uint64_t)1 << i;
9511 else
9512 NonZeros |= (uint64_t)1 << i;
9513 }
9514
9515 unsigned NumElems = ResVT.getVectorNumElements();
9516
9517 // If we are inserting a non-zero vector and there are zeros in the LSBs and
9518 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
9519 // insert_subvector will give us two kshifts.
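// e.g. v16i1 concat(zero v4i1, X v4i1, undef v4i1, undef v4i1) is lowered as
// X widened into a k-register and shifted left by 4 with a single KSHIFTL.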
9520 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
9521 Log2_64(NonZeros) != NumOperands - 1) {
9522 unsigned Idx = Log2_64(NonZeros);
9523 SDValue SubVec = Op.getOperand(Idx);
9524 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9525 MVT ShiftVT = widenMaskVectorType(ResVT, Subtarget);
9526 Op = widenSubVector(ShiftVT, SubVec, false, Subtarget, DAG, dl);
9527 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, Op,
9528 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
9529 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
9530 DAG.getIntPtrConstant(0, dl));
9531 }
9532
9533 // If there are zero or one non-zeros we can handle this very simply.
9534 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
9535 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
9536 if (!NonZeros)
9537 return Vec;
9538 unsigned Idx = Log2_64(NonZeros);
9539 SDValue SubVec = Op.getOperand(Idx);
9540 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
9541 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
9542 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
9543 }
9544
9545 if (NumOperands > 2) {
9546 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
9547 ArrayRef<SDUse> Ops = Op->ops();
9548 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9549 Ops.slice(0, NumOperands/2));
9550 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
9551 Ops.slice(NumOperands/2));
9552 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
9553 }
9554
9555 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
9556
9557 if (ResVT.getVectorNumElements() >= 16)
9558 return Op; // The operation is legal with KUNPCK
9559
9560 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
9561 DAG.getUNDEF(ResVT), Op.getOperand(0),
9562 DAG.getIntPtrConstant(0, dl));
9563 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
9564 DAG.getIntPtrConstant(NumElems/2, dl));
9565}
9566
9567 static SDValue LowerCONCAT_VECTORS(SDValue Op,
9568 const X86Subtarget &Subtarget,
9569 SelectionDAG &DAG) {
9570 MVT VT = Op.getSimpleValueType();
9571 if (VT.getVectorElementType() == MVT::i1)
9572 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
9573
9574 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
9575 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
9576 Op.getNumOperands() == 4)));
9577
9578 // AVX can use the vinsertf128 instruction to create 256-bit vectors
9579 // from two other 128-bit ones.
9580
9581 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
9582 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
9583}
9584
9585//===----------------------------------------------------------------------===//
9586// Vector shuffle lowering
9587//
9588// This is an experimental code path for lowering vector shuffles on x86. It is
9589// designed to handle arbitrary vector shuffles and blends, gracefully
9590// degrading performance as necessary. It works hard to recognize idiomatic
9591// shuffles and lower them to optimal instruction patterns without leaving
9592// a framework that allows reasonably efficient handling of all vector shuffle
9593// patterns.
9594//===----------------------------------------------------------------------===//
9595
9596/// Tiny helper function to identify a no-op mask.
9597///
9598/// This is a somewhat boring predicate function. It checks whether the mask
9599/// array input, which is assumed to be a single-input shuffle mask of the kind
9600/// used by the X86 shuffle instructions (not a fully general
9601/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
9602/// in-place shuffle are 'no-op's.
9603 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
9604 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9605 assert(Mask[i] >= -1 && "Out of bound mask element!");
9606 if (Mask[i] >= 0 && Mask[i] != i)
9607 return false;
9608 }
9609 return true;
9610}
9611
9612/// Test whether there are elements crossing LaneSizeInBits lanes in this
9613/// shuffle mask.
9614///
9615/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
9616/// and we routinely test for these.
9617static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
9618 unsigned ScalarSizeInBits,
9619 ArrayRef<int> Mask) {
9620 assert(LaneSizeInBits && ScalarSizeInBits &&
9621 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9622 "Illegal shuffle lane size");
9623 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
9624 int Size = Mask.size();
9625 for (int i = 0; i < Size; ++i)
9626 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
9627 return true;
9628 return false;
9629}
9630
9631/// Test whether there are elements crossing 128-bit lanes in this
9632/// shuffle mask.
9633 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
9634 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
9635}
9636
9637/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
9638/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
9639/// better support 'repeated mask + lane permute' style shuffles.
9640static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
9641 unsigned ScalarSizeInBits,
9642 ArrayRef<int> Mask) {
9643 assert(LaneSizeInBits && ScalarSizeInBits &&
9644 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
9645 "Illegal shuffle lane size");
9646 int NumElts = Mask.size();
9647 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
9648 int NumLanes = NumElts / NumEltsPerLane;
9649 if (NumLanes > 1) {
9650 for (int i = 0; i != NumLanes; ++i) {
9651 int SrcLane = -1;
9652 for (int j = 0; j != NumEltsPerLane; ++j) {
9653 int M = Mask[(i * NumEltsPerLane) + j];
9654 if (M < 0)
9655 continue;
9656 int Lane = (M % NumElts) / NumEltsPerLane;
9657 if (SrcLane >= 0 && SrcLane != Lane)
9658 return true;
9659 SrcLane = Lane;
9660 }
9661 }
9662 }
9663 return false;
9664}
9665
9666/// Test whether a shuffle mask is equivalent within each sub-lane.
9667///
9668/// This checks a shuffle mask to see if it is performing the same
9669/// lane-relative shuffle in each sub-lane. This trivially implies
9670/// that it is also not lane-crossing. It may however involve a blend from the
9671/// same lane of a second vector.
9672///
9673/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
9674/// non-trivial to compute in the face of undef lanes. The representation is
9675/// suitable for use with existing 128-bit shuffles as entries from the second
9676/// vector have been remapped to [LaneSize, 2*LaneSize).
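/// For example, the v8i32 mask <0,9,2,11,4,13,6,15> repeats the same
/// lane-relative pattern in both 128-bit lanes and yields the RepeatedMask
/// <0,5,2,7>, with second-vector entries remapped into [4, 8).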
9677static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
9678 ArrayRef<int> Mask,
9679 SmallVectorImpl<int> &RepeatedMask) {
9680 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
9681 RepeatedMask.assign(LaneSize, -1);
9682 int Size = Mask.size();
9683 for (int i = 0; i < Size; ++i) {
9684 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
9685 if (Mask[i] < 0)
9686 continue;
9687 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9688 // This entry crosses lanes, so there is no way to model this shuffle.
9689 return false;
9690
9691 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
9692 // Adjust second vector indices to start at LaneSize instead of Size.
9693 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
9694 : Mask[i] % LaneSize + LaneSize;
9695 if (RepeatedMask[i % LaneSize] < 0)
9696 // This is the first non-undef entry in this slot of a 128-bit lane.
9697 RepeatedMask[i % LaneSize] = LocalM;
9698 else if (RepeatedMask[i % LaneSize] != LocalM)
9699 // Found a mismatch with the repeated mask.
9700 return false;
9701 }
9702 return true;
9703}
9704
9705/// Test whether a shuffle mask is equivalent within each 128-bit lane.
9706static bool
9707 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9708 SmallVectorImpl<int> &RepeatedMask) {
9709 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9710}
9711
9712static bool
9713 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
9714 SmallVector<int, 32> RepeatedMask;
9715 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
9716}
9717
9718/// Test whether a shuffle mask is equivalent within each 256-bit lane.
9719static bool
9720 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
9721 SmallVectorImpl<int> &RepeatedMask) {
9722 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
9723}
9724
9725/// Test whether a target shuffle mask is equivalent within each sub-lane.
9726/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9727static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
9728 unsigned EltSizeInBits,
9729 ArrayRef<int> Mask,
9730 SmallVectorImpl<int> &RepeatedMask) {
9731 int LaneSize = LaneSizeInBits / EltSizeInBits;
9732 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
9733 int Size = Mask.size();
9734 for (int i = 0; i < Size; ++i) {
9735 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
9736 if (Mask[i] == SM_SentinelUndef)
9737 continue;
9738 if (Mask[i] == SM_SentinelZero) {
9739 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
9740 return false;
9741 RepeatedMask[i % LaneSize] = SM_SentinelZero;
9742 continue;
9743 }
9744 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
9745 // This entry crosses lanes, so there is no way to model this shuffle.
9746 return false;
9747
9748 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
9749 // later vector indices to start at multiples of LaneSize instead of Size.
9750 int LaneM = Mask[i] / Size;
9751 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
9752 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
9753 // This is the first non-undef entry in this slot of a 128-bit lane.
9754 RepeatedMask[i % LaneSize] = LocalM;
9755 else if (RepeatedMask[i % LaneSize] != LocalM)
9756 // Found a mismatch with the repeated mask.
9757 return false;
9758 }
9759 return true;
9760}
9761
9762/// Test whether a target shuffle mask is equivalent within each sub-lane.
9763/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
9764static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
9765 ArrayRef<int> Mask,
9766 SmallVectorImpl<int> &RepeatedMask) {
9767 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
9768 Mask, RepeatedMask);
9769}
9770
9771/// Checks whether the vector elements referenced by two shuffle masks are
9772/// equivalent.
9773static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
9774 int Idx, int ExpectedIdx) {
9775 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
9776 ExpectedIdx < MaskSize && "Out of range element index");
9777 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
9778 return false;
9779
9780 switch (Op.getOpcode()) {
9781 case ISD::BUILD_VECTOR:
9782 // If the values are build vectors, we can look through them to find
9783 // equivalent inputs that make the shuffles equivalent.
9784 // TODO: Handle MaskSize != Op.getNumOperands()?
9785 if (MaskSize == (int)Op.getNumOperands() &&
9786 MaskSize == (int)ExpectedOp.getNumOperands())
9787 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
9788 break;
9789 case X86ISD::VBROADCAST:
9790 case X86ISD::VBROADCAST_LOAD:
9791 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
9792 return (Op == ExpectedOp &&
9793 (int)Op.getValueType().getVectorNumElements() == MaskSize);
9794 case X86ISD::HADD:
9795 case X86ISD::HSUB:
9796 case X86ISD::FHADD:
9797 case X86ISD::FHSUB:
9798 case X86ISD::PACKSS:
9799 case X86ISD::PACKUS:
9800 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
9801 // TODO: Handle MaskSize != NumElts?
9802 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
9803 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
9804 MVT VT = Op.getSimpleValueType();
9805 int NumElts = VT.getVectorNumElements();
9806 if (MaskSize == NumElts) {
9807 int NumLanes = VT.getSizeInBits() / 128;
9808 int NumEltsPerLane = NumElts / NumLanes;
9809 int NumHalfEltsPerLane = NumEltsPerLane / 2;
9810 bool SameLane =
9811 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
9812 bool SameElt =
9813 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
9814 return SameLane && SameElt;
9815 }
9816 }
9817 break;
9818 }
9819
9820 return false;
9821}
9822
9823/// Checks whether a shuffle mask is equivalent to an explicit list of
9824/// arguments.
9825///
9826/// This is a fast way to test a shuffle mask against a fixed pattern:
9827///
9828 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
9829 ///
9830 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
9831 /// element of the mask is either -1 (signifying undef) or the corresponding
9832 /// value in ExpectedMask.
9833static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
9834 SDValue V1 = SDValue(),
9835 SDValue V2 = SDValue()) {
9836 int Size = Mask.size();
9837 if (Size != (int)ExpectedMask.size())
9838 return false;
9839
9840 for (int i = 0; i < Size; ++i) {
9841 assert(Mask[i] >= -1 && "Out of bound mask element!");
9842 int MaskIdx = Mask[i];
9843 int ExpectedIdx = ExpectedMask[i];
9844 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
9845 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9846 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9847 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9848 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9849 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9850 return false;
9851 }
9852 }
9853 return true;
9854}
9855
9856/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9857///
9858/// The masks must be exactly the same width.
9859///
9860/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9861/// value in ExpectedMask is always accepted. Otherwise the indices must match.
9862///
9863/// SM_SentinelZero is accepted as a valid negative index but must match in
9864/// both, or via a known bits test.
9865 static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
9866 ArrayRef<int> ExpectedMask,
9867 const SelectionDAG &DAG,
9868 SDValue V1 = SDValue(),
9869 SDValue V2 = SDValue()) {
9870 int Size = Mask.size();
9871 if (Size != (int)ExpectedMask.size())
9872 return false;
9873 assert(llvm::all_of(ExpectedMask,
9874 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
9875 "Illegal target shuffle mask");
9876
9877 // Check for out-of-range target shuffle mask indices.
9878 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
9879 return false;
9880
9881 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
9882 if (V1 && (V1.getValueSizeInBits() != VT.getSizeInBits() ||
9883 !V1.getValueType().isVector()))
9884 V1 = SDValue();
9885 if (V2 && (V2.getValueSizeInBits() != VT.getSizeInBits() ||
9886 !V2.getValueType().isVector()))
9887 V2 = SDValue();
9888
9889 APInt ZeroV1 = APInt::getZero(Size);
9890 APInt ZeroV2 = APInt::getZero(Size);
9891
9892 for (int i = 0; i < Size; ++i) {
9893 int MaskIdx = Mask[i];
9894 int ExpectedIdx = ExpectedMask[i];
9895 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
9896 continue;
9897 if (MaskIdx == SM_SentinelZero) {
9898 // If we need this expected index to be a zero element, then update the
9899 // relevant zero mask and perform the known-bits check at the end to minimize
9900 // repeated computes.
9901 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9902 if (ExpectedV &&
9903 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
9904 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9905 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
9906 ZeroMask.setBit(BitIdx);
9907 continue;
9908 }
9909 }
9910 if (MaskIdx >= 0) {
9911 SDValue MaskV = MaskIdx < Size ? V1 : V2;
9912 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
9913 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
9914 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
9915 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
9916 continue;
9917 }
9918 return false;
9919 }
9920 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
9921 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
9922}
9923
9924// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9925// instructions.
9926 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
9927 const SelectionDAG &DAG) {
9928 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9929 return false;
9930
9931 SmallVector<int, 8> Unpcklwd;
9932 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9933 /* Unary = */ false);
9934 SmallVector<int, 8> Unpckhwd;
9935 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9936 /* Unary = */ false);
9937 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
9938 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
9939 return IsUnpackwdMask;
9940}
9941
9942 static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
9943 const SelectionDAG &DAG) {
9944 // Create 128-bit vector type based on mask size.
9945 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
9946 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
9947
9948 // We can't assume a canonical shuffle mask, so try the commuted version too.
9949 SmallVector<int, 4> CommutedMask(Mask);
9950 ShuffleVectorSDNode::commuteMask(CommutedMask);
9951
9952 // Match any of unary/binary or low/high.
9953 for (unsigned i = 0; i != 4; ++i) {
9954 SmallVector<int, 16> UnpackMask;
9955 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
9956 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
9957 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
9958 return true;
9959 }
9960 return false;
9961}
9962
9963/// Return true if a shuffle mask chooses elements identically in its top and
9964/// bottom halves. For example, any splat mask has the same top and bottom
9965/// halves. If an element is undefined in only one half of the mask, the halves
9966/// are not considered identical.
9967 static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
9968 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
9969 unsigned HalfSize = Mask.size() / 2;
9970 for (unsigned i = 0; i != HalfSize; ++i) {
9971 if (Mask[i] != Mask[i + HalfSize])
9972 return false;
9973 }
9974 return true;
9975}
9976
9977/// Get a 4-lane 8-bit shuffle immediate for a mask.
9978///
9979/// This helper function produces an 8-bit shuffle immediate corresponding to
9980/// the ubiquitous shuffle encoding scheme used in x86 instructions for
9981/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
9982/// example.
9983///
9984/// NB: We rely heavily on "undef" masks preserving the input lane.
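/// For example, the identity mask <0,1,2,3> encodes as 0b11100100 (0xE4),
/// while a mask with a single non-undef element such as <-1,2,-1,-1> is
/// fully splatted to 0xAA to improve later broadcast matching.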
9985static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9986 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9987 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9988 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9989 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9990 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9991
9992 // If the mask only uses one non-undef element, then fully 'splat' it to
9993 // improve later broadcast matching.
9994 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
9995 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
9996
9997 int FirstElt = Mask[FirstIndex];
9998 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
9999 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
10000
10001 unsigned Imm = 0;
10002 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
10003 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
10004 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
10005 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
10006 return Imm;
10007}
10008
10009 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
10010 SelectionDAG &DAG) {
10011 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
10012}
10013
10014// Canonicalize SHUFPD mask to improve chances of further folding.
10015// Mask elements are assumed to be -1, 0 or 1 to match the SHUFPD lo/hi pattern.
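// e.g. <1,-1,1,-1> is treated as a splat and encodes as 0b1111, while
// <0,1,-1,0> keeps its undef element in place and encodes as 0b0010.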
10016static unsigned getSHUFPDImm(ArrayRef<int> Mask) {
10017 assert((Mask.size() == 2 || Mask.size() == 4 || Mask.size() == 8) &&
10018 "Unexpected SHUFPD mask size");
10019 assert(all_of(Mask, [](int M) { return -1 <= M && M <= 1; }) &&
10020 "Unexpected SHUFPD mask elements");
10021
10022 // If the mask only uses one non-undef element, then fully 'splat' it to
10023 // improve later broadcast matching.
10024 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
10025 assert(0 <= FirstIndex && FirstIndex < (int)Mask.size() &&
10026 "All undef shuffle mask");
10027
10028 int FirstElt = Mask[FirstIndex];
10029 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }) &&
10030 count_if(Mask, [FirstElt](int M) { return M == FirstElt; }) > 1) {
10031 unsigned Imm = 0;
10032 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10033 Imm |= FirstElt << I;
10034 return Imm;
10035 }
10036
10037 // Attempt to keep any undef elements in place to improve chances of the
10038 // shuffle becoming a (commutative) blend.
10039 unsigned Imm = 0;
10040 for (unsigned I = 0, E = Mask.size(); I != E; ++I)
10041 Imm |= (Mask[I] < 0 ? (I & 1) : Mask[I]) << I;
10042
10043 return Imm;
10044}
10045
10046 static SDValue getSHUFPDImmForMask(ArrayRef<int> Mask, const SDLoc &DL,
10047 SelectionDAG &DAG) {
10048 return DAG.getTargetConstant(getSHUFPDImm(Mask), DL, MVT::i8);
10049}
10050
10051 // The shuffle result is as follows:
10052 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements are in ascending order.
10053 // Each element of Zeroable corresponds to a particular Mask element, as
10054 // described in the computeZeroableShuffleElements function.
10055 //
10056 // The function looks for a sub-mask whose non-zero elements are in
10057 // increasing order; if such a sub-mask exists, the function returns true.
10058static bool isNonZeroElementsInOrder(const APInt &Zeroable,
10059 ArrayRef<int> Mask, const EVT &VectorType,
10060 bool &IsZeroSideLeft) {
10061 int NextElement = -1;
10062 // Check if the Mask's nonzero elements are in increasing order.
10063 for (int i = 0, e = Mask.size(); i < e; i++) {
10064 // Check that the mask's zero elements are built from only zeros.
10065 assert(Mask[i] >= -1 && "Out of bound mask element!");
10066 if (Mask[i] < 0)
10067 return false;
10068 if (Zeroable[i])
10069 continue;
10070 // Find the lowest non-zero element.
10071 if (NextElement < 0) {
10072 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
10073 IsZeroSideLeft = NextElement != 0;
10074 }
10075 // Exit if the mask's non-zero elements are not in increasing order.
10076 if (NextElement != Mask[i])
10077 return false;
10078 NextElement++;
10079 }
10080 return true;
10081}
10082
10083/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
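/// Each shuffle element is expanded to per-byte indices within its 128-bit
/// lane; zeroable elements become the 0x80 "zero" byte, and the lowering
/// bails out if bytes would be needed from both sources or across lanes.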
10084 static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
10085 ArrayRef<int> Mask, SDValue V1,
10086 SDValue V2, const APInt &Zeroable,
10087 const X86Subtarget &Subtarget,
10088 SelectionDAG &DAG) {
10089 int Size = Mask.size();
10090 int LaneSize = 128 / VT.getScalarSizeInBits();
10091 const int NumBytes = VT.getSizeInBits() / 8;
10092 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
10093
10094 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
10095 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
10096 (Subtarget.hasBWI() && VT.is512BitVector()));
10097
10098 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
10099 // Sign bit set in i8 mask means zero element.
10100 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
10101
10102 SDValue V;
10103 for (int i = 0; i < NumBytes; ++i) {
10104 int M = Mask[i / NumEltBytes];
10105 if (M < 0) {
10106 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
10107 continue;
10108 }
10109 if (Zeroable[i / NumEltBytes]) {
10110 PSHUFBMask[i] = ZeroMask;
10111 continue;
10112 }
10113
10114 // We can only use a single input of V1 or V2.
10115 SDValue SrcV = (M >= Size ? V2 : V1);
10116 if (V && V != SrcV)
10117 return SDValue();
10118 V = SrcV;
10119 M %= Size;
10120
10121 // PSHUFB can't cross lanes, ensure this doesn't happen.
10122 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
10123 return SDValue();
10124
10125 M = M % LaneSize;
10126 M = M * NumEltBytes + (i % NumEltBytes);
10127 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
10128 }
10129 assert(V && "Failed to find a source input");
10130
10131 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
10132 return DAG.getBitcast(
10133 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
10134 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
10135}
10136
10137static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
10138 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10139 const SDLoc &dl);
10140
10141 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
10142 static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
10143 SDValue V2, ArrayRef<int> Mask,
10144 const APInt &Zeroable,
10145 const X86Subtarget &Subtarget,
10146 SelectionDAG &DAG) {
10147 bool IsLeftZeroSide = true;
10148 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
10149 IsLeftZeroSide))
10150 return SDValue();
10151 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
10152 MVT IntegerType =
10153 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
10154 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
10155 unsigned NumElts = VT.getVectorNumElements();
10156 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
10157 "Unexpected number of vector elements");
10158 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
10159 Subtarget, DAG, DL);
10160 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
10161 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
10162 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
10163}
10164
10165static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
10166 unsigned &UnpackOpcode, bool IsUnary,
10167 ArrayRef<int> TargetMask, const SDLoc &DL,
10168 SelectionDAG &DAG,
10169 const X86Subtarget &Subtarget) {
10170 int NumElts = VT.getVectorNumElements();
10171
10172 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
10173 for (int i = 0; i != NumElts; i += 2) {
10174 int M1 = TargetMask[i + 0];
10175 int M2 = TargetMask[i + 1];
10176 Undef1 &= (SM_SentinelUndef == M1);
10177 Undef2 &= (SM_SentinelUndef == M2);
10178 Zero1 &= isUndefOrZero(M1);
10179 Zero2 &= isUndefOrZero(M2);
10180 }
10181 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
10182 "Zeroable shuffle detected");
10183
10184 // Attempt to match the target mask against the unpack lo/hi mask patterns.
10185 SmallVector<int, 64> Unpckl, Unpckh;
10186 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
10187 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
10188 (IsUnary ? V1 : V2))) {
10189 UnpackOpcode = X86ISD::UNPCKL;
10190 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10191 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10192 return true;
10193 }
10194
10195 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
10196 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
10197 (IsUnary ? V1 : V2))) {
10198 UnpackOpcode = X86ISD::UNPCKH;
10199 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
10200 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
10201 return true;
10202 }
10203
10204 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
10205 if (IsUnary && (Zero1 || Zero2)) {
10206 // Don't bother if we can blend instead.
10207 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
10208 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
10209 return false;
10210
10211 bool MatchLo = true, MatchHi = true;
10212 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
10213 int M = TargetMask[i];
10214
10215 // Ignore if the input is known to be zero or the index is undef.
10216 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
10217 (M == SM_SentinelUndef))
10218 continue;
10219
10220 MatchLo &= (M == Unpckl[i]);
10221 MatchHi &= (M == Unpckh[i]);
10222 }
10223
10224 if (MatchLo || MatchHi) {
10225 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
10226 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10227 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
10228 return true;
10229 }
10230 }
10231
10232 // If a binary shuffle, commute and try again.
10233 if (!IsUnary) {
10234 ShuffleVectorSDNode::commuteMask(Unpckl);
10235 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
10236 UnpackOpcode = X86ISD::UNPCKL;
10237 std::swap(V1, V2);
10238 return true;
10239 }
10240
10241 ShuffleVectorSDNode::commuteMask(Unpckh);
10242 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
10243 UnpackOpcode = X86ISD::UNPCKH;
10244 std::swap(V1, V2);
10245 return true;
10246 }
10247 }
10248
10249 return false;
10250}
10251
10252// X86 has dedicated unpack instructions that can handle specific blend
10253// operations: UNPCKH and UNPCKL.
10254 static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT, SDValue V1,
10255 SDValue V2, ArrayRef<int> Mask,
10256 SelectionDAG &DAG) {
10257 SmallVector<int, 8> Unpckl;
10258 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
10259 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10260 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
10261
10262 SmallVector<int, 8> Unpckh;
10263 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
10264 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10265 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
10266
10267 // Commute and try again.
10268 ShuffleVectorSDNode::commuteMask(Unpckl);
10269 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10270 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
10271
10272 ShuffleVectorSDNode::commuteMask(Unpckh);
10273 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10274 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
10275
10276 return SDValue();
10277}
10278
10279/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
10280/// followed by unpack 256-bit.
10281 static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT, SDValue V1,
10282 SDValue V2, ArrayRef<int> Mask,
10283 SelectionDAG &DAG) {
10284 SmallVector<int, 32> Unpckl, Unpckh;
10285 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
10286 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
10287
10288 unsigned UnpackOpcode;
10289 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
10290 UnpackOpcode = X86ISD::UNPCKL;
10291 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
10292 UnpackOpcode = X86ISD::UNPCKH;
10293 else
10294 return SDValue();
10295
10296 // This is a "natural" unpack operation (rather than the 128-bit sectored
10297 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
10298 // input in order to use the x86 instruction.
10299 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
10300 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
10301 V1 = DAG.getBitcast(VT, V1);
10302 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
10303}
10304
10305// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
10306// source into the lower elements and zeroing the upper elements.
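// For example, a v16i8 mask <0,2,4,6,8,10,12,14,Z,Z,Z,Z,Z,Z,Z,Z> (Z = zeroable)
// takes every 2nd byte and zeroes the upper half, i.e. a v8i16 -> v8i8
// truncation placed in the low 64 bits of the result.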
10307static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
10308 ArrayRef<int> Mask, const APInt &Zeroable,
10309 const X86Subtarget &Subtarget) {
10310 if (!VT.is512BitVector() && !Subtarget.hasVLX())
10311 return false;
10312
10313 unsigned NumElts = Mask.size();
10314 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10315 unsigned MaxScale = 64 / EltSizeInBits;
10316
10317 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10318 unsigned SrcEltBits = EltSizeInBits * Scale;
10319 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10320 continue;
10321 unsigned NumSrcElts = NumElts / Scale;
10322 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
10323 continue;
10324 unsigned UpperElts = NumElts - NumSrcElts;
10325 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10326 continue;
10327 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
10328 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
10329 DstVT = MVT::getIntegerVT(EltSizeInBits);
10330 if ((NumSrcElts * EltSizeInBits) >= 128) {
10331 // ISD::TRUNCATE
10332 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
10333 } else {
10334 // X86ISD::VTRUNC
10335 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
10336 }
10337 return true;
10338 }
10339
10340 return false;
10341}
10342
10343// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
10344// element padding to the final DstVT.
10345static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
10346 const X86Subtarget &Subtarget,
10347 SelectionDAG &DAG, bool ZeroUppers) {
10348 MVT SrcVT = Src.getSimpleValueType();
10349 MVT DstSVT = DstVT.getScalarType();
10350 unsigned NumDstElts = DstVT.getVectorNumElements();
10351 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10352 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
10353
10354 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
10355 return SDValue();
10356
10357 // Perform a direct ISD::TRUNCATE if possible.
10358 if (NumSrcElts == NumDstElts)
10359 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
10360
10361 if (NumSrcElts > NumDstElts) {
10362 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10363 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10364 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
10365 }
10366
10367 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
10368 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
10369 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
10370 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10371 DstVT.getSizeInBits());
10372 }
10373
10374 // Non-VLX targets must truncate from a 512-bit type, so we need to
10375 // widen, truncate and then possibly extract the original subvector.
10376 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
10377 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
10378 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
10379 }
10380
10381 // Fallback to a X86ISD::VTRUNC, padding if necessary.
10382 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
10383 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
10384 if (DstVT != TruncVT)
10385 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
10386 DstVT.getSizeInBits());
10387 return Trunc;
10388}
10389
10390// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
10391//
10392// An example is the following:
10393//
10394// t0: ch = EntryToken
10395// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
10396// t25: v4i32 = truncate t2
10397// t41: v8i16 = bitcast t25
10398// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
10399// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
10400// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
10401// t18: v2i64 = bitcast t51
10402//
10403 // One can just use a single vpmovdw instruction; without avx512vl we need to
10404 // use the zmm variant and extract the lower subvector, padding with zeroes.
10405// TODO: Merge with lowerShuffleAsVTRUNC.
10406 static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
10407 SDValue V2, ArrayRef<int> Mask,
10408 const APInt &Zeroable,
10409 const X86Subtarget &Subtarget,
10410 SelectionDAG &DAG) {
10411 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
10412 if (!Subtarget.hasAVX512())
10413 return SDValue();
10414
10415 unsigned NumElts = VT.getVectorNumElements();
10416 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10417 unsigned MaxScale = 64 / EltSizeInBits;
10418 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10419 unsigned SrcEltBits = EltSizeInBits * Scale;
10420 unsigned NumSrcElts = NumElts / Scale;
10421 unsigned UpperElts = NumElts - NumSrcElts;
10422 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
10423 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10424 continue;
10425
10426 // Attempt to find a matching source truncation, but as a fallback, VLX
10427 // cases can use the VPMOV directly.
10428 SDValue Src = peekThroughBitcasts(V1);
10429 if (Src.getOpcode() == ISD::TRUNCATE &&
10430 Src.getScalarValueSizeInBits() == SrcEltBits) {
10431 Src = Src.getOperand(0);
10432 } else if (Subtarget.hasVLX()) {
10433 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10434 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10435 Src = DAG.getBitcast(SrcVT, Src);
10436 // Don't do this if PACKSS/PACKUS could perform it cheaper.
10437 if (Scale == 2 &&
10438 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
10439 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
10440 return SDValue();
10441 } else
10442 return SDValue();
10443
10444 // VPMOVWB is only available with avx512bw.
10445 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
10446 return SDValue();
10447
10448 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
10449 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10450 }
10451
10452 return SDValue();
10453}
10454
10455// Attempt to match binary shuffle patterns as a truncate.
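// For example, a v8i16 shuffle <0,2,4,6,8,10,12,14> of two v8i16 inputs is
// handled by concatenating them into a v16i16, bitcasting to v8i32 and
// truncating back to v8i16.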
10456 static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
10457 SDValue V2, ArrayRef<int> Mask,
10458 const APInt &Zeroable,
10459 const X86Subtarget &Subtarget,
10460 SelectionDAG &DAG) {
10461 assert((VT.is128BitVector() || VT.is256BitVector()) &&
10462 "Unexpected VTRUNC type");
10463 if (!Subtarget.hasAVX512())
10464 return SDValue();
10465
10466 unsigned NumElts = VT.getVectorNumElements();
10467 unsigned EltSizeInBits = VT.getScalarSizeInBits();
10468 unsigned MaxScale = 64 / EltSizeInBits;
10469 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
10470 // TODO: Support non-BWI VPMOVWB truncations?
10471 unsigned SrcEltBits = EltSizeInBits * Scale;
10472 if (SrcEltBits < 32 && !Subtarget.hasBWI())
10473 continue;
10474
10475 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
10476 // Bail if the V2 elements are undef.
10477 unsigned NumHalfSrcElts = NumElts / Scale;
10478 unsigned NumSrcElts = 2 * NumHalfSrcElts;
10479 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
10480 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
10481 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
10482 continue;
10483
10484 // The elements beyond the truncation must be undef/zero.
10485 unsigned UpperElts = NumElts - NumSrcElts;
10486 if (UpperElts > 0 &&
10487 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
10488 continue;
10489 bool UndefUppers =
10490 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
10491
10492 // For offset truncations, ensure that the concat is cheap.
10493 if (Offset) {
10494 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
10495 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10496 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
10497 return Lo.getOperand(0) == Hi.getOperand(0);
10498 if (ISD::isNormalLoad(Lo.getNode()) &&
10499 ISD::isNormalLoad(Hi.getNode())) {
10500 auto *LDLo = cast<LoadSDNode>(Lo);
10501 auto *LDHi = cast<LoadSDNode>(Hi);
10502 return DAG.areNonVolatileConsecutiveLoads(
10503 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
10504 }
10505 return false;
10506 };
10507 if (!IsCheapConcat(peekThroughBitcasts(V1), peekThroughBitcasts(V2)))
10508 continue;
10509 }
10510
10511 // As we're using both sources, we need to concat them together
10512 // and truncate from the double-sized src.
10513 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
10514 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
10515
10516 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10517 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10518 Src = DAG.getBitcast(SrcVT, Src);
10519
10520 // Shift the offset'd elements into place for the truncation.
10521 // TODO: Use getTargetVShiftByConstNode.
10522 if (Offset)
10523 Src = DAG.getNode(
10524 X86ISD::VSRLI, DL, SrcVT, Src,
10525 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
10526
10527 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
10528 }
10529 }
10530
10531 return SDValue();
10532}
10533
10534/// Check whether a compaction lowering can be done by dropping even/odd
10535/// elements and compute how many times even/odd elements must be dropped.
10536///
10537/// This handles shuffles which take every Nth element where N is a power of
10538/// two. Example shuffle masks:
10539///
10540/// (even)
10541/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10542/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10543/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10544/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10545/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10546/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10547///
10548/// (odd)
10549/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
10550/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
10551///
10552/// Any of these lanes can of course be undef.
10553///
10554/// This routine only supports N <= 3.
10555 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
10556/// for larger N.
10557///
10558/// \returns N above, or the number of times even/odd elements must be dropped
10559/// if there is such a number. Otherwise returns zero.
10560static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
10561 bool IsSingleInput) {
10562 // The modulus for the shuffle vector entries is based on whether this is
10563 // a single input or not.
10564 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10565 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10566 "We should only be called with masks with a power-of-2 size!");
10567
10568 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10569 int Offset = MatchEven ? 0 : 1;
10570
10571 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10572 // and 2^3 simultaneously. This is because we may have ambiguity with
10573 // partially undef inputs.
10574 bool ViableForN[3] = {true, true, true};
10575
10576 for (int i = 0, e = Mask.size(); i < e; ++i) {
10577 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
10578 // want.
10579 if (Mask[i] < 0)
10580 continue;
10581
10582 bool IsAnyViable = false;
10583 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10584 if (ViableForN[j]) {
10585 uint64_t N = j + 1;
10586
10587 // The shuffle mask must be equal to (i * 2^N) % M.
10588 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
10589 IsAnyViable = true;
10590 else
10591 ViableForN[j] = false;
10592 }
10593 // Early exit if we exhaust the possible powers of two.
10594 if (!IsAnyViable)
10595 break;
10596 }
10597
10598 for (unsigned j = 0; j != std::size(ViableForN); ++j)
10599 if (ViableForN[j])
10600 return j + 1;
10601
10602 // Return 0 as there is no viable power of two.
10603 return 0;
10604}
10605
10606// X86 has dedicated pack instructions that can handle specific truncation
10607// operations: PACKSS and PACKUS.
10608// Checks for compaction shuffle masks if MaxStages > 1.
10609// TODO: Add support for matching multiple PACKSS/PACKUS stages.
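// For example, PACKUSWB takes two v8i16 inputs and produces one v16i8 by
// truncating each 16-bit element; the saturation it performs is a no-op here
// because MatchPACK only fires when the discarded upper bits are known zero
// (PACKUS) or are copies of the sign bit (PACKSS).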
10610static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
10611 unsigned &PackOpcode, ArrayRef<int> TargetMask,
10612 const SelectionDAG &DAG,
10613 const X86Subtarget &Subtarget,
10614 unsigned MaxStages = 1) {
10615 unsigned NumElts = VT.getVectorNumElements();
10616 unsigned BitSize = VT.getScalarSizeInBits();
10617 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
10618 "Illegal maximum compaction");
10619
10620 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
10621 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
10622 unsigned NumPackedBits = NumSrcBits - BitSize;
10623 N1 = peekThroughBitcasts(N1);
10624 N2 = peekThroughBitcasts(N2);
10625 unsigned NumBits1 = N1.getScalarValueSizeInBits();
10626 unsigned NumBits2 = N2.getScalarValueSizeInBits();
10627 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
10628 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
10629 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
10630 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
10631 return false;
10632 if (Subtarget.hasSSE41() || BitSize == 8) {
10633 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
10634 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
10635 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
10636 V1 = N1;
10637 V2 = N2;
10638 SrcVT = PackVT;
10639 PackOpcode = X86ISD::PACKUS;
10640 return true;
10641 }
10642 }
10643 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
10644 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
10645 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
10646 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
10647 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
10648 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
10649 V1 = N1;
10650 V2 = N2;
10651 SrcVT = PackVT;
10652 PackOpcode = X86ISD::PACKSS;
10653 return true;
10654 }
10655 return false;
10656 };
10657
10658 // Attempt to match against wider and wider compaction patterns.
10659 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
10660 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
10661 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
10662
10663 // Try binary shuffle.
10664 SmallVector<int, 32> BinaryMask;
10665 createPackShuffleMask(VT, BinaryMask, false, NumStages);
10666 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
10667 if (MatchPACK(V1, V2, PackVT))
10668 return true;
10669
10670 // Try unary shuffle.
10671 SmallVector<int, 32> UnaryMask;
10672 createPackShuffleMask(VT, UnaryMask, true, NumStages);
10673 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
10674 if (MatchPACK(V1, V1, PackVT))
10675 return true;
10676 }
10677
10678 return false;
10679}
10680
10681 static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, SDValue V1,
10682 SDValue V2, ArrayRef<int> Mask,
10683 const X86Subtarget &Subtarget,
10684 SelectionDAG &DAG) {
10685 MVT PackVT;
10686 unsigned PackOpcode;
10687 unsigned SizeBits = VT.getSizeInBits();
10688 unsigned EltBits = VT.getScalarSizeInBits();
10689 unsigned MaxStages = Log2_32(64 / EltBits);
10690 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
10691 Subtarget, MaxStages))
10692 return SDValue();
10693
10694 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
10695 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
10696
10697 // Don't lower multi-stage packs on AVX512, truncation is better.
10698 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
10699 return SDValue();
10700
10701 // Pack to the largest type possible:
10702 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
10703 unsigned MaxPackBits = 16;
10704 if (CurrentEltBits > 16 &&
10705 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
10706 MaxPackBits = 32;
10707
10708 // Repeatedly pack down to the target size.
10709 SDValue Res;
10710 for (unsigned i = 0; i != NumStages; ++i) {
10711 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
10712 unsigned NumSrcElts = SizeBits / SrcEltBits;
10713 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
10714 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
10715 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
10716 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
10717 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
10718 DAG.getBitcast(SrcVT, V2));
10719 V1 = V2 = Res;
10720 CurrentEltBits /= 2;
10721 }
10722 assert(Res && Res.getValueType() == VT &&
10723 "Failed to lower compaction shuffle");
10724 return Res;
10725}
10726
10727/// Try to emit a bitmask instruction for a shuffle.
10728///
10729/// This handles cases where we can model a blend exactly as a bitmask due to
10730/// one of the inputs being zeroable.
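/// For example, a v4i32 shuffle <0,1,Z,3> of V1 (Z = zeroable) becomes
/// AND(V1, <-1,-1,0,-1>).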
10731 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
10732 SDValue V2, ArrayRef<int> Mask,
10733 const APInt &Zeroable,
10734 const X86Subtarget &Subtarget,
10735 SelectionDAG &DAG) {
10736 MVT MaskVT = VT;
10737 MVT EltVT = VT.getVectorElementType();
10738 SDValue Zero, AllOnes;
10739 // Use f64 if i64 isn't legal.
10740 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
10741 EltVT = MVT::f64;
10742 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
10743 }
10744
10745 MVT LogicVT = VT;
10746 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
10747 Zero = DAG.getConstantFP(0.0, DL, EltVT);
10748 APFloat AllOnesValue = APFloat::getAllOnesValue(EltVT.getFltSemantics());
10749 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
10750 LogicVT =
10751 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
10752 } else {
10753 Zero = DAG.getConstant(0, DL, EltVT);
10754 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10755 }
10756
10757 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
10758 SDValue V;
10759 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10760 if (Zeroable[i])
10761 continue;
10762 if (Mask[i] % Size != i)
10763 return SDValue(); // Not a blend.
10764 if (!V)
10765 V = Mask[i] < Size ? V1 : V2;
10766 else if (V != (Mask[i] < Size ? V1 : V2))
10767 return SDValue(); // Can only let one input through the mask.
10768
10769 VMaskOps[i] = AllOnes;
10770 }
10771 if (!V)
10772 return SDValue(); // No non-zeroable elements!
10773
10774 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
10775 VMask = DAG.getBitcast(LogicVT, VMask);
10776 V = DAG.getBitcast(LogicVT, V);
10777 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
10778 return DAG.getBitcast(VT, And);
10779}
10780
10781/// Try to emit a blend instruction for a shuffle using bit math.
10782///
10783/// This is used as a fallback approach when first class blend instructions are
10784/// unavailable. Currently it is only suitable for integer vectors, but could
10785/// be generalized for floating point vectors if desirable.
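/// The result is a bit-select: (V1 & M) | (V2 & ~M), where M is all-ones in
/// the lanes taken from V1 and all-zeros in the lanes taken from V2.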
10786 static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
10787 SDValue V2, ArrayRef<int> Mask,
10788 SelectionDAG &DAG) {
10789 assert(VT.isInteger() && "Only supports integer vector types!");
10790 MVT EltVT = VT.getVectorElementType();
10791 SDValue Zero = DAG.getConstant(0, DL, EltVT);
10792 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
10793 SmallVector<SDValue, 16> MaskOps;
10794 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
10795 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
10796 return SDValue(); // Shuffled input!
10797 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
10798 }
10799
10800 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
10801 return getBitSelect(DL, VT, V1, V2, V1Mask, DAG);
10802}
10803
10804 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
10805 SDValue PreservedSrc,
10806 const X86Subtarget &Subtarget,
10807 SelectionDAG &DAG);
10808
10809 static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
10810 MutableArrayRef<int> Mask,
10811 const APInt &Zeroable, bool &ForceV1Zero,
10812 bool &ForceV2Zero, uint64_t &BlendMask) {
10813 bool V1IsZeroOrUndef =
10814 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
10815 bool V2IsZeroOrUndef =
10816 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
10817
10818 BlendMask = 0;
10819 ForceV1Zero = false, ForceV2Zero = false;
10820 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
10821
10822 int NumElts = Mask.size();
10823 int NumLanes = VT.getSizeInBits() / 128;
10824 int NumEltsPerLane = NumElts / NumLanes;
10825 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
10826
10827 // For 32/64-bit elements, if we only reference one input (plus any undefs),
10828 // then ensure the blend mask part for that lane just references that input.
10829 bool ForceWholeLaneMasks =
10830 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
10831
10832 // Attempt to generate the binary blend mask. If an input is zero then
10833 // we can use any lane.
10834 for (int Lane = 0; Lane != NumLanes; ++Lane) {
10835 // Keep track of the inputs used per lane.
10836 bool LaneV1InUse = false;
10837 bool LaneV2InUse = false;
10838 uint64_t LaneBlendMask = 0;
10839 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
10840 int Elt = (Lane * NumEltsPerLane) + LaneElt;
10841 int M = Mask[Elt];
10842 if (M == SM_SentinelUndef)
10843 continue;
10844 if (M == Elt || (0 <= M && M < NumElts &&
10845 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
10846 Mask[Elt] = Elt;
10847 LaneV1InUse = true;
10848 continue;
10849 }
10850 if (M == (Elt + NumElts) ||
10851 (NumElts <= M &&
10852 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
10853 LaneBlendMask |= 1ull << LaneElt;
10854 Mask[Elt] = Elt + NumElts;
10855 LaneV2InUse = true;
10856 continue;
10857 }
10858 if (Zeroable[Elt]) {
10859 if (V1IsZeroOrUndef) {
10860 ForceV1Zero = true;
10861 Mask[Elt] = Elt;
10862 LaneV1InUse = true;
10863 continue;
10864 }
10865 if (V2IsZeroOrUndef) {
10866 ForceV2Zero = true;
10867 LaneBlendMask |= 1ull << LaneElt;
10868 Mask[Elt] = Elt + NumElts;
10869 LaneV2InUse = true;
10870 continue;
10871 }
10872 }
10873 return false;
10874 }
10875
10876 // If we only used V2 then splat the lane blend mask to avoid any demanded
10877 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
10878 // blend mask bit).
10879 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
10880 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
10881
10882 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
10883 }
10884 return true;
10885}
10886
10887/// Try to emit a blend instruction for a shuffle.
10888///
10889/// This doesn't do any checks for the availability of instructions for blending
10890/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
10891/// be matched in the backend with the type given. What it does check for is
10892/// that the shuffle mask is a blend, or convertible into a blend with zero.
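/// For example, the v4f32 mask <0,5,2,7> takes elements 0 and 2 from V1 and
/// elements 1 and 3 from V2, lowering to BLENDPS with immediate 0b1010 (a set
/// bit selects the element from V2).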
10893 static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
10894 SDValue V2, ArrayRef<int> Original,
10895 const APInt &Zeroable,
10896 const X86Subtarget &Subtarget,
10897 SelectionDAG &DAG) {
10898 uint64_t BlendMask = 0;
10899 bool ForceV1Zero = false, ForceV2Zero = false;
10900 SmallVector<int, 64> Mask(Original);
10901 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
10902 BlendMask))
10903 return SDValue();
10904
10905 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
10906 if (ForceV1Zero)
10907 V1 = getZeroVector(VT, Subtarget, DAG, DL);
10908 if (ForceV2Zero)
10909 V2 = getZeroVector(VT, Subtarget, DAG, DL);
10910
10911 unsigned NumElts = VT.getVectorNumElements();
10912
10913 switch (VT.SimpleTy) {
10914 case MVT::v4i64:
10915 case MVT::v8i32:
10916 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
10917 [[fallthrough]];
10918 case MVT::v4f64:
10919 case MVT::v8f32:
10920 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
10921 [[fallthrough]];
10922 case MVT::v2f64:
10923 case MVT::v2i64:
10924 case MVT::v4f32:
10925 case MVT::v4i32:
10926 case MVT::v8i16:
10927 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
10928 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
10929 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10930 case MVT::v16i16: {
10931 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
10932 SmallVector<int, 8> RepeatedMask;
10933 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
10934 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
10935 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
10936 BlendMask = 0;
10937 for (int i = 0; i < 8; ++i)
10938 if (RepeatedMask[i] >= 8)
10939 BlendMask |= 1ull << i;
10940 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10941 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
10942 }
10943 // Use PBLENDW for lower/upper lanes and then blend lanes.
10944 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
10945 // merge to VSELECT where useful.
10946 uint64_t LoMask = BlendMask & 0xFF;
10947 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
10948 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
10949 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10950 DAG.getTargetConstant(LoMask, DL, MVT::i8));
10951 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
10952 DAG.getTargetConstant(HiMask, DL, MVT::i8));
10953 return DAG.getVectorShuffle(
10954 MVT::v16i16, DL, Lo, Hi,
10955 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
10956 }
10957 [[fallthrough]];
10958 }
10959 case MVT::v32i8:
10960 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
10961 [[fallthrough]];
10962 case MVT::v16i8: {
10963 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
10964
10965 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
10966 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
10967 Subtarget, DAG))
10968 return Masked;
10969
10970 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
10971 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
10972 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
10973 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
10974 }
10975
10976 // If we have VPTERNLOG, we can use that as a bit blend.
10977 if (Subtarget.hasVLX())
10978 if (SDValue BitBlend =
10979 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
10980 return BitBlend;
10981
10982 // Scale the blend by the number of bytes per element.
10983 int Scale = VT.getScalarSizeInBits() / 8;
10984
10985 // This form of blend is always done on bytes. Compute the byte vector
10986 // type.
10987 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10988
10989 // x86 allows load folding with blendvb from the 2nd source operand. But
10990 // we are still using LLVM select here (see comment below), so that's V1.
10991 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
10992 // allow that load-folding possibility.
10993 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
10994 ShuffleVectorSDNode::commuteMask(Mask);
10995 std::swap(V1, V2);
10996 }
10997
10998 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
10999 // mix of LLVM's code generator and the x86 backend. We tell the code
11000 // generator that boolean values in the elements of an x86 vector register
11001 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
11002 // mapping a select to operand #1, and 'false' mapping to operand #2. The
11003 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
11004 // of the element (the remaining are ignored) and 0 in that high bit would
11005 // mean operand #1 while 1 in the high bit would mean operand #2. So while
11006 // the LLVM model for boolean values in vector elements gets the relevant
11007 // bit set, it is set backwards and over constrained relative to x86's
11008 // actual model.
11009 SmallVector<SDValue, 32> VSELECTMask;
11010 for (int i = 0, Size = Mask.size(); i < Size; ++i)
11011 for (int j = 0; j < Scale; ++j)
11012 VSELECTMask.push_back(
11013 Mask[i] < 0
11014 ? DAG.getUNDEF(MVT::i8)
11015 : DAG.getSignedConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8));
11016
11017 V1 = DAG.getBitcast(BlendVT, V1);
11018 V2 = DAG.getBitcast(BlendVT, V2);
11019 return DAG.getBitcast(
11020 VT,
11021 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
11022 V1, V2));
11023 }
11024 case MVT::v16f32:
11025 case MVT::v8f64:
11026 case MVT::v8i64:
11027 case MVT::v16i32:
11028 case MVT::v32i16:
11029 case MVT::v64i8: {
11030 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
11031 bool OptForSize = DAG.shouldOptForSize();
11032 if (!OptForSize) {
11033 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
11034 Subtarget, DAG))
11035 return Masked;
11036 }
11037
11038 // Otherwise load an immediate into a GPR, cast to k-register, and use a
11039 // masked move.
11040 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
11041 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
11042 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
11043 }
11044 default:
11045 llvm_unreachable("Not a supported integer vector type!");
11046 }
11047}
11048
11049/// Try to lower as a blend of elements from two inputs followed by
11050/// a single-input permutation.
11051///
11052/// This matches the pattern where we can blend elements from two inputs and
11053/// then reduce the shuffle to a single-input permutation.
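/// For example, the v4i32 mask <6,0,5,3> is first blended to <0,5,6,3> and
/// then reduced to the single-input permute <2,0,1,3>.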
11054 static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
11055 SDValue V1, SDValue V2,
11056 ArrayRef<int> Mask,
11057 SelectionDAG &DAG,
11058 bool ImmBlends = false) {
11059 // We build up the blend mask while checking whether a blend is a viable way
11060 // to reduce the shuffle.
11061 SmallVector<int, 32> BlendMask(Mask.size(), -1);
11062 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
11063
11064 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11065 if (Mask[i] < 0)
11066 continue;
11067
11068 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
11069
11070 if (BlendMask[Mask[i] % Size] < 0)
11071 BlendMask[Mask[i] % Size] = Mask[i];
11072 else if (BlendMask[Mask[i] % Size] != Mask[i])
11073 return SDValue(); // Can't blend in the needed input!
11074
11075 PermuteMask[i] = Mask[i] % Size;
11076 }
11077
11078 // If only immediate blends, then bail if the blend mask can't be widened to
11079 // i16.
11080 unsigned EltSize = VT.getScalarSizeInBits();
11081 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
11082 return SDValue();
11083
11084 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
11085 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
11086}
11087
11088/// Try to lower as an unpack of elements from two inputs followed by
11089/// a single-input permutation.
11090///
11091/// This matches the pattern where we can unpack elements from two inputs and
11092/// then reduce the shuffle to a single-input (wider) permutation.
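/// For example, the v4i32 mask <3,6,2,7> becomes UNPCKH(V1,V2) = <2,6,3,7>
/// followed by the single-input permute <2,1,0,3>.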
11093 static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
11094 SDValue V1, SDValue V2,
11095 ArrayRef<int> Mask,
11096 SelectionDAG &DAG) {
11097 int NumElts = Mask.size();
11098 int NumLanes = VT.getSizeInBits() / 128;
11099 int NumLaneElts = NumElts / NumLanes;
11100 int NumHalfLaneElts = NumLaneElts / 2;
11101
11102 bool MatchLo = true, MatchHi = true;
11103 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
11104
11105 // Determine UNPCKL/UNPCKH type and operand order.
11106 for (int Elt = 0; Elt != NumElts; ++Elt) {
11107 int M = Mask[Elt];
11108 if (M < 0)
11109 continue;
11110
11111 // Normalize the mask value depending on whether it's V1 or V2.
11112 int NormM = M;
11113 SDValue &Op = Ops[Elt & 1];
11114 if (M < NumElts && (Op.isUndef() || Op == V1))
11115 Op = V1;
11116 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
11117 Op = V2;
11118 NormM -= NumElts;
11119 } else
11120 return SDValue();
11121
11122 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
11123 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
11124 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
11125 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
11126 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
11127 if (MatchLoAnyLane || MatchHiAnyLane) {
11128 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
11129 "Failed to match UNPCKLO/UNPCKHI");
11130 break;
11131 }
11132 }
11133 MatchLo &= MatchLoAnyLane;
11134 MatchHi &= MatchHiAnyLane;
11135 if (!MatchLo && !MatchHi)
11136 return SDValue();
11137 }
11138 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
11139
11140 // Element indices have changed after unpacking. Calculate permute mask
11141 // so that they will be put back to the positions dictated by the
11142 // original shuffle mask indices.
11143 SmallVector<int, 32> PermuteMask(NumElts, -1);
11144 for (int Elt = 0; Elt != NumElts; ++Elt) {
11145 int M = Mask[Elt];
11146 if (M < 0)
11147 continue;
11148 int NormM = M;
11149 if (NumElts <= M)
11150 NormM -= NumElts;
11151 bool IsFirstOp = M < NumElts;
11152 int BaseMaskElt =
11153 NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts));
11154 if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0]))
11155 PermuteMask[Elt] = BaseMaskElt;
11156 else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1]))
11157 PermuteMask[Elt] = BaseMaskElt + 1;
11158 assert(PermuteMask[Elt] != -1 &&
11159 "Input mask element is defined but failed to assign permute mask");
11160 }
11161
11162 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
11163 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
11164 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
11165}
11166
11167/// Try to lower a shuffle as a permute of the inputs followed by an
11168/// UNPCK instruction.
11169///
11170/// This specifically targets cases where we end up with alternating between
11171/// the two inputs, and so can permute them into something that feeds a single
11172/// UNPCK instruction. Note that this routine only targets integer vectors
11173/// because for floating point vectors we have a generalized SHUFPS lowering
11174/// strategy that handles everything that doesn't *exactly* match an unpack,
11175/// making this clever lowering unnecessary.
11176 static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11177 SDValue V1, SDValue V2,
11178 ArrayRef<int> Mask,
11179 const X86Subtarget &Subtarget,
11180 SelectionDAG &DAG) {
11181 int Size = Mask.size();
11182 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11183
11184 // This routine only supports 128-bit integer dual input vectors.
11185 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
11186 return SDValue();
11187
11188 int NumLoInputs =
11189 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11190 int NumHiInputs =
11191 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11192
11193 bool UnpackLo = NumLoInputs >= NumHiInputs;
11194
11195 auto TryUnpack = [&](int ScalarSize, int Scale) {
11196 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11197 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11198
11199 for (int i = 0; i < Size; ++i) {
11200 if (Mask[i] < 0)
11201 continue;
11202
11203 // Each element of the unpack contains Scale elements from this mask.
11204 int UnpackIdx = i / Scale;
11205
11206 // We only handle the case where V1 feeds the first slots of the unpack.
11207 // We rely on canonicalization to ensure this is the case.
11208 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11209 return SDValue();
11210
11211 // Set up the mask for this input. The indexing is tricky as we have to
11212 // handle the unpack stride.
11213 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11214 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11215 Mask[i] % Size;
11216 }
11217
11218 // If we will have to shuffle both inputs to use the unpack, check whether
11219 // we can just unpack first and shuffle the result. If so, skip this unpack.
11220 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11221 !isNoopShuffleMask(V2Mask))
11222 return SDValue();
11223
11224 // Shuffle the inputs into place.
11225 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11226 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11227
11228 // Cast the inputs to the type we will use to unpack them.
11229 MVT UnpackVT =
11230 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11231 V1 = DAG.getBitcast(UnpackVT, V1);
11232 V2 = DAG.getBitcast(UnpackVT, V2);
11233
11234 // Unpack the inputs and cast the result back to the desired type.
11235 return DAG.getBitcast(
11236 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11237 UnpackVT, V1, V2));
11238 };
11239
11240 // We try each unpack from the largest to the smallest to try and find one
11241 // that fits this mask.
11242 int OrigScalarSize = VT.getScalarSizeInBits();
11243 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11244 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11245 return Unpack;
11246
11247 // If we're shuffling with a zero vector then we're better off not doing
11248 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
11249 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
11250 ISD::isBuildVectorAllZeros(V2.getNode()))
11251 return SDValue();
11252
11253 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11254 // initial unpack.
11255 if (NumLoInputs == 0 || NumHiInputs == 0) {
11256 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11257 "We have to have *some* inputs!");
11258 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11259
11260 // FIXME: We could consider the total complexity of the permute of each
11261 // possible unpacking. Or at the least we should consider how many
11262 // half-crossings are created.
11263 // FIXME: We could consider commuting the unpacks.
11264
11265 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11266 for (int i = 0; i < Size; ++i) {
11267 if (Mask[i] < 0)
11268 continue;
11269
11270 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11271
11272 PermMask[i] =
11273 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11274 }
11275 return DAG.getVectorShuffle(
11276 VT, DL,
11277 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
11278 V1, V2),
11279 DAG.getUNDEF(VT), PermMask);
11280 }
11281
11282 return SDValue();
11283}
11284
11285/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
11286/// permuting the elements of the result in place.
11287 static SDValue lowerShuffleAsByteRotateAndPermute(
11288 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11289 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11290 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
11291 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
11292 (VT.is512BitVector() && !Subtarget.hasBWI()))
11293 return SDValue();
11294
11295 // We don't currently support lane crossing permutes.
11296 if (is128BitLaneCrossingShuffleMask(VT, Mask))
11297 return SDValue();
11298
11299 int Scale = VT.getScalarSizeInBits() / 8;
11300 int NumLanes = VT.getSizeInBits() / 128;
11301 int NumElts = VT.getVectorNumElements();
11302 int NumEltsPerLane = NumElts / NumLanes;
11303
11304 // Determine range of mask elts.
11305 bool Blend1 = true;
11306 bool Blend2 = true;
11307 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
11308 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
11309 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11310 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11311 int M = Mask[Lane + Elt];
11312 if (M < 0)
11313 continue;
11314 if (M < NumElts) {
11315 Blend1 &= (M == (Lane + Elt));
11316 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11317 M = M % NumEltsPerLane;
11318 Range1.first = std::min(Range1.first, M);
11319 Range1.second = std::max(Range1.second, M);
11320 } else {
11321 M -= NumElts;
11322 Blend2 &= (M == (Lane + Elt));
11323 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
11324 M = M % NumEltsPerLane;
11325 Range2.first = std::min(Range2.first, M);
11326 Range2.second = std::max(Range2.second, M);
11327 }
11328 }
11329 }
11330
11331 // Bail if we don't need both elements.
11332 // TODO - it might be worth doing this for unary shuffles if the permute
11333 // can be widened.
11334 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
11335 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
11336 return SDValue();
11337
11338 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
11339 return SDValue();
11340
11341 // Rotate the 2 ops so we can access both ranges, then permute the result.
11342 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
11343 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11344 SDValue Rotate = DAG.getBitcast(
11345 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
11346 DAG.getBitcast(ByteVT, Lo),
11347 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
11348 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
11349 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
11350 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
11351 int M = Mask[Lane + Elt];
11352 if (M < 0)
11353 continue;
11354 if (M < NumElts)
11355 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
11356 else
11357 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
11358 }
11359 }
11360 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
11361 };
11362
11363 // Check if the ranges are small enough to rotate from either direction.
11364 if (Range2.second < Range1.first)
11365 return RotateAndPermute(V1, V2, Range1.first, 0);
11366 if (Range1.second < Range2.first)
11367 return RotateAndPermute(V2, V1, Range2.first, NumElts);
11368 return SDValue();
11369}
11370
11371 static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
11372 return isUndefOrEqual(Mask, 0);
11373}
11374
11375 static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
11376 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
11377}
11378
11379/// Check if the Mask consists of the same element repeated multiple times.
11380 static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
11381 size_t NumUndefs = 0;
11382 std::optional<int> UniqueElt;
11383 for (int Elt : Mask) {
11384 if (Elt == SM_SentinelUndef) {
11385 NumUndefs++;
11386 continue;
11387 }
11388 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
11389 return false;
11390 UniqueElt = Elt;
11391 }
11392 // Make sure the element is repeated enough times by checking that the number
11393 // of undefs is small.
11394 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
11395}
11396
11397/// Generic routine to decompose a shuffle and blend into independent
11398/// blends and permutes.
11399///
11400/// This matches the extremely common pattern for handling combined
11401/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
11402/// operations. It will try to pick the best arrangement of shuffles and
11403/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
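/// For example, the v4i32 mask <3,5,1,7> decomposes into V1 shuffled by
/// <3,u,1,u>, V2 shuffled by <u,1,u,3>, and a final blend <0,5,2,7>.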
11404 static SDValue lowerShuffleAsDecomposedShuffleMerge(
11405 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11406 const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11407 int NumElts = Mask.size();
11408 int NumLanes = VT.getSizeInBits() / 128;
11409 int NumEltsPerLane = NumElts / NumLanes;
11410
11411 // Shuffle the input elements into the desired positions in V1 and V2 and
11412 // unpack/blend them together.
11413 bool IsAlternating = true;
11414 bool V1Zero = true, V2Zero = true;
11415 SmallVector<int, 32> V1Mask(NumElts, -1);
11416 SmallVector<int, 32> V2Mask(NumElts, -1);
11417 SmallVector<int, 32> FinalMask(NumElts, -1);
11418 for (int i = 0; i < NumElts; ++i) {
11419 int M = Mask[i];
11420 if (M >= 0 && M < NumElts) {
11421 V1Mask[i] = M;
11422 FinalMask[i] = i;
11423 V1Zero &= Zeroable[i];
11424 IsAlternating &= (i & 1) == 0;
11425 } else if (M >= NumElts) {
11426 V2Mask[i] = M - NumElts;
11427 FinalMask[i] = i + NumElts;
11428 V2Zero &= Zeroable[i];
11429 IsAlternating &= (i & 1) == 1;
11430 }
11431 }
11432
11433 // If we effectively only demand the 0'th element of \p Input (though not
11434 // necessarily only as the 0'th result element), then broadcast said input
11435 // and change \p InputMask to be a no-op (identity) mask.
11436 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
11437 &DAG](SDValue &Input,
11438 MutableArrayRef<int> InputMask) {
11439 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
11440 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
11441 !X86::mayFoldLoad(Input, Subtarget)))
11442 return;
11443 if (isNoopShuffleMask(InputMask))
11444 return;
11445 assert(isBroadcastShuffleMask(InputMask) &&
11446 "Expected to demand only the 0'th element.");
11447 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
11448 for (auto I : enumerate(InputMask)) {
11449 int &InputMaskElt = I.value();
11450 if (InputMaskElt >= 0)
11451 InputMaskElt = I.index();
11452 }
11453 };
11454
11455 // Currently, we may need to produce one shuffle per input, and blend results.
11456 // It is possible that the shuffle for one of the inputs is already a no-op.
11457 // See if we can simplify non-no-op shuffles into broadcasts,
11458 // which we consider to be strictly better than an arbitrary shuffle.
11459 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
11460 isNoopOrBroadcastShuffleMask(V2Mask)) {
11461 canonicalizeBroadcastableInput(V1, V1Mask);
11462 canonicalizeBroadcastableInput(V2, V2Mask);
11463 }
11464
11465 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
11466 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
11467 // the shuffle may be able to fold with a load or other benefit. However, when
11468 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
11469 // pre-shuffle first is a better strategy.
11470 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
11471 // Only prefer immediate blends to unpack/rotate.
11472 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11473 DAG, true))
11474 return BlendPerm;
11475 // If either input vector provides only a single element which is repeated
11476 // multiple times, unpacking from both input vectors would generate worse
11477 // code. e.g. for
11478 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
11479 // it is better to process t4 first to create a vector of t4[0], then unpack
11480 // that vector with t2.
11481 if (!V1Zero && !V2Zero && !isSingleElementRepeatedMask(V1Mask) &&
11482 !isSingleElementRepeatedMask(V2Mask))
11483 if (SDValue UnpackPerm =
11484 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
11485 return UnpackPerm;
11486 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
11487 DL, VT, V1, V2, Mask, Subtarget, DAG))
11488 return RotatePerm;
11489 // Unpack/rotate failed - try again with variable blends.
11490 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
11491 DAG))
11492 return BlendPerm;
11493 if (VT.getScalarSizeInBits() >= 32)
11494 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
11495 DL, VT, V1, V2, Mask, Subtarget, DAG))
11496 return PermUnpack;
11497 }
11498
11499 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
11500 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
11501 // TODO: It doesn't have to be alternating - but each lane mustn't have more
11502 // than half the elements coming from each source.
11503 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
11504 V1Mask.assign(NumElts, -1);
11505 V2Mask.assign(NumElts, -1);
11506 FinalMask.assign(NumElts, -1);
11507 for (int i = 0; i != NumElts; i += NumEltsPerLane)
11508 for (int j = 0; j != NumEltsPerLane; ++j) {
11509 int M = Mask[i + j];
11510 if (M >= 0 && M < NumElts) {
11511 V1Mask[i + (j / 2)] = M;
11512 FinalMask[i + j] = i + (j / 2);
11513 } else if (M >= NumElts) {
11514 V2Mask[i + (j / 2)] = M - NumElts;
11515 FinalMask[i + j] = i + (j / 2) + NumElts;
11516 }
11517 }
11518 }
11519
11520 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11521 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11522 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
11523}
11524
11525static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
11526 const X86Subtarget &Subtarget,
11527 ArrayRef<int> Mask) {
11528 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11529 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
11530
11531 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
11532 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
11533 int MaxSubElts = 64 / EltSizeInBits;
11534 unsigned RotateAmt, NumSubElts;
11535 if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts,
11536 MaxSubElts, NumSubElts, RotateAmt))
11537 return -1;
11538 unsigned NumElts = Mask.size();
11539 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
11540 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
11541 return RotateAmt;
11542}
11543
11544/// Lower shuffle using X86ISD::VROTLI rotations.
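/// For example, the v16i8 mask <3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14>
/// rotates each 32-bit chunk left by 8 bits and can lower to a single
/// X86ISD::VROTLI (VPROLD) on AVX512 targets.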
11545 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
11546 ArrayRef<int> Mask,
11547 const X86Subtarget &Subtarget,
11548 SelectionDAG &DAG) {
11549 // Only XOP + AVX512 targets have bit rotation instructions.
11550 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
11551 bool IsLegal =
11552 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
11553 if (!IsLegal && Subtarget.hasSSE3())
11554 return SDValue();
11555
11556 MVT RotateVT;
11557 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
11558 Subtarget, Mask);
11559 if (RotateAmt < 0)
11560 return SDValue();
11561
11562 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
11563 // expanded to OR(SRL,SHL), will be more efficient, but if they can
11564 // widen to vXi16 or more then the existing lowering should be better.
11565 if (!IsLegal) {
11566 if ((RotateAmt % 16) == 0)
11567 return SDValue();
11568 // TODO: Use getTargetVShiftByConstNode.
11569 unsigned ShlAmt = RotateAmt;
11570 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
11571 V1 = DAG.getBitcast(RotateVT, V1);
11572 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
11573 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
11574 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
11575 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
11576 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
11577 return DAG.getBitcast(VT, Rot);
11578 }
11579
11580 SDValue Rot =
11581 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
11582 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
11583 return DAG.getBitcast(VT, Rot);
11584}
11585
11586/// Try to match a vector shuffle as an element rotation.
11587///
11588 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
11589 static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
11590 ArrayRef<int> Mask) {
11591 int NumElts = Mask.size();
11592
11593 // We need to detect various ways of spelling a rotation:
11594 // [11, 12, 13, 14, 15, 0, 1, 2]
11595 // [-1, 12, 13, 14, -1, -1, 1, -1]
11596 // [-1, -1, -1, -1, -1, -1, 1, 2]
11597 // [ 3, 4, 5, 6, 7, 8, 9, 10]
11598 // [-1, 4, 5, 6, -1, -1, 9, -1]
11599 // [-1, 4, 5, 6, -1, -1, -1, -1]
11600 int Rotation = 0;
11601 SDValue Lo, Hi;
11602 for (int i = 0; i < NumElts; ++i) {
11603 int M = Mask[i];
11604 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
11605 "Unexpected mask index.");
11606 if (M < 0)
11607 continue;
11608
11609 // Determine where a rotated vector would have started.
11610 int StartIdx = i - (M % NumElts);
11611 if (StartIdx == 0)
11612 // The identity rotation isn't interesting, stop.
11613 return -1;
11614
11615 // If we found the tail of a vector the rotation must be the missing
11616 // front. If we found the head of a vector, it must be how much of the
11617 // head.
11618 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
11619
11620 if (Rotation == 0)
11621 Rotation = CandidateRotation;
11622 else if (Rotation != CandidateRotation)
11623 // The rotations don't match, so we can't match this mask.
11624 return -1;
11625
11626 // Compute which value this mask is pointing at.
11627 SDValue MaskV = M < NumElts ? V1 : V2;
11628
11629 // Compute which of the two target values this index should be assigned
11630 // to. This reflects whether the high elements are remaining or the low
11631 // elements are remaining.
11632 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
11633
11634 // Either set up this value if we've not encountered it before, or check
11635 // that it remains consistent.
11636 if (!TargetV)
11637 TargetV = MaskV;
11638 else if (TargetV != MaskV)
11639 // This may be a rotation, but it pulls from the inputs in some
11640 // unsupported interleaving.
11641 return -1;
11642 }
11643
11644 // Check that we successfully analyzed the mask, and normalize the results.
11645 assert(Rotation != 0 && "Failed to locate a viable rotation!");
11646 assert((Lo || Hi) && "Failed to find a rotated input vector!");
11647 if (!Lo)
11648 Lo = Hi;
11649 else if (!Hi)
11650 Hi = Lo;
11651
11652 V1 = Lo;
11653 V2 = Hi;
11654
11655 return Rotation;
11656}
11657
11658/// Try to lower a vector shuffle as a byte rotation.
11659///
11660/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
11661/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
11662/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
11663 /// try to generically lower a vector shuffle through such a pattern. It
11664/// does not check for the profitability of lowering either as PALIGNR or
11665/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
11666/// This matches shuffle vectors that look like:
11667///
11668/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
11669///
11670/// Essentially it concatenates V1 and V2, shifts right by some number of
11671/// elements, and takes the low elements as the result. Note that while this is
11672/// specified as a *right shift* because x86 is little-endian, it is a *left
11673/// rotate* of the vector lanes.
11674 static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
11675 ArrayRef<int> Mask) {
11676 // Don't accept any shuffles with zero elements.
11677 if (isAnyZero(Mask))
11678 return -1;
11679
11680 // PALIGNR works on 128-bit lanes.
11681 SmallVector<int, 16> RepeatedMask;
11682 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
11683 return -1;
11684
11685 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
11686 if (Rotation <= 0)
11687 return -1;
11688
11689 // PALIGNR rotates bytes, so we need to scale the
11690 // rotation based on how many bytes are in the vector lane.
11691 int NumElts = RepeatedMask.size();
11692 int Scale = 16 / NumElts;
11693 return Rotation * Scale;
11694}
11695
11696 static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
11697 SDValue V2, ArrayRef<int> Mask,
11698 const X86Subtarget &Subtarget,
11699 SelectionDAG &DAG) {
11700 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11701
11702 SDValue Lo = V1, Hi = V2;
11703 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
11704 if (ByteRotation <= 0)
11705 return SDValue();
11706
11707 // Cast the inputs to i8 vector of correct length to match PALIGNR or
11708 // PSLLDQ/PSRLDQ.
11709 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
11710 Lo = DAG.getBitcast(ByteVT, Lo);
11711 Hi = DAG.getBitcast(ByteVT, Hi);
11712
11713 // SSSE3 targets can use the palignr instruction.
11714 if (Subtarget.hasSSSE3()) {
11715 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
11716 "512-bit PALIGNR requires BWI instructions");
11717 return DAG.getBitcast(
11718 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
11719 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
11720 }
11721
11722 assert(VT.is128BitVector() &&
11723 "Rotate-based lowering only supports 128-bit lowering!");
11724 assert(Mask.size() <= 16 &&
11725 "Can shuffle at most 16 bytes in a 128-bit vector!");
11726 assert(ByteVT == MVT::v16i8 &&
11727 "SSE2 rotate lowering only needed for v16i8!");
11728
11729 // Default SSE2 implementation
11730 int LoByteShift = 16 - ByteRotation;
11731 int HiByteShift = ByteRotation;
11732
11733 SDValue LoShift =
11734 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
11735 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
11736 SDValue HiShift =
11737 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
11738 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
11739 return DAG.getBitcast(VT,
11740 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
11741}
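// Worked example (illustrative only) of the pre-SSSE3 fallback above: for a
// byte rotation of 6 on v16i8, LoByteShift = 16 - 6 = 10 and HiByteShift = 6,
// so the emitted sequence is
//   pslldq $10, Lo      ; X86ISD::VSHLDQ
//   psrldq $6,  Hi      ; X86ISD::VSRLDQ
//   por    Lo,  Hi      ; ISD::OR
// which selects the same 16 bytes that palignr $6 would.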
11742
11743/// Try to lower a vector shuffle as a dword/qword rotation.
11744///
11745 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
11746 /// rotation of the concatenation of two vectors; this routine will
11747 /// try to generically lower a vector shuffle through such a pattern.
11748///
11749/// Essentially it concatenates V1 and V2, shifts right by some number of
11750/// elements, and takes the low elements as the result. Note that while this is
11751/// specified as a *right shift* because x86 is little-endian, it is a *left
11752/// rotate* of the vector lanes.
11753 static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
11754                                     SDValue V2, ArrayRef<int> Mask,
11755 const APInt &Zeroable,
11756 const X86Subtarget &Subtarget,
11757 SelectionDAG &DAG) {
11758 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
11759 "Only 32-bit and 64-bit elements are supported!");
11760
11761 // 128/256-bit vectors are only supported with VLX.
11762 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
11763 && "VLX required for 128/256-bit vectors");
11764
11765 SDValue Lo = V1, Hi = V2;
11766 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
11767 if (0 < Rotation)
11768 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
11769 DAG.getTargetConstant(Rotation, DL, MVT::i8));
11770
11771 // See if we can use VALIGN as a cross-lane version of VSHLDQ/VSRLDQ.
11772 // TODO: Pull this out as a matchShuffleAsElementShift helper?
11773 // TODO: We can probably make this more aggressive and use shift-pairs like
11774 // lowerShuffleAsByteShiftMask.
11775 unsigned NumElts = Mask.size();
11776 unsigned ZeroLo = Zeroable.countr_one();
11777 unsigned ZeroHi = Zeroable.countl_one();
11778 assert((ZeroLo + ZeroHi) < NumElts && "Zeroable shuffle detected");
11779 if (!ZeroLo && !ZeroHi)
11780 return SDValue();
11781
11782 if (ZeroLo) {
11783 SDValue Src = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11784 int Low = Mask[ZeroLo] < (int)NumElts ? 0 : NumElts;
11785 if (isSequentialOrUndefInRange(Mask, ZeroLo, NumElts - ZeroLo, Low))
11786 return DAG.getNode(X86ISD::VALIGN, DL, VT, Src,
11787 getZeroVector(VT, Subtarget, DAG, DL),
11788 DAG.getTargetConstant(NumElts - ZeroLo, DL, MVT::i8));
11789 }
11790
11791 if (ZeroHi) {
11792 SDValue Src = Mask[0] < (int)NumElts ? V1 : V2;
11793 int Low = Mask[0] < (int)NumElts ? 0 : NumElts;
11794 if (isSequentialOrUndefInRange(Mask, 0, NumElts - ZeroHi, Low + ZeroHi))
11795 return DAG.getNode(X86ISD::VALIGN, DL, VT,
11796 getZeroVector(VT, Subtarget, DAG, DL), Src,
11797 DAG.getTargetConstant(ZeroHi, DL, MVT::i8));
11798 }
11799
11800 return SDValue();
11801}
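// Illustrative example of the zero-extending VALIGN case above: for a v8i32
// shuffle with mask [2, 3, 4, 5, 6, 7, Z, Z] (the top two elements zeroable),
// ZeroHi == 2 and the remaining elements are sequential from V1[2], so this
// emits X86ISD::VALIGN(Zero, V1, 2): a single valignd that pulls V1[2..7]
// into lanes 0..5 and shifts zeros into lanes 6..7.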
11802
11803/// Try to lower a vector shuffle as a byte shift sequence.
11804 static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
11805                                            SDValue V2, ArrayRef<int> Mask,
11806 const APInt &Zeroable,
11807 const X86Subtarget &Subtarget,
11808 SelectionDAG &DAG) {
11809 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
11810 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
11811
11812 // We need a shuffle that has zeros at one/both ends and a sequential
11813 // shuffle from one source within.
11814 unsigned ZeroLo = Zeroable.countr_one();
11815 unsigned ZeroHi = Zeroable.countl_one();
11816 if (!ZeroLo && !ZeroHi)
11817 return SDValue();
11818
11819 unsigned NumElts = Mask.size();
11820 unsigned Len = NumElts - (ZeroLo + ZeroHi);
11821 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
11822 return SDValue();
11823
11824 unsigned Scale = VT.getScalarSizeInBits() / 8;
11825 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
11826 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
11827 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
11828 return SDValue();
11829
11830 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
11831 Res = DAG.getBitcast(MVT::v16i8, Res);
11832
11833 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
11834 // inner sequential set of elements, possibly offset:
11835 // 01234567 --> zzzzzz01 --> 1zzzzzzz
11836 // 01234567 --> 4567zzzz --> zzzzz456
11837 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
11838 if (ZeroLo == 0) {
11839 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11840 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11841 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11842 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11843 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
11844 } else if (ZeroHi == 0) {
11845 unsigned Shift = Mask[ZeroLo] % NumElts;
11846 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11847 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11848 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11849 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11850 } else if (!Subtarget.hasSSSE3()) {
11851 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
11852 // by performing 3 byte shifts. Shuffle combining can kick in above that.
11853 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
11854 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
11855 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11856 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11857 Shift += Mask[ZeroLo] % NumElts;
11858 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
11859 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
11860 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
11861 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
11862 } else
11863 return SDValue();
11864
11865 return DAG.getBitcast(VT, Res);
11866}
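// Worked example (illustrative only) of the 3-shift case above, using the
// v8i16 mask [Z, Z, 3, 4, 5, 6, Z, Z] (Scale == 2 bytes per element):
//   pslldq $2   ; 01234567 --> z0123456
//   psrldq $8   ; z0123456 --> 3456zzzz
//   pslldq $4   ; 3456zzzz --> zz3456zz
// Note that this path is only taken when PSHUFB is unavailable.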
11867
11868/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
11869///
11870/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
11871/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
11872/// matches elements from one of the input vectors shuffled to the left or
11873/// right with zeroable elements 'shifted in'. It handles both the strictly
11874/// bit-wise element shifts and the byte shift across an entire 128-bit double
11875/// quad word lane.
11876///
11877 /// PSLL : (little-endian) left bit shift.
11878/// [ zz, 0, zz, 2 ]
11879/// [ -1, 4, zz, -1 ]
11880/// PSRL : (little-endian) right bit shift.
11881/// [ 1, zz, 3, zz]
11882/// [ -1, -1, 7, zz]
11883/// PSLLDQ : (little-endian) left byte shift
11884/// [ zz, 0, 1, 2, 3, 4, 5, 6]
11885/// [ zz, zz, -1, -1, 2, 3, 4, -1]
11886/// [ zz, zz, zz, zz, zz, zz, -1, 1]
11887/// PSRLDQ : (little-endian) right byte shift
11888/// [ 5, 6, 7, zz, zz, zz, zz, zz]
11889/// [ -1, 5, 6, 7, zz, zz, zz, zz]
11890/// [ 1, 2, -1, -1, -1, -1, zz, zz]
11891static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
11892 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
11893 int MaskOffset, const APInt &Zeroable,
11894 const X86Subtarget &Subtarget) {
11895 int Size = Mask.size();
11896 unsigned SizeInBits = Size * ScalarSizeInBits;
11897
11898 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
11899 for (int i = 0; i < Size; i += Scale)
11900 for (int j = 0; j < Shift; ++j)
11901 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
11902 return false;
11903
11904 return true;
11905 };
11906
11907 auto MatchShift = [&](int Shift, int Scale, bool Left) {
11908 for (int i = 0; i != Size; i += Scale) {
11909 unsigned Pos = Left ? i + Shift : i;
11910 unsigned Low = Left ? i : i + Shift;
11911 unsigned Len = Scale - Shift;
11912 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
11913 return -1;
11914 }
11915
11916 int ShiftEltBits = ScalarSizeInBits * Scale;
11917 bool ByteShift = ShiftEltBits > 64;
11918 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
11919 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
11920 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
11921
11922 // Normalize the scale for byte shifts to still produce an i64 element
11923 // type.
11924 Scale = ByteShift ? Scale / 2 : Scale;
11925
11926 // We need to round trip through the appropriate type for the shift.
11927 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
11928 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
11929 : MVT::getVectorVT(ShiftSVT, Size / Scale);
11930 return (int)ShiftAmt;
11931 };
11932
11933 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
11934 // keep doubling the size of the integer elements up to that. We can
11935 // then shift the elements of the integer vector by whole multiples of
11936 // their width within the elements of the larger integer vector. Test each
11937 // multiple to see if we can find a match with the moved element indices
11938 // and that the shifted in elements are all zeroable.
11939 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
11940 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
11941 for (int Shift = 1; Shift != Scale; ++Shift)
11942 for (bool Left : {true, false})
11943 if (CheckZeros(Shift, Scale, Left)) {
11944 int ShiftAmt = MatchShift(Shift, Scale, Left);
11945 if (0 < ShiftAmt)
11946 return ShiftAmt;
11947 }
11948
11949 // no match
11950 return -1;
11951}
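// Worked example (illustrative only): the v4i32 mask [Z, 0, Z, 2] matches at
// Scale == 2, Shift == 1, Left == true, so the elements are treated as v2i64
// and the match is a VSHLI (psllq) by 1 * 32 == 32 bits, leaving zeros in the
// low half of each 64-bit element.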
11952
11953 static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
11954                                    SDValue V2, ArrayRef<int> Mask,
11955 const APInt &Zeroable,
11956 const X86Subtarget &Subtarget,
11957 SelectionDAG &DAG, bool BitwiseOnly) {
11958 int Size = Mask.size();
11959 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11960
11961 MVT ShiftVT;
11962 SDValue V = V1;
11963 unsigned Opcode;
11964
11965 // Try to match shuffle against V1 shift.
11966 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11967 Mask, 0, Zeroable, Subtarget);
11968
11969 // If V1 failed, try to match shuffle against V2 shift.
11970 if (ShiftAmt < 0) {
11971 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
11972 Mask, Size, Zeroable, Subtarget);
11973 V = V2;
11974 }
11975
11976 if (ShiftAmt < 0)
11977 return SDValue();
11978
11979 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
11980 return SDValue();
11981
11982 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
11983 "Illegal integer vector type");
11984 V = DAG.getBitcast(ShiftVT, V);
11985 V = DAG.getNode(Opcode, DL, ShiftVT, V,
11986 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
11987 return DAG.getBitcast(VT, V);
11988}
11989
11990// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
11991// Remainder of lower half result is zero and upper half is all undef.
11992static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
11993 ArrayRef<int> Mask, uint64_t &BitLen,
11994 uint64_t &BitIdx, const APInt &Zeroable) {
11995 int Size = Mask.size();
11996 int HalfSize = Size / 2;
11997 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
11998 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
11999
12000 // Upper half must be undefined.
12001 if (!isUndefUpperHalf(Mask))
12002 return false;
12003
12004 // Determine the extraction length from the part of the
12005 // lower half that isn't zeroable.
12006 int Len = HalfSize;
12007 for (; Len > 0; --Len)
12008 if (!Zeroable[Len - 1])
12009 break;
12010 assert(Len > 0 && "Zeroable shuffle mask");
12011
12012 // Attempt to match first Len sequential elements from the lower half.
12013 SDValue Src;
12014 int Idx = -1;
12015 for (int i = 0; i != Len; ++i) {
12016 int M = Mask[i];
12017 if (M == SM_SentinelUndef)
12018 continue;
12019 SDValue &V = (M < Size ? V1 : V2);
12020 M = M % Size;
12021
12022 // The extracted elements must start at a valid index and all mask
12023 // elements must be in the lower half.
12024 if (i > M || M >= HalfSize)
12025 return false;
12026
12027 if (Idx < 0 || (Src == V && Idx == (M - i))) {
12028 Src = V;
12029 Idx = M - i;
12030 continue;
12031 }
12032 return false;
12033 }
12034
12035 if (!Src || Idx < 0)
12036 return false;
12037
12038 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
12039 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12040 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12041 V1 = Src;
12042 return true;
12043}
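// Illustrative example: the v8i16 mask [1, 2, 3, Z, u, u, u, u] (upper half
// undef, element 3 zeroable) matches with Src == V1, Idx == 1 and Len == 3,
// giving BitLen = 3 * 16 = 48 and BitIdx = 1 * 16 = 16, i.e. an EXTRQI that
// extracts 48 bits starting at bit 16 of the low quadword.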
12044
12045// INSERTQ: Extract lowest Len elements from lower half of second source and
12046// insert over first source, starting at Idx.
12047// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
12048static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
12049 ArrayRef<int> Mask, uint64_t &BitLen,
12050 uint64_t &BitIdx) {
12051 int Size = Mask.size();
12052 int HalfSize = Size / 2;
12053 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
12054
12055 // Upper half must be undefined.
12056 if (!isUndefUpperHalf(Mask))
12057 return false;
12058
12059 for (int Idx = 0; Idx != HalfSize; ++Idx) {
12060 SDValue Base;
12061
12062 // Attempt to match first source from mask before insertion point.
12063 if (isUndefInRange(Mask, 0, Idx)) {
12064 /* EMPTY */
12065 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
12066 Base = V1;
12067 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
12068 Base = V2;
12069 } else {
12070 continue;
12071 }
12072
12073 // Extend the extraction length looking to match both the insertion of
12074 // the second source and the remaining elements of the first.
12075 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
12076 SDValue Insert;
12077 int Len = Hi - Idx;
12078
12079 // Match insertion.
12080 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
12081 Insert = V1;
12082 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
12083 Insert = V2;
12084 } else {
12085 continue;
12086 }
12087
12088 // Match the remaining elements of the lower half.
12089 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
12090 /* EMPTY */
12091 } else if ((!Base || (Base == V1)) &&
12092 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
12093 Base = V1;
12094 } else if ((!Base || (Base == V2)) &&
12095 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
12096 Size + Hi)) {
12097 Base = V2;
12098 } else {
12099 continue;
12100 }
12101
12102 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
12103 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
12104 V1 = Base;
12105 V2 = Insert;
12106 return true;
12107 }
12108 }
12109
12110 return false;
12111}
12112
12113/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
12114 static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
12115                                      SDValue V2, ArrayRef<int> Mask,
12116 const APInt &Zeroable, SelectionDAG &DAG) {
12117 uint64_t BitLen, BitIdx;
12118 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
12119 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
12120 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12121 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12122
12123 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
12124 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
12125 V2 ? V2 : DAG.getUNDEF(VT),
12126 DAG.getTargetConstant(BitLen, DL, MVT::i8),
12127 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
12128
12129 return SDValue();
12130}
12131
12132/// Lower a vector shuffle as a zero or any extension.
12133///
12134/// Given a specific number of elements, element bit width, and extension
12135/// stride, produce either a zero or any extension based on the available
12136 /// features of the subtarget. The extended elements are consecutive and
12137 /// can start from an offset element index in the input; to avoid excess
12138 /// shuffling, the offset must either be in the bottom lane or at the start
12139 /// of a higher lane. All extended elements must be from
12140/// the same lane.
12141 static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
12142     const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
12143 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12144 assert(Scale > 1 && "Need a scale to extend.");
12145 int EltBits = VT.getScalarSizeInBits();
12146 int NumElements = VT.getVectorNumElements();
12147 int NumEltsPerLane = 128 / EltBits;
12148 int OffsetLane = Offset / NumEltsPerLane;
12149 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
12150 "Only 8, 16, and 32 bit elements can be extended.");
12151 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
12152 assert(0 <= Offset && "Extension offset must be positive.");
12153 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
12154 "Extension offset must be in the first lane or start an upper lane.");
12155
12156 // Check that an index is in same lane as the base offset.
12157 auto SafeOffset = [&](int Idx) {
12158 return OffsetLane == (Idx / NumEltsPerLane);
12159 };
12160
12161 // Shift along an input so that the offset base moves to the first element.
12162 auto ShuffleOffset = [&](SDValue V) {
12163 if (!Offset)
12164 return V;
12165
12166 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12167 for (int i = 0; i * Scale < NumElements; ++i) {
12168 int SrcIdx = i + Offset;
12169 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
12170 }
12171 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
12172 };
12173
12174 // Found a valid a/zext mask! Try various lowering strategies based on the
12175 // input type and available ISA extensions.
12176 if (Subtarget.hasSSE41()) {
12177 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
12178 // PUNPCK will catch this in a later shuffle match.
12179 if (Offset && Scale == 2 && VT.is128BitVector())
12180 return SDValue();
12181 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
12182 NumElements / Scale);
12183 InputV = DAG.getBitcast(VT, InputV);
12184 InputV = ShuffleOffset(InputV);
12185     InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
12186                                     DL, ExtVT, InputV, DAG);
12187 return DAG.getBitcast(VT, InputV);
12188 }
12189
12190 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
12191 InputV = DAG.getBitcast(VT, InputV);
12192
12193 // For any extends we can cheat for larger element sizes and use shuffle
12194 // instructions that can fold with a load and/or copy.
12195 if (AnyExt && EltBits == 32) {
12196 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
12197 -1};
12198 return DAG.getBitcast(
12199 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12200 DAG.getBitcast(MVT::v4i32, InputV),
12201 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12202 }
12203 if (AnyExt && EltBits == 16 && Scale > 2) {
12204 int PSHUFDMask[4] = {Offset / 2, -1,
12205 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
12206 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
12207 DAG.getBitcast(MVT::v4i32, InputV),
12208 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
12209 int PSHUFWMask[4] = {1, -1, -1, -1};
12210 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
12211 return DAG.getBitcast(
12212 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
12213 DAG.getBitcast(MVT::v8i16, InputV),
12214 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
12215 }
12216
12217 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
12218 // to 64-bits.
12219 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
12220 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
12221 assert(VT.is128BitVector() && "Unexpected vector width!");
12222
12223 int LoIdx = Offset * EltBits;
12224 SDValue Lo = DAG.getBitcast(
12225 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12226 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12227 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
12228
12229 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
12230 return DAG.getBitcast(VT, Lo);
12231
12232 int HiIdx = (Offset + 1) * EltBits;
12233 SDValue Hi = DAG.getBitcast(
12234 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
12235 DAG.getTargetConstant(EltBits, DL, MVT::i8),
12236 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
12237 return DAG.getBitcast(VT,
12238 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
12239 }
12240
12241 // If this would require more than 2 unpack instructions to expand, use
12242 // pshufb when available. We can only use more than 2 unpack instructions
12243 // when zero extending i8 elements which also makes it easier to use pshufb.
12244 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
12245 assert(NumElements == 16 && "Unexpected byte vector width!");
12246 SDValue PSHUFBMask[16];
12247 for (int i = 0; i < 16; ++i) {
12248 int Idx = Offset + (i / Scale);
12249 if ((i % Scale == 0 && SafeOffset(Idx))) {
12250 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
12251 continue;
12252 }
12253 PSHUFBMask[i] =
12254 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
12255 }
12256 InputV = DAG.getBitcast(MVT::v16i8, InputV);
12257 return DAG.getBitcast(
12258 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
12259 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
12260 }
12261
12262 // If we are extending from an offset, ensure we start on a boundary that
12263 // we can unpack from.
12264 int AlignToUnpack = Offset % (NumElements / Scale);
12265 if (AlignToUnpack) {
12266 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
12267 for (int i = AlignToUnpack; i < NumElements; ++i)
12268 ShMask[i - AlignToUnpack] = i;
12269 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
12270 Offset -= AlignToUnpack;
12271 }
12272
12273 // Otherwise emit a sequence of unpacks.
12274 do {
12275 unsigned UnpackLoHi = X86ISD::UNPCKL;
12276 if (Offset >= (NumElements / 2)) {
12277 UnpackLoHi = X86ISD::UNPCKH;
12278 Offset -= (NumElements / 2);
12279 }
12280
12281 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
12282 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
12283 : getZeroVector(InputVT, Subtarget, DAG, DL);
12284 InputV = DAG.getBitcast(InputVT, InputV);
12285 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
12286 Scale /= 2;
12287 EltBits *= 2;
12288 NumElements /= 2;
12289 } while (Scale > 1);
12290 return DAG.getBitcast(VT, InputV);
12291}
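// Illustrative example of the unpack fallback above: zero-extending the low
// four i8 elements of a v16i8 to i32 without SSE4.1 (Scale == 4, Offset == 0)
// takes two trips through the loop, each unpacking against a zero vector:
//   punpcklbw InputV, zero   ; i8  -> i16 elements
//   punpcklwd InputV, zero   ; i16 -> i32 elements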
12292
12293/// Try to lower a vector shuffle as a zero extension on any microarch.
12294///
12295/// This routine will try to do everything in its power to cleverly lower
12296/// a shuffle which happens to match the pattern of a zero extend. It doesn't
12297/// check for the profitability of this lowering, it tries to aggressively
12298/// match this pattern. It will use all of the micro-architectural details it
12299/// can to emit an efficient lowering. It handles both blends with all-zero
12300/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
12301/// masking out later).
12302///
12303/// The reason we have dedicated lowering for zext-style shuffles is that they
12304/// are both incredibly common and often quite performance sensitive.
12305 static SDValue lowerShuffleAsZeroOrAnyExtend(
12306     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12307 const APInt &Zeroable, const X86Subtarget &Subtarget,
12308 SelectionDAG &DAG) {
12309 int Bits = VT.getSizeInBits();
12310 int NumLanes = Bits / 128;
12311 int NumElements = VT.getVectorNumElements();
12312 int NumEltsPerLane = NumElements / NumLanes;
12313 assert(VT.getScalarSizeInBits() <= 32 &&
12314 "Exceeds 32-bit integer zero extension limit");
12315 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
12316
12317 // Define a helper function to check a particular ext-scale and lower to it if
12318 // valid.
12319 auto Lower = [&](int Scale) -> SDValue {
12320 SDValue InputV;
12321 bool AnyExt = true;
12322 int Offset = 0;
12323 int Matches = 0;
12324 for (int i = 0; i < NumElements; ++i) {
12325 int M = Mask[i];
12326 if (M < 0)
12327 continue; // Valid anywhere but doesn't tell us anything.
12328 if (i % Scale != 0) {
12329         // Each of the extended elements needs to be zeroable.
12330 if (!Zeroable[i])
12331 return SDValue();
12332
12333 // We no longer are in the anyext case.
12334 AnyExt = false;
12335 continue;
12336 }
12337
12338 // Each of the base elements needs to be consecutive indices into the
12339 // same input vector.
12340 SDValue V = M < NumElements ? V1 : V2;
12341 M = M % NumElements;
12342 if (!InputV) {
12343 InputV = V;
12344 Offset = M - (i / Scale);
12345 } else if (InputV != V)
12346 return SDValue(); // Flip-flopping inputs.
12347
12348 // Offset must start in the lowest 128-bit lane or at the start of an
12349 // upper lane.
12350 // FIXME: Is it ever worth allowing a negative base offset?
12351 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
12352 (Offset % NumEltsPerLane) == 0))
12353 return SDValue();
12354
12355 // If we are offsetting, all referenced entries must come from the same
12356 // lane.
12357 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
12358 return SDValue();
12359
12360 if ((M % NumElements) != (Offset + (i / Scale)))
12361 return SDValue(); // Non-consecutive strided elements.
12362 Matches++;
12363 }
12364
12365 // If we fail to find an input, we have a zero-shuffle which should always
12366 // have already been handled.
12367 // FIXME: Maybe handle this here in case during blending we end up with one?
12368 if (!InputV)
12369 return SDValue();
12370
12371 // If we are offsetting, don't extend if we only match a single input, we
12372 // can always do better by using a basic PSHUF or PUNPCK.
12373 if (Offset != 0 && Matches < 2)
12374 return SDValue();
12375
12376 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
12377 InputV, Mask, Subtarget, DAG);
12378 };
12379
12380 // The widest scale possible for extending is to a 64-bit integer.
12381 assert(Bits % 64 == 0 &&
12382 "The number of bits in a vector must be divisible by 64 on x86!");
12383 int NumExtElements = Bits / 64;
12384
12385 // Each iteration, try extending the elements half as much, but into twice as
12386 // many elements.
12387 for (; NumExtElements < NumElements; NumExtElements *= 2) {
12388 assert(NumElements % NumExtElements == 0 &&
12389 "The input vector size must be divisible by the extended size.");
12390 if (SDValue V = Lower(NumElements / NumExtElements))
12391 return V;
12392 }
12393
12394 // General extends failed, but 128-bit vectors may be able to use MOVQ.
12395 if (Bits != 128)
12396 return SDValue();
12397
12398 // Returns one of the source operands if the shuffle can be reduced to a
12399 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
12400 auto CanZExtLowHalf = [&]() {
12401 for (int i = NumElements / 2; i != NumElements; ++i)
12402 if (!Zeroable[i])
12403 return SDValue();
12404 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
12405 return V1;
12406 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
12407 return V2;
12408 return SDValue();
12409 };
12410
12411 if (SDValue V = CanZExtLowHalf()) {
12412 V = DAG.getBitcast(MVT::v2i64, V);
12413 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
12414 return DAG.getBitcast(VT, V);
12415 }
12416
12417 // No viable ext lowering found.
12418 return SDValue();
12419}
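// Illustrative example of the MOVQ special case above: a v4i32 shuffle with
// mask [0, 1, Z, Z] (upper half zeroable, lower half sequential from V1) is
// lowered as a bitcast to v2i64 followed by X86ISD::VZEXT_MOVL, i.e. a movq
// that copies the low 64 bits and zeroes the upper 64 bits.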
12420
12421/// Try to get a scalar value for a specific element of a vector.
12422///
12423/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
12424 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
12425                                               SelectionDAG &DAG) {
12426 MVT VT = V.getSimpleValueType();
12427 MVT EltVT = VT.getVectorElementType();
12428 V = peekThroughBitcasts(V);
12429
12430 // If the bitcasts shift the element size, we can't extract an equivalent
12431 // element from it.
12432 MVT NewVT = V.getSimpleValueType();
12433 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
12434 return SDValue();
12435
12436 if (V.getOpcode() == ISD::BUILD_VECTOR ||
12437 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
12438 // Ensure the scalar operand is the same size as the destination.
12439 // FIXME: Add support for scalar truncation where possible.
12440 SDValue S = V.getOperand(Idx);
12441 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
12442 return DAG.getBitcast(EltVT, S);
12443 }
12444
12445 return SDValue();
12446}
12447
12448/// Helper to test for a load that can be folded with x86 shuffles.
12449///
12450/// This is particularly important because the set of instructions varies
12451/// significantly based on whether the operand is a load or not.
12452 static bool isShuffleFoldableLoad(SDValue V) {
12453   return V->hasOneUse() &&
12454          ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
12455 }
12456
12457template<typename T>
12458static bool isSoftF16(T VT, const X86Subtarget &Subtarget) {
12459 T EltVT = VT.getScalarType();
12460 return (EltVT == MVT::bf16 && !Subtarget.hasAVX10_2()) ||
12461 (EltVT == MVT::f16 && !Subtarget.hasFP16());
12462}
12463
12464/// Try to lower insertion of a single element into a zero vector.
12465///
12466 /// This is a common pattern for which we have especially efficient patterns
12467 /// to lower across all subtarget feature sets.
12468 static SDValue lowerShuffleAsElementInsertion(
12469     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12470 const APInt &Zeroable, const X86Subtarget &Subtarget,
12471 SelectionDAG &DAG) {
12472 MVT ExtVT = VT;
12473 MVT EltVT = VT.getVectorElementType();
12474 unsigned NumElts = VT.getVectorNumElements();
12475 unsigned EltBits = VT.getScalarSizeInBits();
12476
12477 if (isSoftF16(EltVT, Subtarget))
12478 return SDValue();
12479
12480 int V2Index =
12481 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
12482 Mask.begin();
12483 bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
12484 bool IsV1Zeroable = true;
12485 for (int i = 0, Size = Mask.size(); i < Size; ++i)
12486 if (i != V2Index && !Zeroable[i]) {
12487 IsV1Zeroable = false;
12488 break;
12489 }
12490
12491 // Bail if a non-zero V1 isn't used in place.
12492 if (!IsV1Zeroable) {
12493 SmallVector<int, 8> V1Mask(Mask);
12494 V1Mask[V2Index] = -1;
12495 if (!isNoopShuffleMask(V1Mask))
12496 return SDValue();
12497 }
12498
12499 // Check for a single input from a SCALAR_TO_VECTOR node.
12500 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
12501 // all the smarts here sunk into that routine. However, the current
12502 // lowering of BUILD_VECTOR makes that nearly impossible until the old
12503 // vector shuffle lowering is dead.
12504 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
12505 DAG);
12506 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
12507 // We need to zext the scalar if it is smaller than an i32.
12508 V2S = DAG.getBitcast(EltVT, V2S);
12509 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
12510 // Using zext to expand a narrow element won't work for non-zero
12511 // insertions. But we can use a masked constant vector if we're
12512 // inserting V2 into the bottom of V1.
12513 if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
12514 return SDValue();
12515
12516 // Zero-extend directly to i32.
12517 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
12518 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
12519
12520 // If we're inserting into a constant, mask off the inserted index
12521 // and OR with the zero-extended scalar.
12522 if (!IsV1Zeroable) {
12523 SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
12524 Bits[V2Index] = APInt::getZero(EltBits);
12525 SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
12526 V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
12527 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12528 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
12529 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12530 }
12531 }
12532 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
12533 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
12534 (EltVT == MVT::i16 && !Subtarget.hasAVX10_2())) {
12535 // Either not inserting from the low element of the input or the input
12536 // element size is too small to use VZEXT_MOVL to clear the high bits.
12537 return SDValue();
12538 }
12539
12540 if (!IsV1Zeroable) {
12541 // If V1 can't be treated as a zero vector we have fewer options to lower
12542 // this. We can't support integer vectors or non-zero targets cheaply.
12543 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
12544 if (!VT.isFloatingPoint() || V2Index != 0)
12545 return SDValue();
12546 if (!VT.is128BitVector())
12547 return SDValue();
12548
12549 // Otherwise, use MOVSD, MOVSS or MOVSH.
12550 unsigned MovOpc = 0;
12551 if (EltVT == MVT::f16)
12552 MovOpc = X86ISD::MOVSH;
12553 else if (EltVT == MVT::f32)
12554 MovOpc = X86ISD::MOVSS;
12555 else if (EltVT == MVT::f64)
12556 MovOpc = X86ISD::MOVSD;
12557 else
12558 llvm_unreachable("Unsupported floating point element type to handle!");
12559 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
12560 }
12561
12562 // This lowering only works for the low element with floating point vectors.
12563 if (VT.isFloatingPoint() && V2Index != 0)
12564 return SDValue();
12565
12566 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
12567 if (ExtVT != VT)
12568 V2 = DAG.getBitcast(VT, V2);
12569
12570 if (V2Index != 0) {
12571 // If we have 4 or fewer lanes we can cheaply shuffle the element into
12572 // the desired position. Otherwise it is more efficient to do a vector
12573 // shift left. We know that we can do a vector shift left because all
12574 // the inputs are zero.
12575 if (VT.isFloatingPoint() || NumElts <= 4) {
12576 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
12577 V2Shuffle[V2Index] = 0;
12578 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
12579 } else {
12580 V2 = DAG.getBitcast(MVT::v16i8, V2);
12581 V2 = DAG.getNode(
12582 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
12583 DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
12584 V2 = DAG.getBitcast(VT, V2);
12585 }
12586 }
12587 return V2;
12588}
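// Illustrative example: for a v4i32 shuffle with mask [4, Z, Z, Z], assuming
// V2 is a SCALAR_TO_VECTOR node, the scalar is re-wrapped with
// ISD::SCALAR_TO_VECTOR and cleared with X86ISD::VZEXT_MOVL, so the whole
// shuffle becomes a single movd/vmovd that zero-extends the scalar into the
// vector. If instead V2Index were 2 on a v16i8 type, the element would be
// moved into place with a pslldq of V2Index * EltBits / 8 bytes.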
12589
12590/// Try to lower broadcast of a single - truncated - integer element,
12591/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
12592///
12593/// This assumes we have AVX2.
12594 static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
12595                                             int BroadcastIdx,
12596 const X86Subtarget &Subtarget,
12597 SelectionDAG &DAG) {
12598 assert(Subtarget.hasAVX2() &&
12599 "We can only lower integer broadcasts with AVX2!");
12600
12601 MVT EltVT = VT.getVectorElementType();
12602 MVT V0VT = V0.getSimpleValueType();
12603
12604 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
12605 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
12606
12607 MVT V0EltVT = V0VT.getVectorElementType();
12608 if (!V0EltVT.isInteger())
12609 return SDValue();
12610
12611 const unsigned EltSize = EltVT.getSizeInBits();
12612 const unsigned V0EltSize = V0EltVT.getSizeInBits();
12613
12614 // This is only a truncation if the original element type is larger.
12615 if (V0EltSize <= EltSize)
12616 return SDValue();
12617
12618 assert(((V0EltSize % EltSize) == 0) &&
12619 "Scalar type sizes must all be powers of 2 on x86!");
12620
12621 const unsigned V0Opc = V0.getOpcode();
12622 const unsigned Scale = V0EltSize / EltSize;
12623 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
12624
12625 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
12626 V0Opc != ISD::BUILD_VECTOR)
12627 return SDValue();
12628
12629 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
12630
12631 // If we're extracting non-least-significant bits, shift so we can truncate.
12632 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
12633 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
12634 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
12635 if (const int OffsetIdx = BroadcastIdx % Scale)
12636 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
12637 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
12638
12639 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
12640 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
12641}
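// Illustrative example: broadcasting byte 2 of an i32 wrapped in a
// scalar_to_vector (VT == v16i8, BroadcastIdx == 2) gives Scale == 4 and
// OffsetIdx == 2, so the scalar is shifted right by 2 * 8 = 16 bits,
// truncated to i8, and then broadcast with X86ISD::VBROADCAST.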
12642
12643/// Test whether this can be lowered with a single SHUFPS instruction.
12644///
12645/// This is used to disable more specialized lowerings when the shufps lowering
12646/// will happen to be efficient.
12647 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
12648   // This routine only handles 128-bit shufps.
12649 assert(Mask.size() == 4 && "Unsupported mask size!");
12650 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
12651 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
12652 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
12653 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
12654
12655 // To lower with a single SHUFPS we need to have the low half and high half
12656 // each requiring a single input.
12657 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
12658 return false;
12659 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
12660 return false;
12661
12662 return true;
12663}
12664
12665/// Test whether the specified input (0 or 1) is in-place blended by the
12666/// given mask.
12667///
12668/// This returns true if the elements from a particular input are already in the
12669/// slot required by the given mask and require no permutation.
12670static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12671 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12672 int Size = Mask.size();
12673 for (int i = 0; i < Size; ++i)
12674 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12675 return false;
12676
12677 return true;
12678}
12679
12680/// If we are extracting two 128-bit halves of a vector and shuffling the
12681/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
12682/// multi-shuffle lowering.
12683 static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
12684                                              SDValue N1, ArrayRef<int> Mask,
12685 SelectionDAG &DAG) {
12686 MVT VT = N0.getSimpleValueType();
12687 assert((VT.is128BitVector() &&
12688 (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
12689 "VPERM* family of shuffles requires 32-bit or 64-bit elements");
12690
12691 // Check that both sources are extracts of the same source vector.
12692 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12693       N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
12694       N0.getOperand(0) != N1.getOperand(0) ||
12695 !N0.hasOneUse() || !N1.hasOneUse())
12696 return SDValue();
12697
12698 SDValue WideVec = N0.getOperand(0);
12699 MVT WideVT = WideVec.getSimpleValueType();
12700 if (!WideVT.is256BitVector())
12701 return SDValue();
12702
12703 // Match extracts of each half of the wide source vector. Commute the shuffle
12704 // if the extract of the low half is N1.
12705 unsigned NumElts = VT.getVectorNumElements();
12706 SmallVector<int, 4> NewMask(Mask);
12707 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
12708 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
12709 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
12710     ShuffleVectorSDNode::commuteMask(NewMask);
12711   else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
12712 return SDValue();
12713
12714 // Final bailout: if the mask is simple, we are better off using an extract
12715 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
12716 // because that avoids a constant load from memory.
12717 if (NumElts == 4 &&
12718 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
12719 return SDValue();
12720
12721 // Extend the shuffle mask with undef elements.
12722 NewMask.append(NumElts, -1);
12723
12724 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
12725 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
12726 NewMask);
12727 // This is free: ymm -> xmm.
12728 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
12729 DAG.getIntPtrConstant(0, DL));
12730}
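// Illustrative example: with X a v8f32 value, shuffling
// extract_subvector(X, 0) and extract_subvector(X, 4) with the mask
// [0, 7, 2, 5] becomes a shuffle of X itself with mask
// [0, 7, 2, 5, u, u, u, u] followed by a free ymm -> xmm extract, i.e. a
// single cross-lane vpermps instead of two extracts plus extra shuffles.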
12731
12732/// Try to lower broadcast of a single element.
12733///
12734/// For convenience, this code also bundles all of the subtarget feature set
12735/// filtering. While a little annoying to re-dispatch on type here, there isn't
12736/// a convenient way to factor it out.
12737 static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
12738                                        SDValue V2, ArrayRef<int> Mask,
12739 const X86Subtarget &Subtarget,
12740 SelectionDAG &DAG) {
12741 MVT EltVT = VT.getVectorElementType();
12742 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
12743 (Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
12744 (Subtarget.hasAVX2() && (VT.isInteger() || EltVT == MVT::f16))))
12745 return SDValue();
12746
12747 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
12748 // we can only broadcast from a register with AVX2.
12749 unsigned NumEltBits = VT.getScalarSizeInBits();
12750 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
12751                         ? X86ISD::MOVDDUP
12752                         : X86ISD::VBROADCAST;
12753   bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
12754
12755 // Check that the mask is a broadcast.
12756 int BroadcastIdx = getSplatIndex(Mask);
12757 if (BroadcastIdx < 0)
12758 return SDValue();
12759 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
12760 "a sorted mask where the broadcast "
12761 "comes from V1.");
12762 int NumActiveElts = count_if(Mask, [](int M) { return M >= 0; });
12763
12764 // Go up the chain of (vector) values to find a scalar load that we can
12765 // combine with the broadcast.
12766 // TODO: Combine this logic with findEltLoadSrc() used by
12767 // EltsFromConsecutiveLoads().
12768 int BitOffset = BroadcastIdx * NumEltBits;
12769 SDValue V = V1;
12770 for (;;) {
12771 switch (V.getOpcode()) {
12772 case ISD::BITCAST: {
12773 V = V.getOperand(0);
12774 continue;
12775 }
12776 case ISD::CONCAT_VECTORS: {
12777 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
12778 int OpIdx = BitOffset / OpBitWidth;
12779 V = V.getOperand(OpIdx);
12780 BitOffset %= OpBitWidth;
12781 continue;
12782 }
12783     case ISD::EXTRACT_SUBVECTOR: {
12784       // The extraction index adds to the existing offset.
12785 unsigned EltBitWidth = V.getScalarValueSizeInBits();
12786 unsigned Idx = V.getConstantOperandVal(1);
12787 unsigned BeginOffset = Idx * EltBitWidth;
12788 BitOffset += BeginOffset;
12789 V = V.getOperand(0);
12790 continue;
12791 }
12792 case ISD::INSERT_SUBVECTOR: {
12793 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
12794 int EltBitWidth = VOuter.getScalarValueSizeInBits();
12795 int Idx = (int)V.getConstantOperandVal(2);
12796 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
12797 int BeginOffset = Idx * EltBitWidth;
12798 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
12799 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
12800 BitOffset -= BeginOffset;
12801 V = VInner;
12802 } else {
12803 V = VOuter;
12804 }
12805 continue;
12806 }
12807 }
12808 break;
12809 }
12810 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
12811 BroadcastIdx = BitOffset / NumEltBits;
12812
12813 // Do we need to bitcast the source to retrieve the original broadcast index?
12814 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
12815
12816 // Check if this is a broadcast of a scalar. We special case lowering
12817 // for scalars so that we can more effectively fold with loads.
12818 // If the original value has a larger element type than the shuffle, the
12819 // broadcast element is in essence truncated. Make that explicit to ease
12820 // folding.
12821 if (BitCastSrc && VT.isInteger())
12822 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
12823 DL, VT, V, BroadcastIdx, Subtarget, DAG))
12824 return TruncBroadcast;
12825
12826 // Also check the simpler case, where we can directly reuse the scalar.
12827 if (!BitCastSrc &&
12828 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
12829 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
12830 V = V.getOperand(BroadcastIdx);
12831
12832 // If we can't broadcast from a register, check that the input is a load.
12833 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
12834 return SDValue();
12835 } else if (ISD::isNormalLoad(V.getNode()) &&
12836 cast<LoadSDNode>(V)->isSimple()) {
12837 // We do not check for one-use of the vector load because a broadcast load
12838 // is expected to be a win for code size, register pressure, and possibly
12839 // uops even if the original vector load is not eliminated.
12840
12841 // Reduce the vector load and shuffle to a broadcasted scalar load.
12842 LoadSDNode *Ld = cast<LoadSDNode>(V);
12843 SDValue BaseAddr = Ld->getOperand(1);
12844 MVT SVT = VT.getScalarType();
12845 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
12846 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
12847 SDValue NewAddr =
12848         DAG.getMemBasePlusOffset(BaseAddr, TypeSize::getFixed(Offset), DL);
12849
12850 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
12851 // than MOVDDUP.
12852 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
12853 if (Opcode == X86ISD::VBROADCAST) {
12854 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
12855 SDValue Ops[] = {Ld->getChain(), NewAddr};
12856 V = DAG.getMemIntrinsicNode(
12857 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
12858           DAG.getMachineFunction().getMachineMemOperand(
12859               Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12860       DAG.makeEquivalentMemoryOrdering(Ld, V);
12861       return DAG.getBitcast(VT, V);
12862 }
12863 assert(SVT == MVT::f64 && "Unexpected VT!");
12864 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
12865                     DAG.getMachineFunction().getMachineMemOperand(
12866                         Ld->getMemOperand(), Offset, SVT.getStoreSize()));
12867     DAG.makeEquivalentMemoryOrdering(Ld, V);
12868   } else if (!BroadcastFromReg) {
12869 // We can't broadcast from a vector register.
12870 return SDValue();
12871 } else if (BitOffset != 0) {
12872 // We can only broadcast from the zero-element of a vector register,
12873 // but it can be advantageous to broadcast from the zero-element of a
12874 // subvector.
12875 if (!VT.is256BitVector() && !VT.is512BitVector())
12876 return SDValue();
12877
12878 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
12879 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12880 return SDValue();
12881
12882 // If we are broadcasting an element from the lowest 128-bit subvector, try
12883 // to move the element in position.
12884 if (BitOffset < 128 && NumActiveElts > 1 &&
12885 V.getScalarValueSizeInBits() == NumEltBits) {
12886 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12887 "Unexpected bit-offset");
12888 SmallVector<int, 16> ExtractMask(128 / NumEltBits, SM_SentinelUndef);
12889 ExtractMask[0] = BitOffset / V.getScalarValueSizeInBits();
12890 V = extractSubVector(V, 0, DAG, DL, 128);
12891 V = DAG.getVectorShuffle(V.getValueType(), DL, V, V, ExtractMask);
12892 } else {
12893 // Only broadcast the zero-element of a 128-bit subvector.
12894 if ((BitOffset % 128) != 0)
12895 return SDValue();
12896
12897 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
12898 "Unexpected bit-offset");
12899 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
12900 "Unexpected vector size");
12901 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
12902 V = extract128BitVector(V, ExtractIdx, DAG, DL);
12903 }
12904 }
12905
12906 // On AVX we can use VBROADCAST directly for scalar sources.
12907 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
12908 V = DAG.getBitcast(MVT::f64, V);
12909 if (Subtarget.hasAVX()) {
12910 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
12911 return DAG.getBitcast(VT, V);
12912 }
12913 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
12914 }
12915
12916 // If this is a scalar, do the broadcast on this type and bitcast.
12917 if (!V.getValueType().isVector()) {
12918 assert(V.getScalarValueSizeInBits() == NumEltBits &&
12919 "Unexpected scalar size");
12920 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
12921                                        VT.getVectorNumElements());
12922     return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
12923 }
12924
12925 // We only support broadcasting from 128-bit vectors to minimize the
12926 // number of patterns we need to deal with in isel. So extract down to
12927 // 128-bits, removing as many bitcasts as possible.
12928 if (V.getValueSizeInBits() > 128)
12929     V = extract128BitVector(V, 0, DAG, DL);
12930
12931 // Otherwise cast V to a vector with the same element type as VT, but
12932 // possibly narrower than VT. Then perform the broadcast.
12933 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
12934 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
12935 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
12936}
12937
12938// Check for whether we can use INSERTPS to perform the shuffle. We only use
12939// INSERTPS when the V1 elements are already in the correct locations
12940// because otherwise we can just always use two SHUFPS instructions which
12941// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
12942// perform INSERTPS if a single V1 element is out of place and all V2
12943// elements are zeroable.
12944 static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
12945                                    unsigned &InsertPSMask,
12946 const APInt &Zeroable,
12947 ArrayRef<int> Mask, SelectionDAG &DAG) {
12948 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
12949 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
12950 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12951
12952 // Attempt to match INSERTPS with one element from VA or VB being
12953 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
12954 // are updated.
12955 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
12956 ArrayRef<int> CandidateMask) {
12957 unsigned ZMask = 0;
12958 int VADstIndex = -1;
12959 int VBDstIndex = -1;
12960 bool VAUsedInPlace = false;
12961
12962 for (int i = 0; i < 4; ++i) {
12963 // Synthesize a zero mask from the zeroable elements (includes undefs).
12964 if (Zeroable[i]) {
12965 ZMask |= 1 << i;
12966 continue;
12967 }
12968
12969 // Flag if we use any VA inputs in place.
12970 if (i == CandidateMask[i]) {
12971 VAUsedInPlace = true;
12972 continue;
12973 }
12974
12975 // We can only insert a single non-zeroable element.
12976 if (VADstIndex >= 0 || VBDstIndex >= 0)
12977 return false;
12978
12979 if (CandidateMask[i] < 4) {
12980 // VA input out of place for insertion.
12981 VADstIndex = i;
12982 } else {
12983 // VB input for insertion.
12984 VBDstIndex = i;
12985 }
12986 }
12987
12988 // Don't bother if we have no (non-zeroable) element for insertion.
12989 if (VADstIndex < 0 && VBDstIndex < 0)
12990 return false;
12991
12992 // Determine element insertion src/dst indices. The src index is from the
12993 // start of the inserted vector, not the start of the concatenated vector.
12994 unsigned VBSrcIndex = 0;
12995 if (VADstIndex >= 0) {
12996 // If we have a VA input out of place, we use VA as the V2 element
12997 // insertion and don't use the original V2 at all.
12998 VBSrcIndex = CandidateMask[VADstIndex];
12999 VBDstIndex = VADstIndex;
13000 VB = VA;
13001 } else {
13002 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
13003 }
13004
13005 // If no V1 inputs are used in place, then the result is created only from
13006 // the zero mask and the V2 insertion - so remove V1 dependency.
13007 if (!VAUsedInPlace)
13008 VA = DAG.getUNDEF(MVT::v4f32);
13009
13010 // Update V1, V2 and InsertPSMask accordingly.
13011 V1 = VA;
13012 V2 = VB;
13013
13014 // Insert the V2 element into the desired position.
13015 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
13016 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
13017 return true;
13018 };
13019
13020 if (matchAsInsertPS(V1, V2, Mask))
13021 return true;
13022
13023 // Commute and try again.
13024 SmallVector<int, 4> CommutedMask(Mask);
13025   ShuffleVectorSDNode::commuteMask(CommutedMask);
13026   if (matchAsInsertPS(V2, V1, CommutedMask))
13027 return true;
13028
13029 return false;
13030}
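// Illustrative example of the immediate encoding above: inserting V2[1] into
// lane 2 of V1 while zeroing lane 3 (mask [0, 1, 5, Z]) gives ZMask = 0b1000,
// VBDstIndex = 2 and VBSrcIndex = 1, so
//   InsertPSMask = (1 << 6) | (2 << 4) | 0x8 = 0x68.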
13031
13032 static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
13033                                       ArrayRef<int> Mask, const APInt &Zeroable,
13034 SelectionDAG &DAG) {
13035 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13036 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13037
13038 // Attempt to match the insertps pattern.
13039 unsigned InsertPSMask = 0;
13040 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
13041 return SDValue();
13042
13043 // Insert the V2 element into the desired position.
13044 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
13045 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
13046}
13047
13048/// Handle lowering of 2-lane 64-bit floating point shuffles.
13049///
13050/// This is the basis function for the 2-lane 64-bit shuffles as we have full
13051/// support for floating point shuffles but not integer shuffles. These
13052/// instructions will incur a domain crossing penalty on some chips though so
13053/// it is better to avoid lowering through this for integer vectors where
13054/// possible.
13055 static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13056                                  const APInt &Zeroable, SDValue V1, SDValue V2,
13057 const X86Subtarget &Subtarget,
13058 SelectionDAG &DAG) {
13059 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13060 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
13061 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13062
13063 if (V2.isUndef()) {
13064 // Check for being able to broadcast a single element.
13065 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
13066 Mask, Subtarget, DAG))
13067 return Broadcast;
13068
13069 // Straight shuffle of a single input vector. Simulate this by using the
13070     // single input as both of the "inputs" to this instruction.
13071 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
13072
13073 if (Subtarget.hasAVX()) {
13074 // If we have AVX, we can use VPERMILPS which will allow folding a load
13075 // into the shuffle.
13076 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
13077 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13078 }
13079
13080 return DAG.getNode(
13081 X86ISD::SHUFP, DL, MVT::v2f64,
13082 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13083 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
13084 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13085 }
13086 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13087 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
13088 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13089 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13090
13091 if (Subtarget.hasAVX2())
13092 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13093 return Extract;
13094
13095 // When loading a scalar and then shuffling it into a vector we can often do
13096 // the insertion cheaply.
13097   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13098           DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13099 return Insertion;
13100 // Try inverting the insertion since for v2 masks it is easy to do and we
13101 // can't reliably sort the mask one way or the other.
13102 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
13103 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
13104   if (SDValue Insertion = lowerShuffleAsElementInsertion(
13105           DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13106 return Insertion;
13107
13108 // Try to use one of the special instruction patterns to handle two common
13109 // blend patterns if a zero-blend above didn't work.
13110 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
13111 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
13112 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
13113 // We can either use a special instruction to load over the low double or
13114 // to move just the low double.
13115 return DAG.getNode(
13116 X86ISD::MOVSD, DL, MVT::v2f64, V2,
13117 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
13118
13119 if (Subtarget.hasSSE41())
13120 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
13121 Zeroable, Subtarget, DAG))
13122 return Blend;
13123
13124 // Use dedicated unpack instructions for masks that match their pattern.
13125 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, V1, V2, Mask, DAG))
13126 return V;
13127
13128 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
13129 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
13130 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
13131}
13132
13133/// Handle lowering of 2-lane 64-bit integer shuffles.
13134///
13135/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
13136/// the integer unit to minimize domain crossing penalties. However, for blends
13137/// it falls back to the floating point shuffle operation with appropriate bit
13138/// casting.
13139 static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13140 const APInt &Zeroable, SDValue V1, SDValue V2,
13141 const X86Subtarget &Subtarget,
13142 SelectionDAG &DAG) {
13143 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13144 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
13145 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
13146
13147 if (V2.isUndef()) {
13148 // Check for being able to broadcast a single element.
13149 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
13150 Mask, Subtarget, DAG))
13151 return Broadcast;
13152
13153 // Straight shuffle of a single input vector. For everything from SSE2
13154 // onward this has a single fast instruction with no scary immediates.
13155 // We have to map the mask as it is actually a v4i32 shuffle instruction.
13156 V1 = DAG.getBitcast(MVT::v4i32, V1);
13157 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
13158 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
13159 Mask[1] < 0 ? -1 : (Mask[1] * 2),
13160 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
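// For example, a v2i64 Mask = [1, 0] widens to the v4i32 mask [2, 3, 0, 1],
// i.e. a PSHUFD with immediate 0x4E that swaps the two 64-bit halves.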
13161 return DAG.getBitcast(
13162 MVT::v2i64,
13163 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13164 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
13165 }
13166 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
13167 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
13168 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
13169 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
13170
13171 if (Subtarget.hasAVX2())
13172 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13173 return Extract;
13174
13175 // Try to use shift instructions.
13176 if (SDValue Shift =
13177 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
13178 DAG, /*BitwiseOnly*/ false))
13179 return Shift;
13180
13181 // When loading a scalar and then shuffling it into a vector we can often do
13182 // the insertion cheaply.
13183 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13184 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
13185 return Insertion;
13186 // Try inverting the insertion since for v2 masks it is easy to do and we
13187 // can't reliably sort the mask one way or the other.
13188 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
13189 if (SDValue Insertion = lowerShuffleAsElementInsertion(
13190 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
13191 return Insertion;
13192
13193 // We have different paths for blend lowering, but they all must use the
13194 // *exact* same predicate.
13195 bool IsBlendSupported = Subtarget.hasSSE41();
13196 if (IsBlendSupported)
13197 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
13198 Zeroable, Subtarget, DAG))
13199 return Blend;
13200
13201 // Use dedicated unpack instructions for masks that match their pattern.
13202 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, V1, V2, Mask, DAG))
13203 return V;
13204
13205 // Try to use byte rotation instructions.
13206 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13207 if (Subtarget.hasSSSE3()) {
13208 if (Subtarget.hasVLX())
13209 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
13210 Zeroable, Subtarget, DAG))
13211 return Rotate;
13212
13213 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
13214 Subtarget, DAG))
13215 return Rotate;
13216 }
13217
13218 // If we have direct support for blends, we should lower by decomposing into
13219 // a permute. That will be faster than the domain cross.
13220 if (IsBlendSupported)
13221 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
13222 Zeroable, Subtarget, DAG);
13223
13224 // We implement this with SHUFPD which is pretty lame because it will likely
13225 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
13226 // However, all the alternatives are still more cycles and newer chips don't
13227 // have this problem. It would be really nice if x86 had better shuffles here.
13228 V1 = DAG.getBitcast(MVT::v2f64, V1);
13229 V2 = DAG.getBitcast(MVT::v2f64, V2);
13230 return DAG.getBitcast(MVT::v2i64,
13231 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
13232}
13233
13234/// Lower a vector shuffle using the SHUFPS instruction.
13235///
13236/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
13237/// It makes no assumptions about whether this is the *best* lowering; it simply
13238/// uses it.
13239 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
13240 ArrayRef<int> Mask, SDValue V1,
13241 SDValue V2, SelectionDAG &DAG) {
13242 SDValue LowV = V1, HighV = V2;
13243 SmallVector<int, 4> NewMask(Mask);
13244 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13245
13246 if (NumV2Elements == 1) {
13247 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
13248
13249 // Compute the index adjacent to V2Index and in the same half by toggling
13250 // the low bit.
13251 int V2AdjIndex = V2Index ^ 1;
13252
13253 if (Mask[V2AdjIndex] < 0) {
13254 // Handles all the cases where we have a single V2 element and an undef.
13255 // This will only ever happen in the high lanes because we commute the
13256 // vector otherwise.
13257 if (V2Index < 2)
13258 std::swap(LowV, HighV);
13259 NewMask[V2Index] -= 4;
13260 } else {
13261 // Handle the case where the V2 element ends up adjacent to a V1 element.
13262 // To make this work, blend them together as the first step.
13263 int V1Index = V2AdjIndex;
13264 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
13265 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
13266 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13267
13268 // Now proceed to reconstruct the final blend as we have the necessary
13269 // high or low half formed.
13270 if (V2Index < 2) {
13271 LowV = V2;
13272 HighV = V1;
13273 } else {
13274 HighV = V2;
13275 }
13276 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
13277 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
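// Illustrative trace: for Mask = [0, 5, 2, 3], V2Index = 1 and its adjacent
// slot holds a V1 element, so we first form V2' = SHUFP(V2, V1, {1, 0, 0, 0})
// = <V2[1], V2[0], V1[0], V1[0]> and then emit SHUFP(V2', V1, {2, 0, 2, 3}) =
// <V1[0], V2[1], V1[2], V1[3]>, which matches the original mask.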
13278 }
13279 } else if (NumV2Elements == 2) {
13280 if (Mask[0] < 4 && Mask[1] < 4) {
13281 // Handle the easy case where we have V1 in the low lanes and V2 in the
13282 // high lanes.
13283 NewMask[2] -= 4;
13284 NewMask[3] -= 4;
13285 } else if (Mask[2] < 4 && Mask[3] < 4) {
13286 // We also handle the reversed case because this utility may get called
13287 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
13288 // arrange things in the right direction.
13289 NewMask[0] -= 4;
13290 NewMask[1] -= 4;
13291 HighV = V1;
13292 LowV = V2;
13293 } else {
13294 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
13295 // trying to place elements directly, just blend them and set up the final
13296 // shuffle to place them.
13297
13298 // The first two blend mask elements are for V1, the second two are for
13299 // V2.
13300 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
13301 Mask[2] < 4 ? Mask[2] : Mask[3],
13302 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
13303 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
13304 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13305 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
13306
13307 // Now we do a normal shuffle of V1 by giving V1 as both operands to
13308 // a blend.
13309 LowV = HighV = V1;
13310 NewMask[0] = Mask[0] < 4 ? 0 : 2;
13311 NewMask[1] = Mask[0] < 4 ? 2 : 0;
13312 NewMask[2] = Mask[2] < 4 ? 1 : 3;
13313 NewMask[3] = Mask[2] < 4 ? 3 : 1;
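// Illustrative trace: for Mask = [0, 4, 1, 5] the blend gives V1' =
// SHUFP(V1, V2, {0, 1, 0, 1}) = <V1[0], V1[1], V2[0], V2[1]>, and the final
// SHUFP(V1', V1', {0, 2, 1, 3}) produces <V1[0], V2[0], V1[1], V2[1]>.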
13314 }
13315 } else if (NumV2Elements == 3) {
13316 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
13317 // we can get here due to other paths (e.g. repeated mask matching) where we
13318 // don't want to do another round of lowerVECTOR_SHUFFLE.
13319 ShuffleVectorSDNode::commuteMask(NewMask);
13320 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
13321 }
13322 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
13323 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
13324}
13325
13326/// Lower 4-lane 32-bit floating point shuffles.
13327///
13328/// Uses instructions exclusively from the floating point unit to minimize
13329/// domain crossing penalties, as these are sufficient to implement all v4f32
13330/// shuffles.
13331 static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13332 const APInt &Zeroable, SDValue V1, SDValue V2,
13333 const X86Subtarget &Subtarget,
13334 SelectionDAG &DAG) {
13335 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13336 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
13337 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13338
13339 if (Subtarget.hasSSE41())
13340 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
13341 Zeroable, Subtarget, DAG))
13342 return Blend;
13343
13344 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13345
13346 if (NumV2Elements == 0) {
13347 // Check for being able to broadcast a single element.
13348 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
13349 Mask, Subtarget, DAG))
13350 return Broadcast;
13351
13352 // Use even/odd duplicate instructions for masks that match their pattern.
13353 if (Subtarget.hasSSE3()) {
13354 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
13355 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
13356 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
13357 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
13358 }
13359
13360 if (Subtarget.hasAVX()) {
13361 // If we have AVX, we can use VPERMILPS which will allow folding a load
13362 // into the shuffle.
13363 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
13364 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13365 }
13366
13367 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
13368 // in SSE1 because otherwise they are widened to v2f64 and never get here.
13369 if (!Subtarget.hasSSE2()) {
13370 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
13371 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
13372 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
13373 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
13374 }
13375
13376 // Otherwise, use a straight shuffle of a single input vector. We pass the
13377 // input vector to both operands to simulate this with a SHUFPS.
13378 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
13379 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13380 }
13381
13382 if (Subtarget.hasSSE2())
13383 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
13384 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
13385 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
13386 return ZExt;
13387 }
13388
13389 if (Subtarget.hasAVX2())
13390 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13391 return Extract;
13392
13393 // There are special ways we can lower some single-element blends. However, we
13394 // have custom ways we can lower more complex single-element blends below that
13395 // we defer to if both this and BLENDPS fail to match, so restrict this to
13396 // when the V2 input is targeting element 0 of the mask -- that is the fast
13397 // case here.
13398 if (NumV2Elements == 1 && Mask[0] >= 4)
13399 if (SDValue V = lowerShuffleAsElementInsertion(
13400 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13401 return V;
13402
13403 if (Subtarget.hasSSE41()) {
13404 // Use INSERTPS if we can complete the shuffle efficiently.
13405 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
13406 return V;
13407
13408 if (!isSingleSHUFPSMask(Mask))
13409 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
13410 V2, Mask, DAG))
13411 return BlendPerm;
13412 }
13413
13414 // Use low/high mov instructions. These are only valid in SSE1 because
13415 // otherwise they are widened to v2f64 and never get here.
13416 if (!Subtarget.hasSSE2()) {
13417 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
13418 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
13419 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
13420 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
13421 }
13422
13423 // Use dedicated unpack instructions for masks that match their pattern.
13424 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, V1, V2, Mask, DAG))
13425 return V;
13426
13427 // Otherwise fall back to a SHUFPS lowering strategy.
13428 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
13429}
13430
13431/// Lower 4-lane i32 vector shuffles.
13432///
13433/// We try to handle these with integer-domain shuffles where we can, but for
13434/// blends we use the floating point domain blend instructions.
13435 static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
13436 const APInt &Zeroable, SDValue V1, SDValue V2,
13437 const X86Subtarget &Subtarget,
13438 SelectionDAG &DAG) {
13439 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13440 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
13441 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13442
13443 // Whenever we can lower this as a zext, that instruction is strictly faster
13444 // than any alternative. It also allows us to fold memory operands into the
13445 // shuffle in many cases.
13446 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
13447 Zeroable, Subtarget, DAG))
13448 return ZExt;
13449
13450 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
13451
13452 // Try to use shift instructions if fast.
13453 if (Subtarget.preferLowerShuffleAsShift()) {
13454 if (SDValue Shift =
13455 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
13456 Subtarget, DAG, /*BitwiseOnly*/ true))
13457 return Shift;
13458 if (NumV2Elements == 0)
13459 if (SDValue Rotate =
13460 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
13461 return Rotate;
13462 }
13463
13464 if (NumV2Elements == 0) {
13465 // Try to use broadcast unless the mask only has one non-undef element.
13466 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
13467 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
13468 Mask, Subtarget, DAG))
13469 return Broadcast;
13470 }
13471
13472 // Straight shuffle of a single input vector. For everything from SSE2
13473 // onward this has a single fast instruction with no scary immediates.
13474 // We coerce the shuffle pattern to be compatible with UNPCK instructions
13475 // but we aren't actually going to use the UNPCK instruction because doing
13476 // so prevents folding a load into this instruction or making a copy.
13477 const int UnpackLoMask[] = {0, 0, 1, 1};
13478 const int UnpackHiMask[] = {2, 2, 3, 3};
13479 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
13480 Mask = UnpackLoMask;
13481 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
13482 Mask = UnpackHiMask;
13483
13484 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
13485 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13486 }
13487
13488 if (Subtarget.hasAVX2())
13489 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
13490 return Extract;
13491
13492 // Try to use shift instructions.
13493 if (SDValue Shift =
13494 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
13495 DAG, /*BitwiseOnly*/ false))
13496 return Shift;
13497
13498 // There are special ways we can lower some single-element blends.
13499 if (NumV2Elements == 1)
13500 if (SDValue V = lowerShuffleAsElementInsertion(
13501 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13502 return V;
13503
13504 // We have different paths for blend lowering, but they all must use the
13505 // *exact* same predicate.
13506 bool IsBlendSupported = Subtarget.hasSSE41();
13507 if (IsBlendSupported)
13508 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
13509 Zeroable, Subtarget, DAG))
13510 return Blend;
13511
13512 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
13513 Zeroable, Subtarget, DAG))
13514 return Masked;
13515
13516 // Use dedicated unpack instructions for masks that match their pattern.
13517 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, V1, V2, Mask, DAG))
13518 return V;
13519
13520 // Try to use byte rotation instructions.
13521 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
13522 if (Subtarget.hasSSSE3()) {
13523 if (Subtarget.hasVLX())
13524 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
13525 Zeroable, Subtarget, DAG))
13526 return Rotate;
13527
13528 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
13529 Subtarget, DAG))
13530 return Rotate;
13531 }
13532
13533 // Assume that a single SHUFPS is faster than an alternative sequence of
13534 // multiple instructions (even if the CPU has a domain penalty).
13535 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13536 if (!isSingleSHUFPSMask(Mask)) {
13537 // If we have direct support for blends, we should lower by decomposing into
13538 // a permute. That will be faster than the domain cross.
13539 if (IsBlendSupported)
13540 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
13541 Zeroable, Subtarget, DAG);
13542
13543 // Try to lower by permuting the inputs into an unpack instruction.
13544 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
13545 Mask, Subtarget, DAG))
13546 return Unpack;
13547 }
13548
13549 // We implement this with SHUFPS because it can blend from two vectors.
13550 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
13551 // up the inputs, bypassing domain shift penalties that we would incur if we
13552 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
13553 // relevant.
13554 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
13555 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
13556 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
13557 return DAG.getBitcast(MVT::v4i32, ShufPS);
13558}
13559
13560/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
13561/// shuffle lowering, and the most complex part.
13562///
13563/// The lowering strategy is to try to form pairs of input lanes which are
13564/// targeted at the same half of the final vector, and then use a dword shuffle
13565/// to place them onto the right half, and finally unpack the paired lanes into
13566/// their final position.
13567///
13568/// The exact breakdown of how to form these dword pairs and align them on the
13569/// correct sides is really tricky. See the comments within the function for
13570/// more of the details.
13571///
13572/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
13573/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
13574/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
13575/// vector, form the analogous 128-bit 8-element Mask.
13576 static SDValue lowerV8I16GeneralSingleInputShuffle(
13577 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
13578 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13579 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
13580 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
13581
13582 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
13583 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
13584 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
13585
13586 // Attempt to directly match PSHUFLW or PSHUFHW.
13587 if (isUndefOrInRange(LoMask, 0, 4) &&
13588 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
13589 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
13590 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
13591 }
13592 if (isUndefOrInRange(HiMask, 4, 8) &&
13593 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
13594 for (int i = 0; i != 4; ++i)
13595 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
13596 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
13597 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
13598 }
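// For example, Mask = [0, 1, 2, 3, 5, 4, 7, 6] matches the PSHUFHW case above:
// after removing the offset the high half becomes [1, 0, 3, 2], so a single
// PSHUFHW with that immediate suffices.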
13599
13600 SmallVector<int, 4> LoInputs;
13601 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
13602 array_pod_sort(LoInputs.begin(), LoInputs.end());
13603 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
13604 SmallVector<int, 4> HiInputs;
13605 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
13606 array_pod_sort(HiInputs.begin(), HiInputs.end());
13607 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
13608 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
13609 int NumHToL = LoInputs.size() - NumLToL;
13610 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
13611 int NumHToH = HiInputs.size() - NumLToH;
13612 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
13613 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
13614 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
13615 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
13616
13617 // If we are shuffling values from one half, check how many different DWORD
13618 // pairs we need to create. If only 1 or 2 then we can perform this as a
13619 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
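// For example, Mask = [2, 3, 2, 3, 0, 1, 0, 1] draws only on the low-half
// words and needs just two DWORD pairs, (2,3) and (0,1): a PSHUFLW reorders
// the low half to [2, 3, 0, 1] and a PSHUFD with mask [0, 0, 1, 1] then
// replicates the two dwords into place.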
13620 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
13621 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
13622 V = DAG.getNode(ShufWOp, DL, VT, V,
13623 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13624 V = DAG.getBitcast(PSHUFDVT, V);
13625 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
13626 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
13627 return DAG.getBitcast(VT, V);
13628 };
13629
13630 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
13631 int PSHUFDMask[4] = { -1, -1, -1, -1 };
13632 SmallVector<std::pair<int, int>, 4> DWordPairs;
13633 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
13634
13635 // Collect the different DWORD pairs.
13636 for (int DWord = 0; DWord != 4; ++DWord) {
13637 int M0 = Mask[2 * DWord + 0];
13638 int M1 = Mask[2 * DWord + 1];
13639 M0 = (M0 >= 0 ? M0 % 4 : M0);
13640 M1 = (M1 >= 0 ? M1 % 4 : M1);
13641 if (M0 < 0 && M1 < 0)
13642 continue;
13643
13644 bool Match = false;
13645 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
13646 auto &DWordPair = DWordPairs[j];
13647 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
13648 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
13649 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
13650 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
13651 PSHUFDMask[DWord] = DOffset + j;
13652 Match = true;
13653 break;
13654 }
13655 }
13656 if (!Match) {
13657 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
13658 DWordPairs.push_back(std::make_pair(M0, M1));
13659 }
13660 }
13661
13662 if (DWordPairs.size() <= 2) {
13663 DWordPairs.resize(2, std::make_pair(-1, -1));
13664 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
13665 DWordPairs[1].first, DWordPairs[1].second};
13666 if ((NumHToL + NumHToH) == 0)
13667 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
13668 if ((NumLToL + NumLToH) == 0)
13669 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
13670 }
13671 }
13672
13673 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
13674 // such inputs we can swap two of the dwords across the half mark and end up
13675 // with <= 2 inputs to each half from each half. Once there, we can fall through
13676 // to the generic code below. For example:
13677 //
13678 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13679 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
13680 //
13681 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
13682 // and an existing 2-into-2 on the other half. In this case we may have to
13683 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
13684 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
13685 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
13686 // because any other situation (including a 3-into-1 or 1-into-3 in the half
13687 // other than the one we target for fixing) will be fixed when we re-enter this
13688 // path. Any sequence of PSHUFD instructions that results will also be combined
13689 // into a single instruction. Here is an example of the tricky case:
13690 //
13691 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
13692 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
13693 //
13694 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
13695 //
13696 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
13697 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
13698 //
13699 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
13700 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
13701 //
13702 // The result is fine to be handled by the generic logic.
13703 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
13704 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
13705 int AOffset, int BOffset) {
13706 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
13707 "Must call this with A having 3 or 1 inputs from the A half.");
13708 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
13709 "Must call this with B having 1 or 3 inputs from the B half.");
13710 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
13711 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
13712
13713 bool ThreeAInputs = AToAInputs.size() == 3;
13714
13715 // Compute the index of dword with only one word among the three inputs in
13716 // a half by taking the sum of the half with three inputs and subtracting
13717 // the sum of the actual three inputs. The difference is the remaining
13718 // slot.
13719 int ADWord = 0, BDWord = 0;
13720 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
13721 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
13722 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
13723 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
13724 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
13725 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
13726 int TripleNonInputIdx =
13727 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
13728 TripleDWord = TripleNonInputIdx / 2;
13729
13730 // We use xor with one to compute the adjacent DWord to whichever one the
13731 // OneInput is in.
13732 OneInputDWord = (OneInput / 2) ^ 1;
13733
13734 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
13735 // and BToA inputs. If there is also such a problem with the BToB and AToB
13736 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
13737 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
13738 // is essential that we don't *create* a 3<-1 as then we might oscillate.
13739 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
13740 // Compute how many inputs will be flipped by swapping these DWords. We need
13741 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
13744 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
13745 llvm::count(AToBInputs, 2 * ADWord + 1);
13746 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
13747 llvm::count(BToBInputs, 2 * BDWord + 1);
13748 if ((NumFlippedAToBInputs == 1 &&
13749 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
13750 (NumFlippedBToBInputs == 1 &&
13751 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
13752 // We choose whether to fix the A half or B half based on whether that
13753 // half has zero flipped inputs. At zero, we may not be able to fix it
13754 // with that half. We also bias towards fixing the B half because that
13755 // will more commonly be the high half, and we have to bias one way.
13756 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
13757 ArrayRef<int> Inputs) {
13758 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
13759 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
13760 // Determine whether the free index is in the flipped dword or the
13761 // unflipped dword based on where the pinned index is. We use this bit
13762 // in an xor to conditionally select the adjacent dword.
13763 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
13764 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13765 if (IsFixIdxInput == IsFixFreeIdxInput)
13766 FixFreeIdx += 1;
13767 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
13768 assert(IsFixIdxInput != IsFixFreeIdxInput &&
13769 "We need to be changing the number of flipped inputs!");
13770 int PSHUFHalfMask[] = {0, 1, 2, 3};
13771 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
13772 V = DAG.getNode(
13773 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
13774 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
13775 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
13776
13777 for (int &M : Mask)
13778 if (M >= 0 && M == FixIdx)
13779 M = FixFreeIdx;
13780 else if (M >= 0 && M == FixFreeIdx)
13781 M = FixIdx;
13782 };
13783 if (NumFlippedBToBInputs != 0) {
13784 int BPinnedIdx =
13785 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
13786 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
13787 } else {
13788 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
13789 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
13790 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
13791 }
13792 }
13793 }
13794
13795 int PSHUFDMask[] = {0, 1, 2, 3};
13796 PSHUFDMask[ADWord] = BDWord;
13797 PSHUFDMask[BDWord] = ADWord;
13798 V = DAG.getBitcast(
13799 VT,
13800 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
13801 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13802
13803 // Adjust the mask to match the new locations of A and B.
13804 for (int &M : Mask)
13805 if (M >= 0 && M/2 == ADWord)
13806 M = 2 * BDWord + M % 2;
13807 else if (M >= 0 && M/2 == BDWord)
13808 M = 2 * ADWord + M % 2;
13809
13810 // Recurse back into this routine to re-compute state now that this isn't
13811 // a 3 and 1 problem.
13812 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
13813 };
13814 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
13815 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
13816 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
13817 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
13818
13819 // At this point there are at most two inputs to the low and high halves from
13820 // each half. That means the inputs can always be grouped into dwords and
13821 // those dwords can then be moved to the correct half with a dword shuffle.
13822 // We use at most one low and one high word shuffle to collect these paired
13823 // inputs into dwords, and finally a dword shuffle to place them.
13824 int PSHUFLMask[4] = {-1, -1, -1, -1};
13825 int PSHUFHMask[4] = {-1, -1, -1, -1};
13826 int PSHUFDMask[4] = {-1, -1, -1, -1};
13827
13828 // First fix the masks for all the inputs that are staying in their
13829 // original halves. This will then dictate the targets of the cross-half
13830 // shuffles.
13831 auto fixInPlaceInputs =
13832 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
13833 MutableArrayRef<int> SourceHalfMask,
13834 MutableArrayRef<int> HalfMask, int HalfOffset) {
13835 if (InPlaceInputs.empty())
13836 return;
13837 if (InPlaceInputs.size() == 1) {
13838 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13839 InPlaceInputs[0] - HalfOffset;
13840 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
13841 return;
13842 }
13843 if (IncomingInputs.empty()) {
13844 // Just fix all of the in place inputs.
13845 for (int Input : InPlaceInputs) {
13846 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
13847 PSHUFDMask[Input / 2] = Input / 2;
13848 }
13849 return;
13850 }
13851
13852 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
13853 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
13854 InPlaceInputs[0] - HalfOffset;
13855 // Put the second input next to the first so that they are packed into
13856 // a dword. We find the adjacent index by toggling the low bit.
13857 int AdjIndex = InPlaceInputs[0] ^ 1;
13858 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
13859 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
13860 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
13861 };
13862 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
13863 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
13864
13865 // Now gather the cross-half inputs and place them into a free dword of
13866 // their target half.
13867 // FIXME: This operation could almost certainly be simplified dramatically to
13868 // look more like the 3-1 fixing operation.
13869 auto moveInputsToRightHalf = [&PSHUFDMask](
13870 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
13871 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
13872 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
13873 int DestOffset) {
13874 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
13875 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
13876 };
13877 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
13878 int Word) {
13879 int LowWord = Word & ~1;
13880 int HighWord = Word | 1;
13881 return isWordClobbered(SourceHalfMask, LowWord) ||
13882 isWordClobbered(SourceHalfMask, HighWord);
13883 };
13884
13885 if (IncomingInputs.empty())
13886 return;
13887
13888 if (ExistingInputs.empty()) {
13889 // Map any dwords with inputs from them into the right half.
13890 for (int Input : IncomingInputs) {
13891 // If the source half mask maps over the inputs, turn those into
13892 // swaps and use the swapped lane.
13893 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
13894 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
13895 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
13896 Input - SourceOffset;
13897 // We have to swap the uses in our half mask in one sweep.
13898 for (int &M : HalfMask)
13899 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
13900 M = Input;
13901 else if (M == Input)
13902 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13903 } else {
13904 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
13905 Input - SourceOffset &&
13906 "Previous placement doesn't match!");
13907 }
13908 // Note that this correctly re-maps both when we do a swap and when
13909 // we observe the other side of the swap above. We rely on that to
13910 // avoid swapping the members of the input list directly.
13911 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
13912 }
13913
13914 // Map the input's dword into the correct half.
13915 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
13916 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
13917 else
13918 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
13919 Input / 2 &&
13920 "Previous placement doesn't match!");
13921 }
13922
13923 // And just directly shift any other-half mask elements to be same-half
13924 // as we will have mirrored the dword containing the element into the
13925 // same position within that half.
13926 for (int &M : HalfMask)
13927 if (M >= SourceOffset && M < SourceOffset + 4) {
13928 M = M - SourceOffset + DestOffset;
13929 assert(M >= 0 && "This should never wrap below zero!");
13930 }
13931 return;
13932 }
13933
13934 // Ensure we have the input in a viable dword of its current half. This
13935 // is particularly tricky because the original position may be clobbered
13936 // by inputs being moved and *staying* in that half.
13937 if (IncomingInputs.size() == 1) {
13938 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13939 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
13940 SourceOffset;
13941 SourceHalfMask[InputFixed - SourceOffset] =
13942 IncomingInputs[0] - SourceOffset;
13943 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
13944 InputFixed);
13945 IncomingInputs[0] = InputFixed;
13946 }
13947 } else if (IncomingInputs.size() == 2) {
13948 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
13949 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
13950 // We have two non-adjacent or clobbered inputs we need to extract from
13951 // the source half. To do this, we need to map them into some adjacent
13952 // dword slot in the source mask.
13953 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
13954 IncomingInputs[1] - SourceOffset};
13955
13956 // If there is a free slot in the source half mask adjacent to one of
13957 // the inputs, place the other input in it. We use (Index XOR 1) to
13958 // compute an adjacent index.
13959 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
13960 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
13961 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
13962 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13963 InputsFixed[1] = InputsFixed[0] ^ 1;
13964 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
13965 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
13966 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
13967 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
13968 InputsFixed[0] = InputsFixed[1] ^ 1;
13969 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
13970 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
13971 // The two inputs are in the same DWord but it is clobbered and the
13972 // adjacent DWord isn't used at all. Move both inputs to the free
13973 // slot.
13974 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
13975 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
13976 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
13977 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
13978 } else {
13979 // The only way we hit this point is if there is no clobbering
13980 // (because there are no off-half inputs to this half) and there is no
13981 // free slot adjacent to one of the inputs. In this case, we have to
13982 // swap an input with a non-input.
13983 for (int i = 0; i < 4; ++i)
13984 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
13985 "We can't handle any clobbers here!");
13986 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
13987 "Cannot have adjacent inputs here!");
13988
13989 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
13990 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
13991
13992 // We also have to update the final source mask in this case because
13993 // it may need to undo the above swap.
13994 for (int &M : FinalSourceHalfMask)
13995 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
13996 M = InputsFixed[1] + SourceOffset;
13997 else if (M == InputsFixed[1] + SourceOffset)
13998 M = (InputsFixed[0] ^ 1) + SourceOffset;
13999
14000 InputsFixed[1] = InputsFixed[0] ^ 1;
14001 }
14002
14003 // Point everything at the fixed inputs.
14004 for (int &M : HalfMask)
14005 if (M == IncomingInputs[0])
14006 M = InputsFixed[0] + SourceOffset;
14007 else if (M == IncomingInputs[1])
14008 M = InputsFixed[1] + SourceOffset;
14009
14010 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
14011 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
14012 }
14013 } else {
14014 llvm_unreachable("Unhandled input size!");
14015 }
14016
14017 // Now hoist the DWord down to the right half.
14018 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
14019 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
14020 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
14021 for (int &M : HalfMask)
14022 for (int Input : IncomingInputs)
14023 if (M == Input)
14024 M = FreeDWord * 2 + Input % 2;
14025 };
14026 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
14027 /*SourceOffset*/ 4, /*DestOffset*/ 0);
14028 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
14029 /*SourceOffset*/ 0, /*DestOffset*/ 4);
14030
14031 // Now enact all the shuffles we've computed to move the inputs into their
14032 // target half.
14033 if (!isNoopShuffleMask(PSHUFLMask))
14034 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14035 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
14036 if (!isNoopShuffleMask(PSHUFHMask))
14037 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14038 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
14039 if (!isNoopShuffleMask(PSHUFDMask))
14040 V = DAG.getBitcast(
14041 VT,
14042 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
14043 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14044
14045 // At this point, each half should contain all its inputs, and we can then
14046 // just shuffle them into their final position.
14047 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
14048 "Failed to lift all the high half inputs to the low mask!");
14049 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
14050 "Failed to lift all the low half inputs to the high mask!");
14051
14052 // Do a half shuffle for the low mask.
14053 if (!isNoopShuffleMask(LoMask))
14054 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
14055 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
14056
14057 // Do a half shuffle with the high mask after shifting its values down.
14058 for (int &M : HiMask)
14059 if (M >= 0)
14060 M -= 4;
14061 if (!isNoopShuffleMask(HiMask))
14062 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
14063 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
14064
14065 return V;
14066}
14067
14068/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
14069/// blend if only one input is used.
14070 static SDValue lowerShuffleAsBlendOfPSHUFBs(
14071 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14072 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
14074 "Lane crossing shuffle masks not supported");
14075
14076 int NumBytes = VT.getSizeInBits() / 8;
14077 int Size = Mask.size();
14078 int Scale = NumBytes / Size;
14079
14080 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14081 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
14082 V1InUse = false;
14083 V2InUse = false;
14084
14085 for (int i = 0; i < NumBytes; ++i) {
14086 int M = Mask[i / Scale];
14087 if (M < 0)
14088 continue;
14089
14090 const int ZeroMask = 0x80;
14091 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
14092 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
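// For example, with VT = v8i16 each mask entry covers Scale = 2 bytes: a mask
// value of 9 (word 1 of V2) expands to V1 byte selectors of 0x80 (zero) and
// V2 byte selectors 2 and 3.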
14093 if (Zeroable[i / Scale])
14094 V1Idx = V2Idx = ZeroMask;
14095
14096 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
14097 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
14098 V1InUse |= (ZeroMask != V1Idx);
14099 V2InUse |= (ZeroMask != V2Idx);
14100 }
14101
14102 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
14103 if (V1InUse)
14104 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
14105 DAG.getBuildVector(ShufVT, DL, V1Mask));
14106 if (V2InUse)
14107 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
14108 DAG.getBuildVector(ShufVT, DL, V2Mask));
14109
14110 // If we need shuffled inputs from both, blend the two.
14111 SDValue V;
14112 if (V1InUse && V2InUse)
14113 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
14114 else
14115 V = V1InUse ? V1 : V2;
14116
14117 // Cast the result back to the correct type.
14118 return DAG.getBitcast(VT, V);
14119}
14120
14121/// Generic lowering of 8-lane i16 shuffles.
14122///
14123/// This handles both single-input shuffles and combined shuffle/blends with
14124/// two inputs. The single input shuffles are immediately delegated to
14125/// a dedicated lowering routine.
14126///
14127/// The blends are lowered in one of three fundamental ways. If there are few
14128/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
14129/// of the input is significantly cheaper when lowered as an interleaving of
14130/// the two inputs, try to interleave them. Otherwise, blend the low and high
14131/// halves of the inputs separately (making them have relatively few inputs)
14132/// and then concatenate them.
14133 static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14134 const APInt &Zeroable, SDValue V1, SDValue V2,
14135 const X86Subtarget &Subtarget,
14136 SelectionDAG &DAG) {
14137 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14138 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
14139 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14140
14141 // Whenever we can lower this as a zext, that instruction is strictly faster
14142 // than any alternative.
14143 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
14144 Zeroable, Subtarget, DAG))
14145 return ZExt;
14146
14147 // Try to lower using a truncation.
14148 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14149 Subtarget, DAG))
14150 return V;
14151
14152 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
14153
14154 if (NumV2Inputs == 0) {
14155 // Try to use shift instructions.
14156 if (SDValue Shift =
14157 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
14158 Subtarget, DAG, /*BitwiseOnly*/ false))
14159 return Shift;
14160
14161 // Check for being able to broadcast a single element.
14162 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
14163 Mask, Subtarget, DAG))
14164 return Broadcast;
14165
14166 // Try to use bit rotation instructions.
14167 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
14168 Subtarget, DAG))
14169 return Rotate;
14170
14171 // Use dedicated unpack instructions for masks that match their pattern.
14172 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14173 return V;
14174
14175 // Use dedicated pack instructions for masks that match their pattern.
14176 if (SDValue V =
14177 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14178 return V;
14179
14180 // Try to use byte rotation instructions.
14181 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
14182 Subtarget, DAG))
14183 return Rotate;
14184
14185 // Make a copy of the mask so it can be modified.
14186 SmallVector<int, 8> MutableMask(Mask);
14187 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
14188 Subtarget, DAG);
14189 }
14190
14191 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
14192 "All single-input shuffles should be canonicalized to be V1-input "
14193 "shuffles.");
14194
14195 // Try to use shift instructions.
14196 if (SDValue Shift =
14197 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
14198 DAG, /*BitwiseOnly*/ false))
14199 return Shift;
14200
14201 // See if we can use SSE4A Extraction / Insertion.
14202 if (Subtarget.hasSSE4A())
14203 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
14204 Zeroable, DAG))
14205 return V;
14206
14207 // There are special ways we can lower some single-element blends.
14208 if (NumV2Inputs == 1)
14209 if (SDValue V = lowerShuffleAsElementInsertion(
14210 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14211 return V;
14212
14213 // We have different paths for blend lowering, but they all must use the
14214 // *exact* same predicate.
14215 bool IsBlendSupported = Subtarget.hasSSE41();
14216 if (IsBlendSupported)
14217 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
14218 Zeroable, Subtarget, DAG))
14219 return Blend;
14220
14221 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
14222 Zeroable, Subtarget, DAG))
14223 return Masked;
14224
14225 // Use dedicated unpack instructions for masks that match their pattern.
14226 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, V1, V2, Mask, DAG))
14227 return V;
14228
14229 // Use dedicated pack instructions for masks that match their pattern.
14230 if (SDValue V =
14231 lowerShuffleWithPACK(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
14232 return V;
14233
14234 // Try to lower using a truncation.
14235 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
14236 Subtarget, DAG))
14237 return V;
14238
14239 // Try to use byte rotation instructions.
14240 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
14241 Subtarget, DAG))
14242 return Rotate;
14243
14244 if (SDValue BitBlend =
14245 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
14246 return BitBlend;
14247
14248 // Try to use byte shift instructions to mask.
14249 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
14250 Zeroable, Subtarget, DAG))
14251 return V;
14252
14253 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
14254 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
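// For example, an even-element truncation mask such as
// [0, 2, 4, 6, 8, 10, 12, 14] is the kind of pattern detected here: clearing
// the upper word of every dword in V1 and V2 and PACKUS'ing the results keeps
// exactly the even-indexed words.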
14255 if ((NumEvenDrops == 1 || (NumEvenDrops == 2 && Subtarget.hasSSE41())) &&
14256 !Subtarget.hasVLX()) {
14257 // Check if this is part of a 256-bit vector truncation.
14258 unsigned PackOpc = 0;
14259 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
14260 V1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14261 V2.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
14262 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
14263 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
14264 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
14265 DAG.getTargetConstant(0xEE, DL, MVT::i8));
14266 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
14267 V1 = extract128BitVector(V1V2, 0, DAG, DL);
14268 V2 = extract128BitVector(V1V2, 4, DAG, DL);
14269 PackOpc = X86ISD::PACKUS;
14270 } else if (Subtarget.hasSSE41()) {
14271 SmallVector<SDValue, 4> DWordClearOps(4,
14272 DAG.getConstant(0, DL, MVT::i32));
14273 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
14274 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
14275 SDValue DWordClearMask =
14276 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
14277 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
14278 DWordClearMask);
14279 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
14280 DWordClearMask);
14281 PackOpc = X86ISD::PACKUS;
14282 } else if (!Subtarget.hasSSSE3()) {
14283 SDValue ShAmt = DAG.getTargetConstant(16, DL, MVT::i8);
14284 V1 = DAG.getBitcast(MVT::v4i32, V1);
14285 V2 = DAG.getBitcast(MVT::v4i32, V2);
14286 V1 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V1, ShAmt);
14287 V2 = DAG.getNode(X86ISD::VSHLI, DL, MVT::v4i32, V2, ShAmt);
14288 V1 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V1, ShAmt);
14289 V2 = DAG.getNode(X86ISD::VSRAI, DL, MVT::v4i32, V2, ShAmt);
14290 PackOpc = X86ISD::PACKSS;
14291 }
14292 if (PackOpc) {
14293 // Now pack things back together.
14294 SDValue Result = DAG.getNode(PackOpc, DL, MVT::v8i16, V1, V2);
14295 if (NumEvenDrops == 2) {
14296 Result = DAG.getBitcast(MVT::v4i32, Result);
14297 Result = DAG.getNode(PackOpc, DL, MVT::v8i16, Result, Result);
14298 }
14299 return Result;
14300 }
14301 }
14302
14303 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
14304 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
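// For example, an odd-element truncation mask such as
// [1, 3, 5, 7, 9, 11, 13, 15] is the pattern handled here: shifting every
// dword right by 16 moves the odd words into the low word positions, and the
// PACK then keeps them.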
14305 if (NumOddDrops == 1) {
14306 bool HasSSE41 = Subtarget.hasSSE41();
14307 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14308 DAG.getBitcast(MVT::v4i32, V1),
14309 DAG.getTargetConstant(16, DL, MVT::i8));
14310 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
14311 DAG.getBitcast(MVT::v4i32, V2),
14312 DAG.getTargetConstant(16, DL, MVT::i8));
14313 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
14314 MVT::v8i16, V1, V2);
14315 }
14316
14317 // Try to lower by permuting the inputs into an unpack instruction.
14318 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
14319 Mask, Subtarget, DAG))
14320 return Unpack;
14321
14322 // If we can't directly blend but can use PSHUFB, that will be better as it
14323 // can both shuffle and set up the inefficient blend.
14324 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
14325 bool V1InUse, V2InUse;
14326 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
14327 Zeroable, DAG, V1InUse, V2InUse);
14328 }
14329
14330 // We can always bit-blend if we have to so the fallback strategy is to
14331 // decompose into single-input permutes and blends/unpacks.
14332 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2, Mask,
14333 Zeroable, Subtarget, DAG);
14334}
14335
14336/// Lower 8-lane 16-bit floating point shuffles.
14337 static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14338 const APInt &Zeroable, SDValue V1, SDValue V2,
14339 const X86Subtarget &Subtarget,
14340 SelectionDAG &DAG) {
14341 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14342 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
14343 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14344 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
14345
14346 if (Subtarget.hasFP16()) {
14347 if (NumV2Elements == 0) {
14348 // Check for being able to broadcast a single element.
14349 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
14350 Mask, Subtarget, DAG))
14351 return Broadcast;
14352 }
14353 if (NumV2Elements == 1 && Mask[0] >= 8)
14354 if (SDValue V = lowerShuffleAsElementInsertion(
14355 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14356 return V;
14357 }
14358
14359 V1 = DAG.getBitcast(MVT::v8i16, V1);
14360 V2 = DAG.getBitcast(MVT::v8i16, V2);
14361 return DAG.getBitcast(MVT::v8f16,
14362 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14363}
14364
14365 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
14366 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
14367 // the active subvector is extracted.
14368 static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
14369 ArrayRef<int> OriginalMask, SDValue V1,
14370 SDValue V2, const X86Subtarget &Subtarget,
14371 SelectionDAG &DAG) {
14372 // Commute binary inputs so V2 is a load to simplify VPERMI2/T2 folds.
14373 SmallVector<int, 32> Mask(OriginalMask);
14374 if (!V2.isUndef() && isShuffleFoldableLoad(V1) &&
14375 !isShuffleFoldableLoad(V2)) {
14376 ShuffleVectorSDNode::commuteMask(Mask);
14377 std::swap(V1, V2);
14378 }
14379
14380 MVT MaskVT = VT.changeTypeToInteger();
14381 SDValue MaskNode;
14382 MVT ShuffleVT = VT;
14383 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
14384 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
14385 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
14386 ShuffleVT = V1.getSimpleValueType();
14387
14388 // Adjust mask to correct indices for the second input.
14389 int NumElts = VT.getVectorNumElements();
14390 unsigned Scale = 512 / VT.getSizeInBits();
14391 SmallVector<int, 32> AdjustedMask(Mask);
14392 for (int &M : AdjustedMask)
14393 if (NumElts <= M)
14394 M += (Scale - 1) * NumElts;
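// For example, with a 256-bit VT (Scale == 2, NumElts == 8) a mask entry of 9
// (element 1 of V2) becomes 17; indices 16-31 address the second 512-bit
// operand of VPERMV3, so 17 still selects V2's element 1.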
14395 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
14396 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
14397 } else {
14398 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
14399 }
14400
14401 SDValue Result;
14402 if (V2.isUndef())
14403 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
14404 else
14405 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
14406
14407 if (VT != ShuffleVT)
14408 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
14409
14410 return Result;
14411}
14412
14413/// Generic lowering of v16i8 shuffles.
14414///
14415/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
14416/// detect any complexity-reducing interleaving. If that doesn't help, it uses
14417/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
14418/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
14419/// back together.
14420 static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
14421 const APInt &Zeroable, SDValue V1, SDValue V2,
14422 const X86Subtarget &Subtarget,
14423 SelectionDAG &DAG) {
14424 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14425 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
14426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14427
14428 // Try to use shift instructions.
14429 if (SDValue Shift =
14430 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
14431 DAG, /*BitwiseOnly*/ false))
14432 return Shift;
14433
14434 // Try to use byte rotation instructions.
14435 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
14436 Subtarget, DAG))
14437 return Rotate;
14438
14439 // Use dedicated pack instructions for masks that match their pattern.
14440 if (SDValue V =
14441 lowerShuffleWithPACK(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14442 return V;
14443
14444 // Try to use a zext lowering.
14445 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
14446 Zeroable, Subtarget, DAG))
14447 return ZExt;
14448
14449 // Try to lower using a truncation.
14450 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14451 Subtarget, DAG))
14452 return V;
14453
14454 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
14455 Subtarget, DAG))
14456 return V;
14457
14458 // See if we can use SSE4A Extraction / Insertion.
14459 if (Subtarget.hasSSE4A())
14460 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
14461 Zeroable, DAG))
14462 return V;
14463
14464 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
14465
14466 // For single-input shuffles, there are some nicer lowering tricks we can use.
14467 if (NumV2Elements == 0) {
14468 // Check for being able to broadcast a single element.
14469 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
14470 Mask, Subtarget, DAG))
14471 return Broadcast;
14472
14473 // Try to use bit rotation instructions.
14474 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
14475 Subtarget, DAG))
14476 return Rotate;
14477
14478 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14479 return V;
14480
14481 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
14482 // Notably, this handles splat and partial-splat shuffles more efficiently.
14483 // However, it only makes sense if the pre-duplication shuffle simplifies
14484 // things significantly. Currently, this means we need to be able to
14485 // express the pre-duplication shuffle as an i16 shuffle.
14486 //
14487 // FIXME: We should check for other patterns which can be widened into an
14488 // i16 shuffle as well.
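// For example, the mask <7,7,6,6,5,5,4,4,3,3,2,2,1,1,0,0> duplicates every
// byte it references, so it can be lowered as an UNPCKL of V1 with itself
// followed by a v8i16 shuffle that reverses the resulting words.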
14489 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
14490 for (int i = 0; i < 16; i += 2)
14491 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
14492 return false;
14493
14494 return true;
14495 };
14496 auto tryToWidenViaDuplication = [&]() -> SDValue {
14497 if (!canWidenViaDuplication(Mask))
14498 return SDValue();
14499 SmallVector<int, 4> LoInputs;
14500 copy_if(Mask, std::back_inserter(LoInputs),
14501 [](int M) { return M >= 0 && M < 8; });
14502 array_pod_sort(LoInputs.begin(), LoInputs.end());
14503 LoInputs.erase(llvm::unique(LoInputs), LoInputs.end());
14504 SmallVector<int, 4> HiInputs;
14505 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
14506 array_pod_sort(HiInputs.begin(), HiInputs.end());
14507 HiInputs.erase(llvm::unique(HiInputs), HiInputs.end());
14508
14509 bool TargetLo = LoInputs.size() >= HiInputs.size();
14510 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
14511 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
14512
14513 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
14514 SmallDenseMap<int, int, 8> LaneMap;
14515 for (int I : InPlaceInputs) {
14516 PreDupI16Shuffle[I/2] = I/2;
14517 LaneMap[I] = I;
14518 }
14519 int j = TargetLo ? 0 : 4, je = j + 4;
14520 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
14521 // Check if j is already a shuffle of this input. This happens when
14522 // there are two adjacent bytes after we move the low one.
14523 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
14524 // If we haven't yet mapped the input, search for a slot into which
14525 // we can map it.
14526 while (j < je && PreDupI16Shuffle[j] >= 0)
14527 ++j;
14528
14529 if (j == je)
14530 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
14531 return SDValue();
14532
14533 // Map this input with the i16 shuffle.
14534 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
14535 }
14536
14537 // Update the lane map based on the mapping we ended up with.
14538 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
14539 }
14540 V1 = DAG.getBitcast(
14541 MVT::v16i8,
14542 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14543 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
14544
14545 // Unpack the bytes to form the i16s that will be shuffled into place.
14546 bool EvenInUse = false, OddInUse = false;
14547 for (int i = 0; i < 16; i += 2) {
14548 EvenInUse |= (Mask[i + 0] >= 0);
14549 OddInUse |= (Mask[i + 1] >= 0);
14550 if (EvenInUse && OddInUse)
14551 break;
14552 }
14553 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
14554 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
14555 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
14556
14557 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
14558 for (int i = 0; i < 16; ++i)
14559 if (Mask[i] >= 0) {
14560 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
14561 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
14562 if (PostDupI16Shuffle[i / 2] < 0)
14563 PostDupI16Shuffle[i / 2] = MappedMask;
14564 else
14565 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
14566 "Conflicting entries in the original shuffle!");
14567 }
14568 return DAG.getBitcast(
14569 MVT::v16i8,
14570 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
14571 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
14572 };
14573 if (SDValue V = tryToWidenViaDuplication())
14574 return V;
14575 }
14576
14577 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
14578 Zeroable, Subtarget, DAG))
14579 return Masked;
14580
14581 // Use dedicated unpack instructions for masks that match their pattern.
14582 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, V1, V2, Mask, DAG))
14583 return V;
14584
14585 // Try to use byte shift instructions to mask.
14586 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
14587 Zeroable, Subtarget, DAG))
14588 return V;
14589
14590 // Check for compaction patterns.
14591 bool IsSingleInput = V2.isUndef();
14592 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
14593
14594 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
14595 // with PSHUFB. It is important to do this before we attempt to generate any
14596 // blends but after all of the single-input lowerings. If the single input
14597 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
14598 // want to preserve that and we can DAG combine any longer sequences into
14599 // a PSHUFB in the end. But once we start blending from multiple inputs,
14600 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
14601 // and there are *very* few patterns that would actually be faster than the
14602 // PSHUFB approach because of its ability to zero lanes.
14603 //
14604 // If the mask is a binary compaction, we can more efficiently perform this
14605 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
14606 //
14607 // FIXME: The only exceptions to the above are blends which are exact
14608 // interleavings with direct instructions supporting them. We currently don't
14609 // handle those well here.
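// For example, the binary compaction mask
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30> takes every even byte of both
// inputs; clearing the odd bytes of each input with an AND against 0x00FF
// words and PACKUSWB'ing the two results produces it without any PSHUFBs.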
14610 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
14611 bool V1InUse = false;
14612 bool V2InUse = false;
14613
14614 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
14615 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
14616
14617 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
14618 // do so. This avoids using them to handle blends-with-zero which is
14619 // important as a single pshufb is significantly faster for that.
14620 if (V1InUse && V2InUse) {
14621 if (Subtarget.hasSSE41())
14622 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
14623 Zeroable, Subtarget, DAG))
14624 return Blend;
14625
14626 // We can use an unpack to do the blending rather than an or in some
14627 // cases. Even though the OR may be (very slightly) more efficient, we
14628 // prefer this lowering because there are common cases where part of
14629 // the complexity of the shuffles goes away when we do the final blend as
14630 // an unpack.
14631 // FIXME: It might be worth trying to detect if the unpack-feeding
14632 // shuffles will both be pshufb, in which case we shouldn't bother with
14633 // this.
14634 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
14635 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14636 return Unpack;
14637
14638 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
14639 if (Subtarget.hasVBMI())
14640 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
14641 DAG);
14642
14643 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
14644 if (Subtarget.hasXOP()) {
14645 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
14646 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
14647 }
14648
14649 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
14650 // PALIGNR will be cheaper than the second PSHUFB+OR.
14651 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
14652 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
14653 return V;
14654 }
14655
14656 return PSHUFB;
14657 }
14658
14659 // There are special ways we can lower some single-element blends.
14660 if (NumV2Elements == 1)
14661 if (SDValue V = lowerShuffleAsElementInsertion(
14662 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14663 return V;
14664
14665 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
14666 return Blend;
14667
14668 // Check whether a compaction lowering can be done. This handles shuffles
14669 // which take every Nth element for some even N. See the helper function for
14670 // details.
14671 //
14672 // We special case these as they can be particularly efficiently handled with
14673 // the PACKUSWB instruction on x86 and they show up in common patterns of
14674 // rearranging bytes to truncate wide elements.
14675 if (NumEvenDrops) {
14676 // NumEvenDrops is the power of two stride of the elements. Another way of
14677 // thinking about it is that we need to drop the even elements this many
14678 // times to get the original input.
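// For example, NumEvenDrops == 2 corresponds to a mask that keeps every 4th
// byte, such as <0,4,8,12,...>; after clearing the dropped bytes, two PACKUS
// stages are needed to compact the result.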
14679
14680 // First we need to zero all the dropped bytes.
14681 assert(NumEvenDrops <= 3 &&
14682 "No support for dropping even elements more than 3 times.");
14683 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
14684 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
14685 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
14686 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
14687 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
14688 WordClearMask);
14689 if (!IsSingleInput)
14690 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
14691 WordClearMask);
14692
14693 // Now pack things back together.
14694 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14695 IsSingleInput ? V1 : V2);
14696 for (int i = 1; i < NumEvenDrops; ++i) {
14697 Result = DAG.getBitcast(MVT::v8i16, Result);
14698 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
14699 }
14700 return Result;
14701 }
14702
14703 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
14704 if (NumOddDrops == 1) {
14705 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14706 DAG.getBitcast(MVT::v8i16, V1),
14707 DAG.getTargetConstant(8, DL, MVT::i8));
14708 if (!IsSingleInput)
14709 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
14710 DAG.getBitcast(MVT::v8i16, V2),
14711 DAG.getTargetConstant(8, DL, MVT::i8));
14712 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
14713 IsSingleInput ? V1 : V2);
14714 }
14715
14716 // Handle multi-input cases by blending/unpacking single-input shuffles.
14717 if (NumV2Elements > 0)
14718 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
14719 Zeroable, Subtarget, DAG);
14720
14721 // The fallback path for single-input shuffles widens this into two v8i16
14722 // vectors with unpacks, shuffles those, and then pulls them back together
14723 // with a pack.
14724 SDValue V = V1;
14725
14726 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14727 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
14728 for (int i = 0; i < 16; ++i)
14729 if (Mask[i] >= 0)
14730 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
14731
14732 SDValue VLoHalf, VHiHalf;
14733 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
14734 // them out and avoid using UNPCK{L,H} to extract the elements of V as
14735 // i16s.
14736 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
14737 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
14738 // Use a mask to drop the high bytes.
14739 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
14740 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
14741 DAG.getConstant(0x00FF, DL, MVT::v8i16));
14742
14743 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
14744 VHiHalf = DAG.getUNDEF(MVT::v8i16);
14745
14746 // Squash the masks to point directly into VLoHalf.
14747 for (int &M : LoBlendMask)
14748 if (M >= 0)
14749 M /= 2;
14750 for (int &M : HiBlendMask)
14751 if (M >= 0)
14752 M /= 2;
14753 } else {
14754 // Otherwise just unpack the low half of V into VLoHalf and the high half into
14755 // VHiHalf so that we can blend them as i16s.
14756 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
14757
14758 VLoHalf = DAG.getBitcast(
14759 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
14760 VHiHalf = DAG.getBitcast(
14761 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
14762 }
14763
14764 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
14765 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
14766
14767 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
14768}
14769
14770/// Dispatching routine to lower various 128-bit x86 vector shuffles.
14771///
14772/// This routine breaks down the specific type of 128-bit shuffle and
14773/// dispatches to the lowering routines accordingly.
14774static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14775 MVT VT, SDValue V1, SDValue V2,
14776 const APInt &Zeroable,
14777 const X86Subtarget &Subtarget,
14778 SelectionDAG &DAG) {
14779 if (VT == MVT::v8bf16) {
14780 V1 = DAG.getBitcast(MVT::v8i16, V1);
14781 V2 = DAG.getBitcast(MVT::v8i16, V2);
14782 return DAG.getBitcast(VT,
14783 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
14784 }
14785
14786 switch (VT.SimpleTy) {
14787 case MVT::v2i64:
14788 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14789 case MVT::v2f64:
14790 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14791 case MVT::v4i32:
14792 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14793 case MVT::v4f32:
14794 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14795 case MVT::v8i16:
14796 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14797 case MVT::v8f16:
14798 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14799 case MVT::v16i8:
14800 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14801
14802 default:
14803 llvm_unreachable("Unimplemented!");
14804 }
14805}
14806
14807/// Generic routine to split vector shuffle into half-sized shuffles.
14808///
14809/// This routine just extracts two subvectors, shuffles them independently, and
14810/// then concatenates them back together. This should work effectively with all
14811/// AVX vector shuffle types.
14812static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
14813 SDValue V2, ArrayRef<int> Mask,
14814 SelectionDAG &DAG, bool SimpleOnly) {
14815 assert(VT.getSizeInBits() >= 256 &&
14816 "Only for 256-bit or wider vector shuffles!");
14817 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
14818 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
14819
14820 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
14821 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
14822
14823 int NumElements = VT.getVectorNumElements();
14824 int SplitNumElements = NumElements / 2;
14825 MVT ScalarVT = VT.getVectorElementType();
14826 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
14827
14828 // Use splitVector/extractSubVector so that split build-vectors just build two
14829 // narrower build vectors. This helps shuffling with splats and zeros.
14830 auto SplitVector = [&](SDValue V) {
14831 SDValue LoV, HiV;
14832 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
14833 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
14834 DAG.getBitcast(SplitVT, HiV));
14835 };
14836
14837 SDValue LoV1, HiV1, LoV2, HiV2;
14838 std::tie(LoV1, HiV1) = SplitVector(V1);
14839 std::tie(LoV2, HiV2) = SplitVector(V2);
14840
14841 // Now create two 4-way blends of these half-width vectors.
14842 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
14843 bool &UseHiV1, bool &UseLoV2,
14844 bool &UseHiV2) {
14845 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
14846 for (int i = 0; i < SplitNumElements; ++i) {
14847 int M = HalfMask[i];
14848 if (M >= NumElements) {
14849 if (M >= NumElements + SplitNumElements)
14850 UseHiV2 = true;
14851 else
14852 UseLoV2 = true;
14853 } else if (M >= 0) {
14854 if (M >= SplitNumElements)
14855 UseHiV1 = true;
14856 else
14857 UseLoV1 = true;
14858 }
14859 }
14860 };
14861
14862 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
14863 if (!SimpleOnly)
14864 return true;
14865
14866 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14867 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14868
14869 return !(UseHiV1 || UseHiV2);
14870 };
14871
14872 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
14873 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
14874 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
14875 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
14876 for (int i = 0; i < SplitNumElements; ++i) {
14877 int M = HalfMask[i];
14878 if (M >= NumElements) {
14879 V2BlendMask[i] = M - NumElements;
14880 BlendMask[i] = SplitNumElements + i;
14881 } else if (M >= 0) {
14882 V1BlendMask[i] = M;
14883 BlendMask[i] = i;
14884 }
14885 }
14886
14887 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
14888 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
14889
14890 // Because the lowering happens after all combining takes place, we need to
14891 // manually combine these blend masks as much as possible so that we create
14892 // a minimal number of high-level vector shuffle nodes.
14893 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
14894
14895 // First try just blending the halves of V1 or V2.
14896 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
14897 return DAG.getUNDEF(SplitVT);
14898 if (!UseLoV2 && !UseHiV2)
14899 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14900 if (!UseLoV1 && !UseHiV1)
14901 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14902
14903 SDValue V1Blend, V2Blend;
14904 if (UseLoV1 && UseHiV1) {
14905 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
14906 } else {
14907 // We only use half of V1 so map the usage down into the final blend mask.
14908 V1Blend = UseLoV1 ? LoV1 : HiV1;
14909 for (int i = 0; i < SplitNumElements; ++i)
14910 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
14911 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
14912 }
14913 if (UseLoV2 && UseHiV2) {
14914 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
14915 } else {
14916 // We only use half of V2 so map the usage down into the final blend mask.
14917 V2Blend = UseLoV2 ? LoV2 : HiV2;
14918 for (int i = 0; i < SplitNumElements; ++i)
14919 if (BlendMask[i] >= SplitNumElements)
14920 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
14921 }
14922 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
14923 };
14924
14925 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
14926 return SDValue();
14927
14928 SDValue Lo = HalfBlend(LoMask);
14929 SDValue Hi = HalfBlend(HiMask);
14930 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14931}
14932
14933/// Either split a vector in halves or decompose the shuffles and the
14934/// blend/unpack.
14935///
14936/// This is provided as a good fallback for many lowerings of non-single-input
14937/// shuffles with more than one 128-bit lane. In those cases, we want to select
14938/// between splitting the shuffle into 128-bit components and stitching those
14939/// back together vs. extracting the single-input shuffles and blending those
14940/// results.
14941static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
14942 SDValue V2, ArrayRef<int> Mask,
14943 const APInt &Zeroable,
14944 const X86Subtarget &Subtarget,
14945 SelectionDAG &DAG) {
14946 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
14947 "shuffles as it could then recurse on itself.");
14948 int Size = Mask.size();
14949
14950 // If this can be modeled as a broadcast of two elements followed by a blend,
14951 // prefer that lowering. This is especially important because broadcasts can
14952 // often fold with memory operands.
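// For example, the v8f32 mask <2,10,2,10,2,2,10,10> only ever selects
// element 2 of V1 and element 2 of V2, so it can be lowered as two
// broadcasts followed by a blend.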
14953 auto DoBothBroadcast = [&] {
14954 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
14955 for (int M : Mask)
14956 if (M >= Size) {
14957 if (V2BroadcastIdx < 0)
14958 V2BroadcastIdx = M - Size;
14959 else if (M - Size != V2BroadcastIdx)
14960 return false;
14961 } else if (M >= 0) {
14962 if (V1BroadcastIdx < 0)
14963 V1BroadcastIdx = M;
14964 else if (M != V1BroadcastIdx)
14965 return false;
14966 }
14967 return true;
14968 };
14969 if (DoBothBroadcast())
14970 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
14971 Subtarget, DAG);
14972
14973 // If the inputs all stem from a single 128-bit lane of each input, then we
14974 // split them rather than blending because the split will decompose to
14975 // unusually few instructions.
14976 int LaneCount = VT.getSizeInBits() / 128;
14977 int LaneSize = Size / LaneCount;
14978 SmallBitVector LaneInputs[2];
14979 LaneInputs[0].resize(LaneCount, false);
14980 LaneInputs[1].resize(LaneCount, false);
14981 for (int i = 0; i < Size; ++i)
14982 if (Mask[i] >= 0)
14983 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
14984 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
14985 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
14986 /*SimpleOnly*/ false);
14987
14988 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
14989 // requires that the decomposed single-input shuffles don't end up here.
14990 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Zeroable,
14991 Subtarget, DAG);
14992}
14993
14994// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
14995// TODO: Extend to support v8f32 (+ 512-bit shuffles).
14996static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
14997 SDValue V1, SDValue V2,
14998 ArrayRef<int> Mask,
14999 SelectionDAG &DAG) {
15000 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
15001
15002 int LHSMask[4] = {-1, -1, -1, -1};
15003 int RHSMask[4] = {-1, -1, -1, -1};
15004 int SHUFPDMask[4] = {-1, -1, -1, -1};
15005
15006 // As SHUFPD uses a single LHS/RHS element per lane, we can always
15007 // perform the shuffle once the lanes have been shuffled in place.
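// For example, for the v4f64 mask <2,4,1,7> the in-place shuffles become
// LHS = <2,u,u,1> and RHS = <4,u,u,7>, and the final SHUFPD immediate mask
// is <0,0,1,1>.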
15008 for (int i = 0; i != 4; ++i) {
15009 int M = Mask[i];
15010 if (M < 0)
15011 continue;
15012 int LaneBase = i & ~1;
15013 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
15014 LaneMask[LaneBase + (M & 1)] = M;
15015 SHUFPDMask[i] = M & 1;
15016 }
15017
15018 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
15019 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
15020 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
15021 getSHUFPDImmForMask(SHUFPDMask, DL, DAG));
15022}
15023
15024/// Lower a vector shuffle crossing multiple 128-bit lanes as
15025/// a lane permutation followed by a per-lane permutation.
15026///
15027/// This is mainly for cases where we can have non-repeating permutes
15028/// in each lane.
15029///
15030/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
15031/// we should investigate merging them.
15032static SDValue lowerShuffleAsLanePermuteAndPermute(
15033 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15034 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15035 int NumElts = VT.getVectorNumElements();
15036 int NumLanes = VT.getSizeInBits() / 128;
15037 int NumEltsPerLane = NumElts / NumLanes;
15038 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
15039
15040 /// Attempts to find a sublane permute with the given size
15041 /// that gets all elements into their target lanes.
15042 ///
15043 /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered
15044 /// shuffle; if unsuccessful, returns SDValue() and may overwrite InLaneMask.
15045 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
15046 int NumSublanesPerLane = NumSublanes / NumLanes;
15047 int NumEltsPerSublane = NumElts / NumSublanes;
15048
15049 SmallVector<int, 16> CrossLaneMask;
15050 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
15051 // CrossLaneMask but one entry == one sublane.
15052 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
15053 APInt DemandedCrossLane = APInt::getZero(NumElts);
15054
15055 for (int i = 0; i != NumElts; ++i) {
15056 int M = Mask[i];
15057 if (M < 0)
15058 continue;
15059
15060 int SrcSublane = M / NumEltsPerSublane;
15061 int DstLane = i / NumEltsPerLane;
15062
15063 // We only need to get the elements into the right lane, not sublane.
15064 // So search all sublanes that make up the destination lane.
15065 bool Found = false;
15066 int DstSubStart = DstLane * NumSublanesPerLane;
15067 int DstSubEnd = DstSubStart + NumSublanesPerLane;
15068 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
15069 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
15070 continue;
15071
15072 Found = true;
15073 CrossLaneMaskLarge[DstSublane] = SrcSublane;
15074 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
15075 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
15076 DemandedCrossLane.setBit(InLaneMask[i]);
15077 break;
15078 }
15079 if (!Found)
15080 return SDValue();
15081 }
15082
15083 // Fill CrossLaneMask using CrossLaneMaskLarge.
15084 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
15085
15086 if (!CanUseSublanes) {
15087 // If we're only shuffling a single lowest lane and the rest are identity
15088 // then don't bother.
15089 // TODO - isShuffleMaskInputInPlace could be extended to something like
15090 // this.
15091 int NumIdentityLanes = 0;
15092 bool OnlyShuffleLowestLane = true;
15093 for (int i = 0; i != NumLanes; ++i) {
15094 int LaneOffset = i * NumEltsPerLane;
15095 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
15096 i * NumEltsPerLane))
15097 NumIdentityLanes++;
15098 else if (CrossLaneMask[LaneOffset] != 0)
15099 OnlyShuffleLowestLane = false;
15100 }
15101 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
15102 return SDValue();
15103 }
15104
15105 // Avoid returning the same shuffle operation. For example,
15106 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
15107 // undef:v16i16
15108 if (CrossLaneMask == Mask || InLaneMask == Mask)
15109 return SDValue();
15110
15111 // Simplify CrossLaneMask based on the actual demanded elements.
15112 if (V1.hasOneUse())
15113 for (int i = 0; i != NumElts; ++i)
15114 if (!DemandedCrossLane[i])
15115 CrossLaneMask[i] = SM_SentinelUndef;
15116
15117 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
15118 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
15119 InLaneMask);
15120 };
15121
15122 // First attempt a solution with full lanes.
15123 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
15124 return V;
15125
15126 // The rest of the solutions use sublanes.
15127 if (!CanUseSublanes)
15128 return SDValue();
15129
15130 // Then attempt a solution with 64-bit sublanes (vpermq).
15131 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
15132 return V;
15133
15134 // If that doesn't work and we have fast variable cross-lane shuffle,
15135 // attempt 32-bit sublanes (vpermd).
15136 if (!Subtarget.hasFastVariableCrossLaneShuffle())
15137 return SDValue();
15138
15139 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
15140}
15141
15142/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
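/// e.g. for a v8f32 mask <0,1,12,13,4,5,10,11> with LaneSize 4, the cross-lane
/// elements are remapped to the same in-lane position read from the second
/// shuffle operand, giving <0,1,8,9,4,5,14,15>.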
15143static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
15144 SmallVector<int> &InLaneMask) {
15145 int Size = Mask.size();
15146 InLaneMask.assign(Mask.begin(), Mask.end());
15147 for (int i = 0; i < Size; ++i) {
15148 int &M = InLaneMask[i];
15149 if (M < 0)
15150 continue;
15151 if (((M % Size) / LaneSize) != (i / LaneSize))
15152 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
15153 }
15154}
15155
15156/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
15157/// source with a lane permutation.
15158///
15159/// This lowering strategy results in four instructions in the worst case for a
15160/// single-input cross lane shuffle which is lower than any other fully general
15161/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
15162/// shuffle pattern should be handled prior to trying this lowering.
15163static SDValue lowerShuffleAsLanePermuteAndShuffle(
15164 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15165 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
15166 // FIXME: This should probably be generalized for 512-bit vectors as well.
15167 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
15168 int Size = Mask.size();
15169 int LaneSize = Size / 2;
15170
15171 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
15172 // Only do this if the elements aren't all from the lower lane,
15173 // otherwise we're (probably) better off doing a split.
15174 if (VT == MVT::v4f64 &&
15175 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
15176 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
15177
15178 // If there are only inputs from one 128-bit lane, splitting will in fact be
15179 // less expensive. The flags track whether the given lane contains an element
15180 // that crosses to another lane.
15181 bool AllLanes;
15182 if (!Subtarget.hasAVX2()) {
15183 bool LaneCrossing[2] = {false, false};
15184 for (int i = 0; i < Size; ++i)
15185 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
15186 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
15187 AllLanes = LaneCrossing[0] && LaneCrossing[1];
15188 } else {
15189 bool LaneUsed[2] = {false, false};
15190 for (int i = 0; i < Size; ++i)
15191 if (Mask[i] >= 0)
15192 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
15193 AllLanes = LaneUsed[0] && LaneUsed[1];
15194 }
15195
15196 // TODO - we could support shuffling V2 in the Flipped input.
15197 assert(V2.isUndef() &&
15198 "This last part of this routine only works on single input shuffles");
15199
15200 SmallVector<int> InLaneMask;
15201 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
15202
15203 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
15204 "In-lane shuffle mask expected");
15205
15206 // If we aren't using both lanes and the in-lane mask is not
15207 // repeating, then we're better off splitting.
15208 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
15209 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
15210 /*SimpleOnly*/ false);
15211
15212 // Flip the lanes, and shuffle the results which should now be in-lane.
15213 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
15214 SDValue Flipped = DAG.getBitcast(PVT, V1);
15215 Flipped =
15216 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
15217 Flipped = DAG.getBitcast(VT, Flipped);
15218 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
15219}
15220
15221/// Handle lowering 2-lane 128-bit shuffles.
15222static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
15223 SDValue V2, ArrayRef<int> Mask,
15224 const APInt &Zeroable,
15225 const X86Subtarget &Subtarget,
15226 SelectionDAG &DAG) {
15227 if (V2.isUndef()) {
15228 // Attempt to match VBROADCAST*128 subvector broadcast load.
15229 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
15230 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
15231 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
15232 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
15233 MVT MemVT = VT.getHalfNumVectorElementsVT();
15234 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
15235 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
15236 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
15237 VT, MemVT, Ld, Ofs, DAG))
15238 return BcstLd;
15239 }
15240
15241 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
15242 if (Subtarget.hasAVX2())
15243 return SDValue();
15244 }
15245
15246 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
15247
15248 SmallVector<int, 4> WidenedMask;
15249 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
15250 return SDValue();
15251
15252 bool IsLowZero = (Zeroable & 0x3) == 0x3;
15253 bool IsHighZero = (Zeroable & 0xc) == 0xc;
15254
15255 // Try to use an insert into a zero vector.
15256 if (WidenedMask[0] == 0 && IsHighZero) {
15257 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15258 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
15259 DAG.getIntPtrConstant(0, DL));
15260 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
15261 getZeroVector(VT, Subtarget, DAG, DL), LoV,
15262 DAG.getIntPtrConstant(0, DL));
15263 }
15264
15265 // TODO: If minimizing size and one of the inputs is a zero vector and the
15266 // zero vector has only one use, we could use a VPERM2X128 to save the
15267 // instruction bytes needed to explicitly generate the zero vector.
15268
15269 // Blends are faster and handle all the non-lane-crossing cases.
15270 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
15271 Subtarget, DAG))
15272 return Blend;
15273
15274 // If either input operand is a zero vector, use VPERM2X128 because its mask
15275 // allows us to replace the zero input with an implicit zero.
15276 if (!IsLowZero && !IsHighZero) {
15277 // Check for patterns which can be matched with a single insert of a 128-bit
15278 // subvector.
15279 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
15280 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
15281
15282 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
15283 // this will likely become vinsertf128 which can't fold a 256-bit memop.
15284 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
15285 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
15286 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
15287 OnlyUsesV1 ? V1 : V2,
15288 DAG.getIntPtrConstant(0, DL));
15289 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
15290 DAG.getIntPtrConstant(2, DL));
15291 }
15292 }
15293
15294 // Try to use SHUF128 if possible.
15295 if (Subtarget.hasVLX()) {
15296 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
15297 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
15298 ((WidenedMask[1] % 2) << 1);
15299 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
15300 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15301 }
15302 }
15303 }
15304
15305 // Otherwise form a 128-bit permutation. After accounting for undefs,
15306 // convert the 64-bit shuffle mask selection values into 128-bit
15307 // selection bits by dividing the indexes by 2 and shifting into positions
15308 // defined by a vperm2*128 instruction's immediate control byte.
15309
15310 // The immediate permute control byte looks like this:
15311 // [1:0] - select 128 bits from sources for low half of destination
15312 // [2] - ignore
15313 // [3] - zero low half of destination
15314 // [5:4] - select 128 bits from sources for high half of destination
15315 // [6] - ignore
15316 // [7] - zero high half of destination
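// For example, a widened mask <1, 2> selects the upper 128 bits of V1 for the
// low half of the destination and the lower 128 bits of V2 for the high half,
// giving an immediate of 0x21.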
15317
15318 assert((WidenedMask[0] >= 0 || IsLowZero) &&
15319 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
15320
15321 unsigned PermMask = 0;
15322 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
15323 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
15324
15325 // Check the immediate mask and replace unused sources with undef.
15326 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
15327 V1 = DAG.getUNDEF(VT);
15328 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
15329 V2 = DAG.getUNDEF(VT);
15330
15331 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
15332 DAG.getTargetConstant(PermMask, DL, MVT::i8));
15333}
15334
15335/// Lower a vector shuffle by first fixing the 128-bit lanes and then
15336/// shuffling each lane.
15337///
15338/// This attempts to create a repeated lane shuffle where each lane uses one
15339/// or two of the lanes of the inputs. The lanes of the input vectors are
15340/// shuffled in one or two independent shuffles to get the lanes into the
15341/// position needed by the final shuffle.
15342static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
15343 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15344 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15345 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
15346
15347 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15348 return SDValue();
15349
15350 int NumElts = Mask.size();
15351 int NumLanes = VT.getSizeInBits() / 128;
15352 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15353 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
15354 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
15355
15356 // First pass will try to fill in the RepeatMask from lanes that need two
15357 // sources.
15358 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15359 int Srcs[2] = {-1, -1};
15360 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
15361 for (int i = 0; i != NumLaneElts; ++i) {
15362 int M = Mask[(Lane * NumLaneElts) + i];
15363 if (M < 0)
15364 continue;
15365 // Determine which of the possible input lanes (NumLanes from each source)
15366 // this element comes from. Assign that as one of the sources for this
15367 // lane. We can assign up to 2 sources for this lane. If we run out of
15368 // sources we can't do anything.
15369 int LaneSrc = M / NumLaneElts;
15370 int Src;
15371 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
15372 Src = 0;
15373 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
15374 Src = 1;
15375 else
15376 return SDValue();
15377
15378 Srcs[Src] = LaneSrc;
15379 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
15380 }
15381
15382 // If this lane has two sources, see if it fits with the repeat mask so far.
15383 if (Srcs[1] < 0)
15384 continue;
15385
15386 LaneSrcs[Lane][0] = Srcs[0];
15387 LaneSrcs[Lane][1] = Srcs[1];
15388
15389 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
15390 assert(M1.size() == M2.size() && "Unexpected mask size");
15391 for (int i = 0, e = M1.size(); i != e; ++i)
15392 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
15393 return false;
15394 return true;
15395 };
15396
15397 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
15398 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
15399 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
15400 int M = Mask[i];
15401 if (M < 0)
15402 continue;
15403 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
15404 "Unexpected mask element");
15405 MergedMask[i] = M;
15406 }
15407 };
15408
15409 if (MatchMasks(InLaneMask, RepeatMask)) {
15410 // Merge this lane mask into the final repeat mask.
15411 MergeMasks(InLaneMask, RepeatMask);
15412 continue;
15413 }
15414
15415 // Didn't find a match. Swap the operands and try again.
15416 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
15417 ShuffleVectorSDNode::commuteMask(InLaneMask);
15418
15419 if (MatchMasks(InLaneMask, RepeatMask)) {
15420 // Merge this lane mask into the final repeat mask.
15421 MergeMasks(InLaneMask, RepeatMask);
15422 continue;
15423 }
15424
15425 // Couldn't find a match with the operands in either order.
15426 return SDValue();
15427 }
15428
15429 // Now handle any lanes with only one source.
15430 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15431 // If this lane has already been processed, skip it.
15432 if (LaneSrcs[Lane][0] >= 0)
15433 continue;
15434
15435 for (int i = 0; i != NumLaneElts; ++i) {
15436 int M = Mask[(Lane * NumLaneElts) + i];
15437 if (M < 0)
15438 continue;
15439
15440 // If RepeatMask isn't defined yet we can define it ourselves.
15441 if (RepeatMask[i] < 0)
15442 RepeatMask[i] = M % NumLaneElts;
15443
15444 if (RepeatMask[i] < NumElts) {
15445 if (RepeatMask[i] != M % NumLaneElts)
15446 return SDValue();
15447 LaneSrcs[Lane][0] = M / NumLaneElts;
15448 } else {
15449 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
15450 return SDValue();
15451 LaneSrcs[Lane][1] = M / NumLaneElts;
15452 }
15453 }
15454
15455 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
15456 return SDValue();
15457 }
15458
15459 SmallVector<int, 16> NewMask(NumElts, -1);
15460 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15461 int Src = LaneSrcs[Lane][0];
15462 for (int i = 0; i != NumLaneElts; ++i) {
15463 int M = -1;
15464 if (Src >= 0)
15465 M = Src * NumLaneElts + i;
15466 NewMask[Lane * NumLaneElts + i] = M;
15467 }
15468 }
15469 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15470 // Ensure we didn't get back the shuffle we started with.
15471 // FIXME: This is a hack to make up for some splat handling code in
15472 // getVectorShuffle.
15473 if (isa<ShuffleVectorSDNode>(NewV1) &&
15474 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
15475 return SDValue();
15476
15477 for (int Lane = 0; Lane != NumLanes; ++Lane) {
15478 int Src = LaneSrcs[Lane][1];
15479 for (int i = 0; i != NumLaneElts; ++i) {
15480 int M = -1;
15481 if (Src >= 0)
15482 M = Src * NumLaneElts + i;
15483 NewMask[Lane * NumLaneElts + i] = M;
15484 }
15485 }
15486 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15487 // Ensure we didn't get back the shuffle we started with.
15488 // FIXME: This is a hack to make up for some splat handling code in
15489 // getVectorShuffle.
15490 if (isa<ShuffleVectorSDNode>(NewV2) &&
15491 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
15492 return SDValue();
15493
15494 for (int i = 0; i != NumElts; ++i) {
15495 if (Mask[i] < 0) {
15496 NewMask[i] = -1;
15497 continue;
15498 }
15499 NewMask[i] = RepeatMask[i % NumLaneElts];
15500 if (NewMask[i] < 0)
15501 continue;
15502
15503 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
15504 }
15505 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
15506}
15507
15508/// If the input shuffle mask results in a vector that is undefined in all upper
15509/// or lower half elements and that mask accesses only 2 halves of the
15510/// shuffle's operands, return true. A mask of half the width with mask indexes
15511/// adjusted to access the extracted halves of the original shuffle operands is
15512/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
15513/// lower half of each input operand is accessed.
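/// e.g. for a v8i32 mask <u,u,u,u,0,1,12,13> the lower half of the result is
/// undef, HalfMask becomes <0,1,4,5>, HalfIdx1 = 0 (lower half of V1) and
/// HalfIdx2 = 3 (upper half of V2).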
15514static bool
15515getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
15516 int &HalfIdx1, int &HalfIdx2) {
15517 assert((Mask.size() == HalfMask.size() * 2) &&
15518 "Expected input mask to be twice as long as output");
15519
15520 // Exactly one half of the result must be undef to allow narrowing.
15521 bool UndefLower = isUndefLowerHalf(Mask);
15522 bool UndefUpper = isUndefUpperHalf(Mask);
15523 if (UndefLower == UndefUpper)
15524 return false;
15525
15526 unsigned HalfNumElts = HalfMask.size();
15527 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
15528 HalfIdx1 = -1;
15529 HalfIdx2 = -1;
15530 for (unsigned i = 0; i != HalfNumElts; ++i) {
15531 int M = Mask[i + MaskIndexOffset];
15532 if (M < 0) {
15533 HalfMask[i] = M;
15534 continue;
15535 }
15536
15537 // Determine which of the 4 half vectors this element is from.
15538 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
15539 int HalfIdx = M / HalfNumElts;
15540
15541 // Determine the element index into its half vector source.
15542 int HalfElt = M % HalfNumElts;
15543
15544 // We can shuffle with up to 2 half vectors, set the new 'half'
15545 // shuffle mask accordingly.
15546 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
15547 HalfMask[i] = HalfElt;
15548 HalfIdx1 = HalfIdx;
15549 continue;
15550 }
15551 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
15552 HalfMask[i] = HalfElt + HalfNumElts;
15553 HalfIdx2 = HalfIdx;
15554 continue;
15555 }
15556
15557 // Too many half vectors referenced.
15558 return false;
15559 }
15560
15561 return true;
15562}
15563
15564/// Given the output values from getHalfShuffleMask(), create a half width
15565/// shuffle of extracted vectors followed by an insert back to full width.
15566static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
15567 ArrayRef<int> HalfMask, int HalfIdx1,
15568 int HalfIdx2, bool UndefLower,
15569 SelectionDAG &DAG, bool UseConcat = false) {
15570 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
15571 assert(V1.getValueType().isSimple() && "Expecting only simple types");
15572
15573 MVT VT = V1.getSimpleValueType();
15574 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15575 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15576
15577 auto getHalfVector = [&](int HalfIdx) {
15578 if (HalfIdx < 0)
15579 return DAG.getUNDEF(HalfVT);
15580 SDValue V = (HalfIdx < 2 ? V1 : V2);
15581 HalfIdx = (HalfIdx % 2) * HalfNumElts;
15582 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
15583 DAG.getIntPtrConstant(HalfIdx, DL));
15584 };
15585
15586 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
15587 SDValue Half1 = getHalfVector(HalfIdx1);
15588 SDValue Half2 = getHalfVector(HalfIdx2);
15589 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
15590 if (UseConcat) {
15591 SDValue Op0 = V;
15592 SDValue Op1 = DAG.getUNDEF(HalfVT);
15593 if (UndefLower)
15594 std::swap(Op0, Op1);
15595 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
15596 }
15597
15598 unsigned Offset = UndefLower ? HalfNumElts : 0;
15599 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
15600 DAG.getIntPtrConstant(Offset, DL));
15601}
15602
15603/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
15604/// This allows for fast cases such as subvector extraction/insertion
15605/// or shuffling smaller vector types which can lower more efficiently.
15606static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
15607 SDValue V2, ArrayRef<int> Mask,
15608 const X86Subtarget &Subtarget,
15609 SelectionDAG &DAG) {
15610 assert((VT.is256BitVector() || VT.is512BitVector()) &&
15611 "Expected 256-bit or 512-bit vector");
15612
15613 bool UndefLower = isUndefLowerHalf(Mask);
15614 if (!UndefLower && !isUndefUpperHalf(Mask))
15615 return SDValue();
15616
15617 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
15618 "Completely undef shuffle mask should have been simplified already");
15619
15620 // Upper half is undef and lower half is whole upper subvector.
15621 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
15622 MVT HalfVT = VT.getHalfNumVectorElementsVT();
15623 unsigned HalfNumElts = HalfVT.getVectorNumElements();
15624 if (!UndefLower &&
15625 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
15626 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15627 DAG.getIntPtrConstant(HalfNumElts, DL));
15628 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15629 DAG.getIntPtrConstant(0, DL));
15630 }
15631
15632 // Lower half is undef and upper half is whole lower subvector.
15633 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
15634 if (UndefLower &&
15635 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
15636 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
15637 DAG.getIntPtrConstant(0, DL));
15638 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
15639 DAG.getIntPtrConstant(HalfNumElts, DL));
15640 }
15641
15642 int HalfIdx1, HalfIdx2;
15643 SmallVector<int, 8> HalfMask(HalfNumElts);
15644 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
15645 return SDValue();
15646
15647 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
15648
15649 // Only shuffle the halves of the inputs when useful.
15650 unsigned NumLowerHalves =
15651 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
15652 unsigned NumUpperHalves =
15653 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
15654 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
15655
15656 // Determine the larger pattern of undef/halves, then decide if it's worth
15657 // splitting the shuffle based on subtarget capabilities and types.
15658 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
15659 if (!UndefLower) {
15660 // XXXXuuuu: no insert is needed.
15661 // Always extract lowers when setting lower - these are all free subreg ops.
15662 if (NumUpperHalves == 0)
15663 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15664 UndefLower, DAG);
15665
15666 if (NumUpperHalves == 1) {
15667 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
15668 if (Subtarget.hasAVX2()) {
15669 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
15670 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
15671 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
15672 (!isSingleSHUFPSMask(HalfMask) ||
15673 Subtarget.hasFastVariableCrossLaneShuffle()))
15674 return SDValue();
15675 // If this is a unary shuffle (assume that the 2nd operand is
15676 // canonicalized to undef), then we can use vpermpd. Otherwise, we
15677 // are better off extracting the upper half of 1 operand and using a
15678 // narrow shuffle.
15679 if (EltWidth == 64 && V2.isUndef())
15680 return SDValue();
15681 }
15682 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15683 if (Subtarget.hasAVX512() && VT.is512BitVector())
15684 return SDValue();
15685 // Extract + narrow shuffle is better than the wide alternative.
15686 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15687 UndefLower, DAG);
15688 }
15689
15690 // Don't extract both uppers, instead shuffle and then extract.
15691 assert(NumUpperHalves == 2 && "Half vector count went wrong");
15692 return SDValue();
15693 }
15694
15695 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
15696 if (NumUpperHalves == 0) {
15697 // AVX2 has efficient 64-bit element cross-lane shuffles.
15698 // TODO: Refine to account for unary shuffle, splat, and other masks?
15699 if (Subtarget.hasAVX2() && EltWidth == 64)
15700 return SDValue();
15701 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
15702 if (Subtarget.hasAVX512() && VT.is512BitVector())
15703 return SDValue();
15704 // Narrow shuffle + insert is better than the wide alternative.
15705 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
15706 UndefLower, DAG);
15707 }
15708
15709 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
15710 return SDValue();
15711}
15712
15713/// Handle case where shuffle sources are coming from the same 128-bit lane and
15714/// every lane can be represented as the same repeating mask - allowing us to
15715/// shuffle the sources with the repeating shuffle and then permute the result
15716/// to the destination lanes.
15717static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
15718 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
15719 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15720 int NumElts = VT.getVectorNumElements();
15721 int NumLanes = VT.getSizeInBits() / 128;
15722 int NumLaneElts = NumElts / NumLanes;
15723
15724 // On AVX2 we may be able to just shuffle the lowest elements and then
15725 // broadcast the result.
15726 if (Subtarget.hasAVX2()) {
15727 for (unsigned BroadcastSize : {16, 32, 64}) {
15728 if (BroadcastSize <= VT.getScalarSizeInBits())
15729 continue;
15730 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
15731
15732 // Attempt to match a repeating pattern every NumBroadcastElts,
15733 // accounting for UNDEFs, but only referencing the lowest 128-bit
15734 // lane of the inputs.
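// For example, the v8i32 mask <1,0,1,0,1,0,1,0> only references the lowest
// 128-bit lane, so we can shuffle <1,0> into the low 64 bits and then
// broadcast that 64-bit element.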
15735 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
15736 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15737 for (int j = 0; j != NumBroadcastElts; ++j) {
15738 int M = Mask[i + j];
15739 if (M < 0)
15740 continue;
15741 int &R = RepeatMask[j];
15742 if (0 != ((M % NumElts) / NumLaneElts))
15743 return false;
15744 if (0 <= R && R != M)
15745 return false;
15746 R = M;
15747 }
15748 return true;
15749 };
15750
15751 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
15752 if (!FindRepeatingBroadcastMask(RepeatMask))
15753 continue;
15754
15755 // Shuffle the (lowest) repeated elements in place for broadcast.
15756 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
15757
15758 // Shuffle the actual broadcast.
15759 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
15760 for (int i = 0; i != NumElts; i += NumBroadcastElts)
15761 for (int j = 0; j != NumBroadcastElts; ++j)
15762 BroadcastMask[i + j] = j;
15763
15764 // Avoid returning the same shuffle operation. For example,
15765 // v8i32 = vector_shuffle<0,1,0,1,0,1,0,1> t5, undef:v8i32
15766 if (BroadcastMask == Mask)
15767 return SDValue();
15768
15769 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
15770 BroadcastMask);
15771 }
15772 }
15773
15774 // Bail if the shuffle mask doesn't cross 128-bit lanes.
15775 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
15776 return SDValue();
15777
15778 // Bail if we already have a repeated lane shuffle mask.
15779 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15780 return SDValue();
15781
15782 // Helper to look for a repeated mask in each split sub-lane, and check that
15783 // those sub-lanes can then be permuted into place.
15784 auto ShuffleSubLanes = [&](int SubLaneScale) {
15785 int NumSubLanes = NumLanes * SubLaneScale;
15786 int NumSubLaneElts = NumLaneElts / SubLaneScale;
15787
15788 // Check that all the sources are coming from the same lane and see if we
15789 // can form a repeating shuffle mask (local to each sub-lane). At the same
15790 // time, determine the source sub-lane for each destination sub-lane.
15791 int TopSrcSubLane = -1;
15792 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
15793 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
15794 SubLaneScale,
15795 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
15796
15797 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
15798 // Extract the sub-lane mask, check that it all comes from the same lane
15799 // and normalize the mask entries to come from the first lane.
15800 int SrcLane = -1;
15801 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
15802 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15803 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
15804 if (M < 0)
15805 continue;
15806 int Lane = (M % NumElts) / NumLaneElts;
15807 if ((0 <= SrcLane) && (SrcLane != Lane))
15808 return SDValue();
15809 SrcLane = Lane;
15810 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
15811 SubLaneMask[Elt] = LocalM;
15812 }
15813
15814 // Whole sub-lane is UNDEF.
15815 if (SrcLane < 0)
15816 continue;
15817
15818 // Attempt to match against the candidate repeated sub-lane masks.
15819 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
15820 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
15821 for (int i = 0; i != NumSubLaneElts; ++i) {
15822 if (M1[i] < 0 || M2[i] < 0)
15823 continue;
15824 if (M1[i] != M2[i])
15825 return false;
15826 }
15827 return true;
15828 };
15829
15830 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
15831 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
15832 continue;
15833
15834 // Merge the sub-lane mask into the matching repeated sub-lane mask.
15835 for (int i = 0; i != NumSubLaneElts; ++i) {
15836 int M = SubLaneMask[i];
15837 if (M < 0)
15838 continue;
15839 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
15840 "Unexpected mask element");
15841 RepeatedSubLaneMask[i] = M;
15842 }
15843
15844 // Track the topmost source sub-lane - by setting the remaining to
15845 // UNDEF we can greatly simplify shuffle matching.
15846 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
15847 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
15848 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
15849 break;
15850 }
15851
15852 // Bail if we failed to find a matching repeated sub-lane mask.
15853 if (Dst2SrcSubLanes[DstSubLane] < 0)
15854 return SDValue();
15855 }
15856 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
15857 "Unexpected source lane");
15858
15859 // Create a repeating shuffle mask for the entire vector.
15860 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
15861 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
15862 int Lane = SubLane / SubLaneScale;
15863 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
15864 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
15865 int M = RepeatedSubLaneMask[Elt];
15866 if (M < 0)
15867 continue;
15868 int Idx = (SubLane * NumSubLaneElts) + Elt;
15869 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
15870 }
15871 }
15872
15873 // Shuffle each source sub-lane to its destination.
15874 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
15875 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
15876 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
15877 if (SrcSubLane < 0)
15878 continue;
15879 for (int j = 0; j != NumSubLaneElts; ++j)
15880 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
15881 }
15882
15883 // Avoid returning the same shuffle operation.
15884 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
15885 if (RepeatedMask == Mask || SubLaneMask == Mask)
15886 return SDValue();
15887
15888 SDValue RepeatedShuffle =
15889 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
15890
15891 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
15892 SubLaneMask);
15893 };
15894
15895 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
15896 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
15897 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
15898 // Otherwise we can only permute whole 128-bit lanes.
15899 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
15900 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
15901 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
15902 MinSubLaneScale = 2;
15903 MaxSubLaneScale =
15904 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
15905 }
15906 if (Subtarget.hasBWI() && VT == MVT::v64i8)
15907 MinSubLaneScale = MaxSubLaneScale = 4;
15908
15909 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
15910 if (SDValue Shuffle = ShuffleSubLanes(Scale))
15911 return Shuffle;
15912
15913 return SDValue();
15914}
15915
15916static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
15917 bool &ForceV1Zero, bool &ForceV2Zero,
15918 unsigned &ShuffleImm, ArrayRef<int> Mask,
15919 const APInt &Zeroable) {
15920 int NumElts = VT.getVectorNumElements();
15921 assert(VT.getScalarSizeInBits() == 64 &&
15922 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
15923 "Unexpected data type for VSHUFPD");
15924 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
15925 "Illegal shuffle mask");
15926
15927 bool ZeroLane[2] = { true, true };
15928 for (int i = 0; i < NumElts; ++i)
15929 ZeroLane[i & 1] &= Zeroable[i];
15930
15931 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
15932 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
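 // Illustrative example (editorial addition, not in the original source):
 // for v4f64 the mask <0,5,2,7> picks the low/high element of each pair
 // alternately from V1 and V2, giving SHUFPDMask <0,1,0,1> and an
 // immediate of 0b1010.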
15933 bool IsSHUFPD = true;
15934 bool IsCommutable = true;
15935 SmallVector<int, 8> SHUFPDMask(NumElts, -1);
15936 for (int i = 0; i < NumElts; ++i) {
15937 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
15938 continue;
15939 if (Mask[i] < 0)
15940 return false;
15941 int Val = (i & 6) + NumElts * (i & 1);
15942 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
15943 if (Mask[i] < Val || Mask[i] > Val + 1)
15944 IsSHUFPD = false;
15945 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
15946 IsCommutable = false;
15947 SHUFPDMask[i] = Mask[i] % 2;
15948 }
15949
15950 if (!IsSHUFPD && !IsCommutable)
15951 return false;
15952
15953 if (!IsSHUFPD && IsCommutable)
15954 std::swap(V1, V2);
15955
15956 ForceV1Zero = ZeroLane[0];
15957 ForceV2Zero = ZeroLane[1];
15958 ShuffleImm = getSHUFPDImm(SHUFPDMask);
15959 return true;
15960}
15961
15962static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
15963 SDValue V2, ArrayRef<int> Mask,
15964 const APInt &Zeroable,
15965 const X86Subtarget &Subtarget,
15966 SelectionDAG &DAG) {
15967 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
15968 "Unexpected data type for VSHUFPD");
15969
15970 unsigned Immediate = 0;
15971 bool ForceV1Zero = false, ForceV2Zero = false;
15972 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
15973 Mask, Zeroable))
15974 return SDValue();
15975
15976 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
15977 if (ForceV1Zero)
15978 V1 = getZeroVector(VT, Subtarget, DAG, DL);
15979 if (ForceV2Zero)
15980 V2 = getZeroVector(VT, Subtarget, DAG, DL);
15981
15982 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15983 DAG.getTargetConstant(Immediate, DL, MVT::i8));
15984}
15985
15986// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
15987// by zeroable elements in the remaining 24 elements. Turn this into two
15988// vmovqb instructions shuffled together.
15989static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
15990 SDValue V1, SDValue V2,
15991 ArrayRef<int> Mask,
15992 const APInt &Zeroable,
15993 SelectionDAG &DAG) {
15994 assert(VT == MVT::v32i8 && "Unexpected type!");
15995
15996 // The first 8 indices should be every 8th element.
15997 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
15998 return SDValue();
15999
16000 // Remaining elements need to be zeroable.
16001 if (Zeroable.countl_one() < (Mask.size() - 8))
16002 return SDValue();
16003
16004 V1 = DAG.getBitcast(MVT::v4i64, V1);
16005 V2 = DAG.getBitcast(MVT::v4i64, V2);
16006
16007 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
16008 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
16009
16010 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
16011 // the upper bits of the result using an unpckldq.
16012 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
16013 { 0, 1, 2, 3, 16, 17, 18, 19,
16014 4, 5, 6, 7, 20, 21, 22, 23 });
16015 // Insert the unpckldq into a zero vector to widen to v32i8.
16016 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
16017 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
16018 DAG.getIntPtrConstant(0, DL));
16019}
16020
16021// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
16022// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
16023// =>
16024// ul = unpckl v1, v2
16025// uh = unpckh v1, v2
16026// a = vperm ul, uh
16027// b = vperm ul, uh
16028//
16029// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
16030// and permute. We cannot directly match v3 because it is split into two
16031// 256-bit vectors in earlier isel stages. Therefore, this function matches a
16032// pair of 256-bit shuffles and makes sure the masks are consecutive.
16033//
16034// Once unpck and permute nodes are created, the permute corresponding to this
16035// shuffle is returned, while the other permute replaces the other half of the
16036// shuffle in the selection dag.
16037static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
16038 SDValue V1, SDValue V2,
16039 ArrayRef<int> Mask,
16040 SelectionDAG &DAG) {
16041 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
16042 VT != MVT::v32i8)
16043 return SDValue();
16044 // <B0, B1, B0+1, B1+1, ..., >
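 // Illustrative example (editorial addition, not in the original source):
 // for v8i32 the lower-half interleave is <0,8,1,9,2,10,3,11> (Begin0 = 0,
 // Begin1 = 8) and the upper-half interleave is <4,12,5,13,6,14,7,15>
 // (Begin0 = 4, Begin1 = 12).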
16045 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
16046 unsigned Begin1) {
16047 size_t Size = Mask.size();
16048 assert(Size % 2 == 0 && "Expected even mask size");
16049 for (unsigned I = 0; I < Size; I += 2) {
16050 if (Mask[I] != (int)(Begin0 + I / 2) ||
16051 Mask[I + 1] != (int)(Begin1 + I / 2))
16052 return false;
16053 }
16054 return true;
16055 };
16056 // Check which half of the interleave this shuffle node produces.
16057 int NumElts = VT.getVectorNumElements();
16058 size_t FirstQtr = NumElts / 2;
16059 size_t ThirdQtr = NumElts + NumElts / 2;
16060 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
16061 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
16062 if (!IsFirstHalf && !IsSecondHalf)
16063 return SDValue();
16064
16065 // Find the intersection between shuffle users of V1 and V2.
16066 SmallVector<SDNode *, 2> Shuffles;
16067 for (SDNode *User : V1->users())
16068 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
16069 User->getOperand(1) == V2)
16070 Shuffles.push_back(User);
16071 // Limit user size to two for now.
16072 if (Shuffles.size() != 2)
16073 return SDValue();
16074 // Find out which half of the 512-bit shuffle each smaller shuffle produces.
16075 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
16076 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
16077 SDNode *FirstHalf;
16078 SDNode *SecondHalf;
16079 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
16080 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
16081 FirstHalf = Shuffles[0];
16082 SecondHalf = Shuffles[1];
16083 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
16084 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
16085 FirstHalf = Shuffles[1];
16086 SecondHalf = Shuffles[0];
16087 } else {
16088 return SDValue();
16089 }
16090 // Lower into unpck and perm. Return the perm of this shuffle and replace
16091 // the other.
16092 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
16093 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
16094 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16095 DAG.getTargetConstant(0x20, DL, MVT::i8));
16096 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
16097 DAG.getTargetConstant(0x31, DL, MVT::i8));
16098 if (IsFirstHalf) {
16099 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
16100 return Perm1;
16101 }
16102 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
16103 return Perm2;
16104}
16105
16106/// Handle lowering of 4-lane 64-bit floating point shuffles.
16107///
16108/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
16109/// isn't available.
16110static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16111 const APInt &Zeroable, SDValue V1, SDValue V2,
16112 const X86Subtarget &Subtarget,
16113 SelectionDAG &DAG) {
16114 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16115 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
16116 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16117
16118 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16119 Subtarget, DAG))
16120 return V;
16121
16122 if (V2.isUndef()) {
16123 // Check for being able to broadcast a single element.
16124 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
16125 Mask, Subtarget, DAG))
16126 return Broadcast;
16127
16128 // Use low duplicate instructions for masks that match their pattern.
16129 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
16130 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
16131
16132 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
16133 // Non-half-crossing single input shuffles can be lowered with an
16134 // interleaved permutation.
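 // Illustrative example (editorial addition, not in the original source):
 // the v4f64 mask <1,0,3,2> swaps the elements within each 128-bit lane
 // and encodes as the VPERMILPD immediate 0b0101.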
16135 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
16136 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
16137 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
16138 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
16139 }
16140
16141 // With AVX2 we have direct support for this permutation.
16142 if (Subtarget.hasAVX2())
16143 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
16144 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16145
16146 // Try to create an in-lane repeating shuffle mask and then shuffle the
16147 // results into the target lanes.
16148 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16149 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16150 return V;
16151
16152 // Try to permute the lanes and then use a per-lane permute.
16153 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
16154 Mask, DAG, Subtarget))
16155 return V;
16156
16157 // Otherwise, fall back.
16158 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
16159 DAG, Subtarget);
16160 }
16161
16162 // Use dedicated unpack instructions for masks that match their pattern.
16163 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, V1, V2, Mask, DAG))
16164 return V;
16165
16166 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
16167 Zeroable, Subtarget, DAG))
16168 return Blend;
16169
16170 // Check if the blend happens to exactly fit that of SHUFPD.
16171 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
16172 Zeroable, Subtarget, DAG))
16173 return Op;
16174
16175 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16176 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16177
16178 // If we have lane crossing shuffles AND they don't all come from the lower
16179 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16180 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
16181 // canonicalize to a blend of splat which isn't necessary for this combine.
16182 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
16183 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
16184 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
16185 (V2.getOpcode() != ISD::BUILD_VECTOR))
16186 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
16187
16188 // If we have one input in place, then we can permute the other input and
16189 // blend the result.
16190 if (V1IsInPlace || V2IsInPlace)
16191 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16192 Zeroable, Subtarget, DAG);
16193
16194 // Try to create an in-lane repeating shuffle mask and then shuffle the
16195 // results into the target lanes.
16196 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16197 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16198 return V;
16199
16200 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16201 // shuffle. However, if we have AVX2 and either input is already in place,
16202 // we will be able to shuffle the other input even across lanes in a single
16203 // instruction, so skip this pattern.
16204 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
16205 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
16206 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
16207 return V;
16208
16209 // If we have VLX support, we can use VEXPAND.
16210 if (Subtarget.hasVLX())
16211 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
16212 Zeroable, Subtarget, DAG))
16213 return V;
16214
16215 // If we have AVX2 then we always want to lower with a blend because at v4 we
16216 // can fully permute the elements.
16217 if (Subtarget.hasAVX2())
16218 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
16219 Zeroable, Subtarget, DAG);
16220
16221 // Otherwise fall back on generic lowering.
16222 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
16223 Subtarget, DAG);
16224}
16225
16226/// Handle lowering of 4-lane 64-bit integer shuffles.
16227///
16228/// This routine is only called when we have AVX2 and thus a reasonable
16229/// instruction set for v4i64 shuffling.
16230static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16231 const APInt &Zeroable, SDValue V1, SDValue V2,
16232 const X86Subtarget &Subtarget,
16233 SelectionDAG &DAG) {
16234 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16235 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
16236 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
16237 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
16238
16239 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16240 Subtarget, DAG))
16241 return V;
16242
16243 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
16244 Zeroable, Subtarget, DAG))
16245 return Blend;
16246
16247 // Check for being able to broadcast a single element.
16248 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
16249 Subtarget, DAG))
16250 return Broadcast;
16251
16252 // Try to use shift instructions if fast.
16253 if (Subtarget.preferLowerShuffleAsShift())
16254 if (SDValue Shift =
16255 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG, /*BitwiseOnly*/ true))
16257 return Shift;
16258
16259 if (V2.isUndef()) {
16260 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
16261 // can use lower latency instructions that will operate on both lanes.
16262 SmallVector<int, 2> RepeatedMask;
16263 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
16264 SmallVector<int, 4> PSHUFDMask;
16265 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
16266 return DAG.getBitcast(
16267 MVT::v4i64,
16268 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
16269 DAG.getBitcast(MVT::v8i32, V1),
16270 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16271 }
16272
16273 // AVX2 provides a direct instruction for permuting a single input across
16274 // lanes.
16275 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
16276 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
16277 }
16278
16279 // Try to use shift instructions.
16280 if (SDValue Shift =
16281 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
16282 DAG, /*BitwiseOnly*/ false))
16283 return Shift;
16284
16285 // If we have VLX support, we can use VALIGN or VEXPAND.
16286 if (Subtarget.hasVLX()) {
16287 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
16288 Zeroable, Subtarget, DAG))
16289 return Rotate;
16290
16291 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
16292 Zeroable, Subtarget, DAG))
16293 return V;
16294 }
16295
16296 // Try to use PALIGNR.
16297 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
16298 Subtarget, DAG))
16299 return Rotate;
16300
16301 // Use dedicated unpack instructions for masks that match their pattern.
16302 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, V1, V2, Mask, DAG))
16303 return V;
16304
16305 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
16306 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
16307
16308 // If we have one input in place, then we can permute the other input and
16309 // blend the result.
16310 if (V1IsInPlace || V2IsInPlace)
16311 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16312 Zeroable, Subtarget, DAG);
16313
16314 // Try to create an in-lane repeating shuffle mask and then shuffle the
16315 // results into the target lanes.
16316 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16317 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16318 return V;
16319
16320 // Try to lower to PERMQ(BLENDD(V1,V2)).
16321 if (SDValue V =
16322 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
16323 return V;
16324
16325 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16326 // shuffle. However, if we have AVX2 and either input is already in place,
16327 // we will be able to shuffle the other input even across lanes in a single
16328 // instruction, so skip this pattern.
16329 if (!V1IsInPlace && !V2IsInPlace)
16330 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16331 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
16332 return Result;
16333
16334 // Otherwise fall back on generic blend lowering.
16335 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
16336 Zeroable, Subtarget, DAG);
16337}
16338
16339/// Handle lowering of 8-lane 32-bit floating point shuffles.
16340///
16341/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
16342/// isn't available.
16343static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16344 const APInt &Zeroable, SDValue V1, SDValue V2,
16345 const X86Subtarget &Subtarget,
16346 SelectionDAG &DAG) {
16347 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16348 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
16349 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16350
16351 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
16352 Zeroable, Subtarget, DAG))
16353 return Blend;
16354
16355 // Check for being able to broadcast a single element.
16356 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
16357 Subtarget, DAG))
16358 return Broadcast;
16359
16360 if (!Subtarget.hasAVX2()) {
16361 SmallVector<int> InLaneMask;
16362 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
16363
16364 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
16365 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
16366 /*SimpleOnly*/ true))
16367 return R;
16368 }
16369 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16370 Zeroable, Subtarget, DAG))
16371 return DAG.getBitcast(MVT::v8f32, ZExt);
16372
16373 // If the shuffle mask is repeated in each 128-bit lane, we have many more
16374 // options to efficiently lower the shuffle.
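 // Illustrative example (editorial addition, not in the original source):
 // the v8f32 mask <1,0,3,2,5,4,7,6> repeats <1,0,3,2> in both 128-bit
 // lanes, so it can be handled by a single in-lane VPERMILPS.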
16375 SmallVector<int, 4> RepeatedMask;
16376 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
16377 assert(RepeatedMask.size() == 4 &&
16378 "Repeated masks must be half the mask width!");
16379
16380 // Use even/odd duplicate instructions for masks that match their pattern.
16381 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
16382 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
16383 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
16384 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
16385
16386 if (V2.isUndef())
16387 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
16388 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16389
16390 // Use dedicated unpack instructions for masks that match their pattern.
16391 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, V1, V2, Mask, DAG))
16392 return V;
16393
16394 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
16395 // have already handled any direct blends.
16396 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
16397 }
16398
16399 // Try to create an in-lane repeating shuffle mask and then shuffle the
16400 // results into the target lanes.
16401 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16402 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16403 return V;
16404
16405 // If we have a single input shuffle with different shuffle patterns in the
16406 // two 128-bit lanes, use a variable mask with VPERMILPS.
16407 if (V2.isUndef()) {
16408 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
16409 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16410 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
16411 }
16412 if (Subtarget.hasAVX2()) {
16413 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16414 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
16415 }
16416 // Otherwise, fall back.
16417 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
16418 DAG, Subtarget);
16419 }
16420
16421 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16422 // shuffle.
16423 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16424 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
16425 return Result;
16426
16427 // If we have VLX support, we can use VEXPAND.
16428 if (Subtarget.hasVLX())
16429 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return V;
16432
16433 // Try to match an interleave of two v8f32s and lower them as unpck and
16434 // permutes using ymms. This needs to go before we try to split the vectors.
16435 //
16436 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
16437 // this path inadvertently.
16438 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
16439 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
16440 Mask, DAG))
16441 return V;
16442
16443 // For non-AVX512, if the mask consists of 16-bit elements within each lane,
16444 // try to split, since after splitting we get more efficient code using
16445 // vpunpcklwd and vpunpckhwd instead of vblend.
16446 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
16447 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16448 Subtarget, DAG);
16449
16450 // If we have AVX2 then we always want to lower with a blend because at v8 we
16451 // can fully permute the elements.
16452 if (Subtarget.hasAVX2())
16453 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
16454 Zeroable, Subtarget, DAG);
16455
16456 // Otherwise fall back on generic lowering.
16457 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Zeroable,
16458 Subtarget, DAG);
16459}
16460
16461/// Handle lowering of 8-lane 32-bit integer shuffles.
16462///
16463/// This routine is only called when we have AVX2 and thus a reasonable
16464/// instruction set for v8i32 shuffling.
16465static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16466 const APInt &Zeroable, SDValue V1, SDValue V2,
16467 const X86Subtarget &Subtarget,
16468 SelectionDAG &DAG) {
16469 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16470 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
16471 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16472 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
16473
16474 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16475
16476 // Whenever we can lower this as a zext, that instruction is strictly faster
16477 // than any alternative. It also allows us to fold memory operands into the
16478 // shuffle in many cases.
16479 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
16480 Zeroable, Subtarget, DAG))
16481 return ZExt;
16482
16483 // Try to match an interleave of two v8i32s and lower them as unpck and
16484 // permutes using ymms. This needs to go before we try to split the vectors.
16485 if (!Subtarget.hasAVX512())
16486 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
16487 Mask, DAG))
16488 return V;
16489
16490 // For non-AVX512, if the mask consists of 16-bit elements within each lane,
16491 // try to split, since after splitting we get more efficient code using
16492 // vpunpcklwd and vpunpckhwd instead of vblend.
16493 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
16494 !Subtarget.hasAVX512())
16495 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16496 Subtarget, DAG);
16497
16498 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
16499 Zeroable, Subtarget, DAG))
16500 return Blend;
16501
16502 // Check for being able to broadcast a single element.
16503 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
16504 Subtarget, DAG))
16505 return Broadcast;
16506
16507 // Try to use shift instructions if fast.
16508 if (Subtarget.preferLowerShuffleAsShift()) {
16509 if (SDValue Shift =
16510 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
16511 Subtarget, DAG, /*BitwiseOnly*/ true))
16512 return Shift;
16513 if (NumV2Elements == 0)
16514 if (SDValue Rotate =
16515 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16516 return Rotate;
16517 }
16518
16519 // If the shuffle mask is repeated in each 128-bit lane we can use more
16520 // efficient instructions that mirror the shuffles across the two 128-bit
16521 // lanes.
16522 SmallVector<int, 4> RepeatedMask;
16523 bool Is128BitLaneRepeatedShuffle =
16524 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
16525 if (Is128BitLaneRepeatedShuffle) {
16526 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
16527 if (V2.isUndef())
16528 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
16529 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
16530
16531 // Use dedicated unpack instructions for masks that match their pattern.
16532 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, V1, V2, Mask, DAG))
16533 return V;
16534 }
16535
16536 // Try to use shift instructions.
16537 if (SDValue Shift =
16538 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
16539 DAG, /*BitwiseOnly*/ false))
16540 return Shift;
16541
16542 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
16543 if (SDValue Rotate =
16544 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
16545 return Rotate;
16546
16547 // If we have VLX support, we can use VALIGN or EXPAND.
16548 if (Subtarget.hasVLX()) {
16549 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
16550 Zeroable, Subtarget, DAG))
16551 return Rotate;
16552
16553 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
16554 Zeroable, Subtarget, DAG))
16555 return V;
16556 }
16557
16558 // Try to use byte rotation instructions.
16559 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
16560 Subtarget, DAG))
16561 return Rotate;
16562
16563 // Try to create an in-lane repeating shuffle mask and then shuffle the
16564 // results into the target lanes.
16565 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16566 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16567 return V;
16568
16569 if (V2.isUndef()) {
16570 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16571 // because that should be faster than the variable permute alternatives.
16572 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, V1, V2, Mask, DAG))
16573 return V;
16574
16575 // If the shuffle patterns aren't repeated but it's a single input, directly
16576 // generate a cross-lane VPERMD instruction.
16577 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
16578 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
16579 }
16580
16581 // Assume that a single SHUFPS is faster than an alternative sequence of
16582 // multiple instructions (even if the CPU has a domain penalty).
16583 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
16584 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
16585 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
16586 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
16587 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
16588 CastV1, CastV2, DAG);
16589 return DAG.getBitcast(MVT::v8i32, ShufPS);
16590 }
16591
16592 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16593 // shuffle.
16594 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16595 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
16596 return Result;
16597
16598 // Otherwise fall back on generic blend lowering.
16599 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
16600 Zeroable, Subtarget, DAG);
16601}
16602
16603/// Handle lowering of 16-lane 16-bit integer shuffles.
16604///
16605/// This routine is only called when we have AVX2 and thus a reasonable
16606/// instruction set for v16i16 shuffling.
16607static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16608 const APInt &Zeroable, SDValue V1, SDValue V2,
16609 const X86Subtarget &Subtarget,
16610 SelectionDAG &DAG) {
16611 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16612 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
16613 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16614 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
16615
16616 // Whenever we can lower this as a zext, that instruction is strictly faster
16617 // than any alternative. It also allows us to fold memory operands into the
16618 // shuffle in many cases.
16619 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
16620 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16621 return ZExt;
16622
16623 // Check for being able to broadcast a single element.
16624 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
16625 Subtarget, DAG))
16626 return Broadcast;
16627
16628 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
16629 Zeroable, Subtarget, DAG))
16630 return Blend;
16631
16632 // Use dedicated unpack instructions for masks that match their pattern.
16633 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, V1, V2, Mask, DAG))
16634 return V;
16635
16636 // Use dedicated pack instructions for masks that match their pattern.
16637 if (SDValue V =
16638 lowerShuffleWithPACK(DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16639 return V;
16640
16641 // Try to lower using a truncation.
16642 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16643 Subtarget, DAG))
16644 return V;
16645
16646 // Try to use shift instructions.
16647 if (SDValue Shift =
16648 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16649 Subtarget, DAG, /*BitwiseOnly*/ false))
16650 return Shift;
16651
16652 // Try to use byte rotation instructions.
16653 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
16654 Subtarget, DAG))
16655 return Rotate;
16656
16657 // Try to create an in-lane repeating shuffle mask and then shuffle the
16658 // results into the target lanes.
16659 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16660 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16661 return V;
16662
16663 if (V2.isUndef()) {
16664 // Try to use bit rotation instructions.
16665 if (SDValue Rotate =
16666 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
16667 return Rotate;
16668
16669 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16670 // because that should be faster than the variable permute alternatives.
16671 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, V1, V2, Mask, DAG))
16672 return V;
16673
16674 // There are no generalized cross-lane shuffle operations available on i16
16675 // element types.
16676 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
16677 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16678 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16679 return V;
16680
16681 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
16682 DAG, Subtarget);
16683 }
16684
16685 SmallVector<int, 8> RepeatedMask;
16686 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
16687 // As this is a single-input shuffle, the repeated mask should be
16688 // a strictly valid v8i16 mask that we can pass through to the v8i16
16689 // lowering to handle even the v16 case.
16690 return lowerV8I16GeneralSingleInputShuffle(
16691 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
16692 }
16693 }
16694
16695 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
16696 Zeroable, Subtarget, DAG))
16697 return PSHUFB;
16698
16699 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
16700 if (Subtarget.hasBWI())
16701 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
16702
16703 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16704 // shuffle.
16705 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16706 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
16707 return Result;
16708
16709 // Try to permute the lanes and then use a per-lane permute.
16710 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16711 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
16712 return V;
16713
16714 // Try to match an interleave of two v16i16s and lower them as unpck and
16715 // permutes using ymms.
16716 if (!Subtarget.hasAVX512())
16717 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
16718 Mask, DAG))
16719 return V;
16720
16721 // Otherwise fall back on generic lowering.
16722 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
16723 Subtarget, DAG);
16724}
16725
16726/// Handle lowering of 32-lane 8-bit integer shuffles.
16727///
16728/// This routine is only called when we have AVX2 and thus a reasonable
16729/// instruction set for v32i8 shuffling.
16730static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16731 const APInt &Zeroable, SDValue V1, SDValue V2,
16732 const X86Subtarget &Subtarget,
16733 SelectionDAG &DAG) {
16734 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16735 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
16736 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
16737 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
16738
16739 // Whenever we can lower this as a zext, that instruction is strictly faster
16740 // than any alternative. It also allows us to fold memory operands into the
16741 // shuffle in many cases.
16742 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
16743 Zeroable, Subtarget, DAG))
16744 return ZExt;
16745
16746 // Check for being able to broadcast a single element.
16747 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
16748 Subtarget, DAG))
16749 return Broadcast;
16750
16751 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
16752 Zeroable, Subtarget, DAG))
16753 return Blend;
16754
16755 // Use dedicated unpack instructions for masks that match their pattern.
16756 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, V1, V2, Mask, DAG))
16757 return V;
16758
16759 // Use dedicated pack instructions for masks that match their pattern.
16760 if (SDValue V =
16761 lowerShuffleWithPACK(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16762 return V;
16763
16764 // Try to lower using a truncation.
16765 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16766 Subtarget, DAG))
16767 return V;
16768
16769 // Try to use shift instructions.
16770 if (SDValue Shift =
16771 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
16772 DAG, /*BitwiseOnly*/ false))
16773 return Shift;
16774
16775 // Try to use byte rotation instructions.
16776 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
16777 Subtarget, DAG))
16778 return Rotate;
16779
16780 // Try to use bit rotation instructions.
16781 if (V2.isUndef())
16782 if (SDValue Rotate =
16783 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
16784 return Rotate;
16785
16786 // Try to create an in-lane repeating shuffle mask and then shuffle the
16787 // results into the target lanes.
16788 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
16789 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16790 return V;
16791
16792 // There are no generalized cross-lane shuffle operations available on i8
16793 // element types.
16794 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
16795 // Try to produce a fixed cross-128-bit lane permute followed by unpack
16796 // because that should be faster than the variable permute alternatives.
16797 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, V1, V2, Mask, DAG))
16798 return V;
16799
16800 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16801 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16802 return V;
16803
16804 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
16805 DAG, Subtarget);
16806 }
16807
16808 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
16809 Zeroable, Subtarget, DAG))
16810 return PSHUFB;
16811
16812 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16813 if (Subtarget.hasVBMI())
16814 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
16815
16816 // Try to simplify this by merging 128-bit lanes to enable a lane-based
16817 // shuffle.
16818 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
16819 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
16820 return Result;
16821
16822 // Try to permute the lanes and then use a per-lane permute.
16823 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
16824 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
16825 return V;
16826
16827 // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
16828 // by zeroable elements in the remaining 24 elements. Turn this into two
16829 // vmovqb instructions shuffled together.
16830 if (Subtarget.hasVLX())
16831 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
16832 Mask, Zeroable, DAG))
16833 return V;
16834
16835 // Try to match an interleave of two v32i8s and lower them as unpck and
16836 // permutes using ymms.
16837 if (!Subtarget.hasAVX512())
16838 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
16839 Mask, DAG))
16840 return V;
16841
16842 // Otherwise fall back on generic lowering.
16843 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
16844 Subtarget, DAG);
16845}
16846
16847/// High-level routine to lower various 256-bit x86 vector shuffles.
16848///
16849/// This routine either breaks down the specific type of a 256-bit x86 vector
16850/// shuffle or splits it into two 128-bit shuffles and fuses the results back
16851/// together based on the available instructions.
16852static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
16853 SDValue V1, SDValue V2, const APInt &Zeroable,
16854 const X86Subtarget &Subtarget,
16855 SelectionDAG &DAG) {
16856 // If we have a single input to the zero element, insert that into V1 if we
16857 // can do so cheaply.
16858 int NumElts = VT.getVectorNumElements();
16859 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
16860
16861 if (NumV2Elements == 1 && Mask[0] >= NumElts)
16862 if (SDValue Insertion = lowerShuffleAsElementInsertion(
16863 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
16864 return Insertion;
16865
16866 // Handle special cases where the lower or upper half is UNDEF.
16867 if (SDValue V =
16868 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
16869 return V;
16870
16871 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
16872 // can check for those subtargets here and avoid much of the subtarget
16873 // querying in the per-vector-type lowering routines. With AVX1 we have
16874 // essentially *zero* ability to manipulate a 256-bit vector with integer
16875 // types. Since we'll use floating point types there eventually, just
16876 // immediately cast everything to a float and operate entirely in that domain.
16877 if (VT.isInteger() && !Subtarget.hasAVX2()) {
16878 int ElementBits = VT.getScalarSizeInBits();
16879 if (ElementBits < 32) {
16880 // No floating point type available, if we can't use the bit operations
16881 // for masking/blending then decompose into 128-bit vectors.
16882 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
16883 Subtarget, DAG))
16884 return V;
16885 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
16886 return V;
16887 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
16888 }
16889
16890 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
16891 VT.getVectorNumElements());
16892 V1 = DAG.getBitcast(FpVT, V1);
16893 V2 = DAG.getBitcast(FpVT, V2);
16894 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
16895 }
16896
16897 if (VT == MVT::v16f16 || VT == MVT::v16bf16) {
16898 V1 = DAG.getBitcast(MVT::v16i16, V1);
16899 V2 = DAG.getBitcast(MVT::v16i16, V2);
16900 return DAG.getBitcast(VT,
16901 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
16902 }
16903
16904 switch (VT.SimpleTy) {
16905 case MVT::v4f64:
16906 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16907 case MVT::v4i64:
16908 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16909 case MVT::v8f32:
16910 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16911 case MVT::v8i32:
16912 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16913 case MVT::v16i16:
16914 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16915 case MVT::v32i8:
16916 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16917
16918 default:
16919 llvm_unreachable("Not a valid 256-bit x86 vector type!");
16920 }
16921}
16922
16923/// Try to lower a vector shuffle as 128-bit shuffles.
16924static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
16925 const APInt &Zeroable, SDValue V1, SDValue V2,
16926 const X86Subtarget &Subtarget,
16927 SelectionDAG &DAG) {
16928 assert(VT.getScalarSizeInBits() == 64 &&
16929 "Unexpected element type size for 128bit shuffle.");
16930
16931 // Handling a 256-bit vector would require VLX, and the lowerV2X128Shuffle()
16932 // routine is most probably a better solution for that case.
16933 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
16934
16935 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
16936 SmallVector<int, 4> Widened128Mask;
16937 if (!canWidenShuffleElements(Mask, Widened128Mask))
16938 return SDValue();
16939 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
16940
16941 // Try to use an insert into a zero vector.
16942 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
16943 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
16944 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
16945 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16946 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
16947 DAG.getIntPtrConstant(0, DL));
16948 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
16949 getZeroVector(VT, Subtarget, DAG, DL), LoV,
16950 DAG.getIntPtrConstant(0, DL));
16951 }
16952
16953 // Check for patterns which can be matched with a single insert of a 256-bit
16954 // subvector.
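 // Illustrative example (editorial addition, not in the original source):
 // the v8i64 mask <0,1,2,3,8,9,10,11> keeps the low 256 bits of V1 and
 // inserts the low 256 bits of V2 into the upper half.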
16955 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
16956 if (OnlyUsesV1 ||
16957 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
16958 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
16959 SDValue SubVec =
16960 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
16961 DAG.getIntPtrConstant(0, DL));
16962 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
16963 DAG.getIntPtrConstant(4, DL));
16964 }
16965
16966 // See if this is an insertion of the lower 128-bits of V2 into V1.
16967 bool IsInsert = true;
16968 int V2Index = -1;
16969 for (int i = 0; i < 4; ++i) {
16970 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
16971 if (Widened128Mask[i] < 0)
16972 continue;
16973
16974 // Make sure all V1 subvectors are in place.
16975 if (Widened128Mask[i] < 4) {
16976 if (Widened128Mask[i] != i) {
16977 IsInsert = false;
16978 break;
16979 }
16980 } else {
16981 // Make sure we only have a single V2 index and it's the lowest 128-bits.
16982 if (V2Index >= 0 || Widened128Mask[i] != 4) {
16983 IsInsert = false;
16984 break;
16985 }
16986 V2Index = i;
16987 }
16988 }
16989 if (IsInsert && V2Index >= 0) {
16990 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
16991 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
16992 DAG.getIntPtrConstant(0, DL));
16993 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
16994 }
16995
16996 // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-lane
16997 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
16998 // possible we at least ensure the lanes stay sequential to help later
16999 // combines.
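 // Illustrative example (editorial addition, not in the original source):
 // the 128-bit lane mask <0,1,4,-1> widens to the 256-bit lane mask <0,2>
 // and narrows back to <0,1,4,5>, turning the trailing undef lane into a
 // sequential one.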
17000 SmallVector<int, 2> Widened256Mask;
17001 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
17002 Widened128Mask.clear();
17003 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
17004 }
17005
17006 // Try to lower to vshuf64x2/vshuf32x4.
17007 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
17008 int PermMask[4] = {-1, -1, -1, -1};
17009 // Ensure elements came from the same Op.
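 // Illustrative example (editorial addition, not in the original source):
 // the 128-bit lane mask <1,0,7,6> takes lanes 0-1 from V1 and lanes 2-3
 // from V2, giving Ops = {V1, V2} and PermMask = <1,0,3,2>.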
17010 for (int i = 0; i < 4; ++i) {
17011 assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
17012 if (Widened128Mask[i] < 0)
17013 continue;
17014
17015 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
17016 unsigned OpIndex = i / 2;
17017 if (Ops[OpIndex].isUndef())
17018 Ops[OpIndex] = Op;
17019 else if (Ops[OpIndex] != Op)
17020 return SDValue();
17021
17022 PermMask[i] = Widened128Mask[i] % 4;
17023 }
17024
17025 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
17026 getV4X86ShuffleImm8ForMask(PermMask, DL, DAG));
17027}
17028
17029/// Handle lowering of 8-lane 64-bit floating point shuffles.
17030static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17031 const APInt &Zeroable, SDValue V1, SDValue V2,
17032 const X86Subtarget &Subtarget,
17033 SelectionDAG &DAG) {
17034 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17035 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
17036 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17037
17038 if (V2.isUndef()) {
17039 // Use low duplicate instructions for masks that match their pattern.
17040 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
17041 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
17042
17043 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
17044 // Non-half-crossing single input shuffles can be lowered with an
17045 // interleaved permutation.
17046 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
17047 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
17048 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
17049 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
17050 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
17051 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
17052 }
17053
17054 SmallVector<int, 4> RepeatedMask;
17055 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
17056 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
17057 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17058 }
17059
17060 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
17061 V2, Subtarget, DAG))
17062 return Shuf128;
17063
17064 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, V1, V2, Mask, DAG))
17065 return Unpck;
17066
17067 // Check if the blend happens to exactly fit that of SHUFPD.
17068 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
17069 Zeroable, Subtarget, DAG))
17070 return Op;
17071
17072 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
17073 Subtarget, DAG))
17074 return V;
17075
17076 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
17077 Zeroable, Subtarget, DAG))
17078 return Blend;
17079
17080 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
17081}
17082
17083/// Handle lowering of 16-lane 32-bit floating point shuffles.
17084static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17085 const APInt &Zeroable, SDValue V1, SDValue V2,
17086 const X86Subtarget &Subtarget,
17087 SelectionDAG &DAG) {
17088 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17089 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
17090 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17091
17092 // If the shuffle mask is repeated in each 128-bit lane, we have many more
17093 // options to efficiently lower the shuffle.
17094 SmallVector<int, 4> RepeatedMask;
17095 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
17096 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17097
17098 // Use even/odd duplicate instructions for masks that match their pattern.
17099 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
17100 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
17101 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
17102 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
17103
17104 if (V2.isUndef())
17105 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
17106 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17107
17108 // Use dedicated unpack instructions for masks that match their pattern.
17109 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, V1, V2, Mask, DAG))
17110 return V;
17111
17112 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17113 Zeroable, Subtarget, DAG))
17114 return Blend;
17115
17116 // Otherwise, fall back to a SHUFPS sequence.
17117 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
17118 }
17119
17120 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
17121 Zeroable, Subtarget, DAG))
17122 return Blend;
17123
17124 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17125 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17126 return DAG.getBitcast(MVT::v16f32, ZExt);
17127
17128 // Try to create an in-lane repeating shuffle mask and then shuffle the
17129 // results into the target lanes.
17130 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17131 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
17132 return V;
17133
17134 // If we have a single input shuffle with different shuffle patterns in the
17135 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
17136 if (V2.isUndef() &&
17137 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
17138 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
17139 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
17140 }
17141
17142 // If we have AVX512F support, we can use VEXPAND.
17143 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
17144 Zeroable, Subtarget, DAG))
17145 return V;
17146
17147 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
17148}
17149
17150/// Handle lowering of 8-lane 64-bit integer shuffles.
17151static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17152 const APInt &Zeroable, SDValue V1, SDValue V2,
17153 const X86Subtarget &Subtarget,
17154 SelectionDAG &DAG) {
17155 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17156 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
17157 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
17158
17159 // Try to use shift instructions if fast.
17160 if (Subtarget.preferLowerShuffleAsShift())
17161 if (SDValue Shift =
17162 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17163 Subtarget, DAG, /*BitwiseOnly*/ true))
17164 return Shift;
17165
17166 if (V2.isUndef()) {
17167 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
17168 // can use lower latency instructions that will operate on all four
17169 // 128-bit lanes.
17170 SmallVector<int, 2> Repeated128Mask;
17171 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
17172 SmallVector<int, 4> PSHUFDMask;
17173 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
17174 return DAG.getBitcast(
17175 MVT::v8i64,
17176 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
17177 DAG.getBitcast(MVT::v16i32, V1),
17178 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
17179 }
17180
17181 SmallVector<int, 4> Repeated256Mask;
17182 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
17183 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
17184 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
17185 }
17186
17187 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
17188 V2, Subtarget, DAG))
17189 return Shuf128;
17190
17191 // Try to use shift instructions.
17192 if (SDValue Shift =
17193 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
17194 DAG, /*BitwiseOnly*/ false))
17195 return Shift;
17196
17197 // Try to use VALIGN.
17198 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
17199 Zeroable, Subtarget, DAG))
17200 return Rotate;
17201
17202 // Try to use PALIGNR.
17203 if (Subtarget.hasBWI())
17204 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
17205 Subtarget, DAG))
17206 return Rotate;
17207
17208 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, V1, V2, Mask, DAG))
17209 return Unpck;
17210
17211 // If we have AVX512F support, we can use VEXPAND.
17212 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
17213 Subtarget, DAG))
17214 return V;
17215
17216 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
17217 Zeroable, Subtarget, DAG))
17218 return Blend;
17219
17220 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
17221}
17222
17223/// Handle lowering of 16-lane 32-bit integer shuffles.
17224static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17225 const APInt &Zeroable, SDValue V1, SDValue V2,
17226 const X86Subtarget &Subtarget,
17227 SelectionDAG &DAG) {
17228 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17229 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
17230 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
17231
17232 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
17233
17234 // Whenever we can lower this as a zext, that instruction is strictly faster
17235 // than any alternative. It also allows us to fold memory operands into the
17236 // shuffle in many cases.
17237 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17238 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
17239 return ZExt;
17240
17241 // Try to use shift instructions if fast.
17242 if (Subtarget.preferLowerShuffleAsShift()) {
17243 if (SDValue Shift =
17244 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17245 Subtarget, DAG, /*BitwiseOnly*/ true))
17246 return Shift;
17247 if (NumV2Elements == 0)
17248 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
17249 Subtarget, DAG))
17250 return Rotate;
17251 }
17252
17253 // If the shuffle mask is repeated in each 128-bit lane we can use more
17254 // efficient instructions that mirror the shuffles across the four 128-bit
17255 // lanes.
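  // Illustrative note (not from the original source): a v16i32 mask that
  // repeats <3,2,1,0> in each 128-bit lane lowers to a single PSHUFD with
  // immediate 0x1B when V2 is undef.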
17256 SmallVector<int, 4> RepeatedMask;
17257 bool Is128BitLaneRepeatedShuffle =
17258 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
17259 if (Is128BitLaneRepeatedShuffle) {
17260 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
17261 if (V2.isUndef())
17262 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
17263 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
17264
17265 // Use dedicated unpack instructions for masks that match their pattern.
17266 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, V1, V2, Mask, DAG))
17267 return V;
17268 }
17269
17270 // Try to use shift instructions.
17271 if (SDValue Shift =
17272 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
17273 Subtarget, DAG, /*BitwiseOnly*/ false))
17274 return Shift;
17275
17276 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
17277 if (SDValue Rotate =
17278 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
17279 return Rotate;
17280
17281 // Try to use VALIGN.
17282 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
17283 Zeroable, Subtarget, DAG))
17284 return Rotate;
17285
17286 // Try to use byte rotation instructions.
17287 if (Subtarget.hasBWI())
17288 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
17289 Subtarget, DAG))
17290 return Rotate;
17291
17292 // Assume that a single SHUFPS is faster than using a permv shuffle.
17293 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
17294 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
17295 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
17296 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
17297 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
17298 CastV1, CastV2, DAG);
17299 return DAG.getBitcast(MVT::v16i32, ShufPS);
17300 }
17301
17302 // Try to create an in-lane repeating shuffle mask and then shuffle the
17303  // results into the target lanes.
17304  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17305 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
17306 return V;
17307
17308 // If we have AVX512F support, we can use VEXPAND.
17309 if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
17310 Zeroable, Subtarget, DAG))
17311 return V;
17312
17313 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
17314 Zeroable, Subtarget, DAG))
17315 return Blend;
17316
17317 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
17318}
17319
17320/// Handle lowering of 32-lane 16-bit integer shuffles.
17321static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17322 const APInt &Zeroable, SDValue V1, SDValue V2,
17323 const X86Subtarget &Subtarget,
17324 SelectionDAG &DAG) {
17325 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17326 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
17327 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
17328 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
17329
17330 // Whenever we can lower this as a zext, that instruction is strictly faster
17331 // than any alternative. It also allows us to fold memory operands into the
17332  // shuffle in many cases.
17333  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17334 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
17335 return ZExt;
17336
17337 // Use dedicated unpack instructions for masks that match their pattern.
17338 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, V1, V2, Mask, DAG))
17339 return V;
17340
17341 // Use dedicated pack instructions for masks that match their pattern.
17342 if (SDValue V =
17343 lowerShuffleWithPACK(DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17344 return V;
17345
17346 // Try to use shift instructions.
17347 if (SDValue Shift =
17348 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
17349 Subtarget, DAG, /*BitwiseOnly*/ false))
17350 return Shift;
17351
17352 // Try to use byte rotation instructions.
17353 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
17354 Subtarget, DAG))
17355 return Rotate;
17356
17357 if (V2.isUndef()) {
17358 // Try to use bit rotation instructions.
17359 if (SDValue Rotate =
17360 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
17361 return Rotate;
17362
17363 SmallVector<int, 8> RepeatedMask;
17364 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
17365 // As this is a single-input shuffle, the repeated mask should be
17366 // a strictly valid v8i16 mask that we can pass through to the v8i16
17367 // lowering to handle even the v32 case.
17368 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
17369 RepeatedMask, Subtarget, DAG);
17370 }
17371 }
17372
17373 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
17374 Zeroable, Subtarget, DAG))
17375 return Blend;
17376
17377 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
17378 Zeroable, Subtarget, DAG))
17379 return PSHUFB;
17380
17381 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17382 // shuffle.
17383  if (!V2.isUndef())
17384    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17385 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
17386 return Result;
17387
17388 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
17389}
17390
17391/// Handle lowering of 64-lane 8-bit integer shuffles.
17392static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
17393 const APInt &Zeroable, SDValue V1, SDValue V2,
17394 const X86Subtarget &Subtarget,
17395 SelectionDAG &DAG) {
17396 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17397 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
17398 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
17399 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
17400
17401 // Whenever we can lower this as a zext, that instruction is strictly faster
17402 // than any alternative. It also allows us to fold memory operands into the
17403  // shuffle in many cases.
17404  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
17405 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
17406 return ZExt;
17407
17408 // Use dedicated unpack instructions for masks that match their pattern.
17409 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, V1, V2, Mask, DAG))
17410 return V;
17411
17412 // Use dedicated pack instructions for masks that match their pattern.
17413 if (SDValue V =
17414 lowerShuffleWithPACK(DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17415 return V;
17416
17417 // Try to use shift instructions.
17418 if (SDValue Shift =
17419 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
17420 DAG, /*BitwiseOnly*/ false))
17421 return Shift;
17422
17423 // Try to use byte rotation instructions.
17424 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
17425 Subtarget, DAG))
17426 return Rotate;
17427
17428 // Try to use bit rotation instructions.
17429 if (V2.isUndef())
17430 if (SDValue Rotate =
17431 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
17432 return Rotate;
17433
17434 // Lower as AND if possible.
17435 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
17436 Zeroable, Subtarget, DAG))
17437 return Masked;
17438
17439 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
17440 Zeroable, Subtarget, DAG))
17441 return PSHUFB;
17442
17443 // Try to create an in-lane repeating shuffle mask and then shuffle the
17444  // results into the target lanes.
17445  if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
17446 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17447 return V;
17448
17449  if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
17450 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
17451 return Result;
17452
17453 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
17454 Zeroable, Subtarget, DAG))
17455 return Blend;
17456
17457 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
17458 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
17459 // PALIGNR will be cheaper than the second PSHUFB+OR.
17460 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
17461 Mask, Subtarget, DAG))
17462 return V;
17463
17464 // If we can't directly blend but can use PSHUFB, that will be better as it
17465 // can both shuffle and set up the inefficient blend.
17466 bool V1InUse, V2InUse;
17467 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
17468 DAG, V1InUse, V2InUse);
17469 }
17470
17471 // Try to simplify this by merging 128-bit lanes to enable a lane-based
17472 // shuffle.
17473  if (!V2.isUndef())
17474    if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
17475 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
17476 return Result;
17477
17478 // VBMI can use VPERMV/VPERMV3 byte shuffles.
17479 if (Subtarget.hasVBMI())
17480 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
17481
17482 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17483}
17484
17485/// High-level routine to lower various 512-bit x86 vector shuffles.
17486///
17487/// This routine either breaks down the specific type of a 512-bit x86 vector
17488/// shuffle or splits it into two 256-bit shuffles and fuses the results back
17489/// together based on the available instructions.
17490static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17491 MVT VT, SDValue V1, SDValue V2,
17492 const APInt &Zeroable,
17493 const X86Subtarget &Subtarget,
17494 SelectionDAG &DAG) {
17495 assert(Subtarget.hasAVX512() &&
17496 "Cannot lower 512-bit vectors w/ basic ISA!");
17497
17498 // If we have a single input to the zero element, insert that into V1 if we
17499 // can do so cheaply.
17500 int NumElts = Mask.size();
17501 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17502
17503  if (NumV2Elements == 1 && Mask[0] >= NumElts)
17504    if (SDValue Insertion = lowerShuffleAsElementInsertion(
17505 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
17506 return Insertion;
17507
17508 // Handle special cases where the lower or upper half is UNDEF.
17509 if (SDValue V =
17510 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
17511 return V;
17512
17513 // Check for being able to broadcast a single element.
17514 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
17515 Subtarget, DAG))
17516 return Broadcast;
17517
17518 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
17519 // Try using bit ops for masking and blending before falling back to
17520 // splitting.
17521 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
17522 Subtarget, DAG))
17523 return V;
17524 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
17525 return V;
17526
17527 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
17528 }
17529
17530 if (VT == MVT::v32f16 || VT == MVT::v32bf16) {
17531 if (!Subtarget.hasBWI())
17532 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17533 /*SimpleOnly*/ false);
17534
17535 V1 = DAG.getBitcast(MVT::v32i16, V1);
17536 V2 = DAG.getBitcast(MVT::v32i16, V2);
17537 return DAG.getBitcast(VT,
17538 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
17539 }
17540
17541 // Dispatch to each element type for lowering. If we don't have support for
17542 // specific element type shuffles at 512 bits, immediately split them and
17543 // lower them. Each lowering routine of a given type is allowed to assume that
17544 // the requisite ISA extensions for that element type are available.
17545 switch (VT.SimpleTy) {
17546 case MVT::v8f64:
17547 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17548 case MVT::v16f32:
17549 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17550 case MVT::v8i64:
17551 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17552 case MVT::v16i32:
17553 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17554 case MVT::v32i16:
17555 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17556 case MVT::v64i8:
17557 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
17558
17559 default:
17560 llvm_unreachable("Not a valid 512-bit x86 vector type!");
17561 }
17562}
17563
17564static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
17565 MVT VT, SDValue V1, SDValue V2,
17566 const X86Subtarget &Subtarget,
17567 SelectionDAG &DAG) {
17568 // Shuffle should be unary.
17569 if (!V2.isUndef())
17570 return SDValue();
17571
17572 int ShiftAmt = -1;
17573 int NumElts = Mask.size();
17574 for (int i = 0; i != NumElts; ++i) {
17575 int M = Mask[i];
17576 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
17577 "Unexpected mask index.");
17578 if (M < 0)
17579 continue;
17580
17581 // The first non-undef element determines our shift amount.
17582 if (ShiftAmt < 0) {
17583 ShiftAmt = M - i;
17584 // Need to be shifting right.
17585 if (ShiftAmt <= 0)
17586 return SDValue();
17587 }
17588 // All non-undef elements must shift by the same amount.
17589 if (ShiftAmt != M - i)
17590 return SDValue();
17591 }
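  // Illustrative note (not from the original source): for a v8i1 mask
  // <2,3,4,5,6,7,u,u> every defined element satisfies M == i + 2, so
  // ShiftAmt == 2 and the whole shuffle becomes a KSHIFTR by 2 below.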
17592 assert(ShiftAmt >= 0 && "All undef?");
17593
17594  // Great, we found a shift right.
17595 SDValue Res = widenMaskVector(V1, false, Subtarget, DAG, DL);
17596 Res = DAG.getNode(X86ISD::KSHIFTR, DL, Res.getValueType(), Res,
17597 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17598 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17599 DAG.getIntPtrConstant(0, DL));
17600}
17601
17602// Determine if this shuffle can be implemented with a KSHIFT instruction.
17603// Returns the shift amount if possible or -1 if not. This is a simplified
17604// version of matchShuffleAsShift.
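// Illustrative note (not from the original source): with MaskOffset == 0, a
// v8i1 mask <2,3,4,5,6,7,z,z> (trailing elements zeroable) matches a right
// shift by 2 (KSHIFTR), while <z,z,0,1,2,3,4,5> matches a left shift by 2
// (KSHIFTL).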
17605static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
17606 int MaskOffset, const APInt &Zeroable) {
17607 int Size = Mask.size();
17608
17609 auto CheckZeros = [&](int Shift, bool Left) {
17610 for (int j = 0; j < Shift; ++j)
17611 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
17612 return false;
17613
17614 return true;
17615 };
17616
17617 auto MatchShift = [&](int Shift, bool Left) {
17618 unsigned Pos = Left ? Shift : 0;
17619 unsigned Low = Left ? 0 : Shift;
17620 unsigned Len = Size - Shift;
17621 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
17622 };
17623
17624 for (int Shift = 1; Shift != Size; ++Shift)
17625 for (bool Left : {true, false})
17626      if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
17627        Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
17628 return Shift;
17629 }
17630
17631 return -1;
17632}
17633
17634
17635// Lower vXi1 vector shuffles.
17636// There is no dedicated instruction on AVX-512 that shuffles the masks.
17637// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
17638// vector, shuffle it, and then truncate it back.
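// Illustrative note (not from the original source): a v16i1 shuffle, for
// example, is sign-extended to v16i32 (or v16i16 when 512-bit ops are being
// avoided), shuffled there, and then converted back to v16i1 by a compare
// against zero or a truncate.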
17639static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
17640                                MVT VT, SDValue V1, SDValue V2,
17641 const APInt &Zeroable,
17642 const X86Subtarget &Subtarget,
17643 SelectionDAG &DAG) {
17644 assert(Subtarget.hasAVX512() &&
17645 "Cannot lower 512-bit vectors w/o basic ISA!");
17646
17647 int NumElts = Mask.size();
17648 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
17649
17650 // Try to recognize shuffles that are just padding a subvector with zeros.
17651 int SubvecElts = 0;
17652 int Src = -1;
17653 for (int i = 0; i != NumElts; ++i) {
17654 if (Mask[i] >= 0) {
17655 // Grab the source from the first valid mask. All subsequent elements need
17656 // to use this same source.
17657 if (Src < 0)
17658 Src = Mask[i] / NumElts;
17659 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
17660 break;
17661 }
17662
17663 ++SubvecElts;
17664 }
17665 assert(SubvecElts != NumElts && "Identity shuffle?");
17666
17667  // Clip to a power of 2.
17668 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
17669
17670 // Make sure the number of zeroable bits in the top at least covers the bits
17671 // not covered by the subvector.
17672 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
17673 assert(Src >= 0 && "Expected a source!");
17674 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
17675 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
17676 Src == 0 ? V1 : V2,
17677 DAG.getIntPtrConstant(0, DL));
17678 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17679 DAG.getConstant(0, DL, VT),
17680 Extract, DAG.getIntPtrConstant(0, DL));
17681 }
17682
17683 // Try a simple shift right with undef elements. Later we'll try with zeros.
17684 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
17685 DAG))
17686 return Shift;
17687
17688 // Try to match KSHIFTs.
17689 unsigned Offset = 0;
17690 for (SDValue V : { V1, V2 }) {
17691 unsigned Opcode;
17692 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
17693 if (ShiftAmt >= 0) {
17694 SDValue Res = widenMaskVector(V, false, Subtarget, DAG, DL);
17695 MVT WideVT = Res.getSimpleValueType();
17696 // Widened right shifts need two shifts to ensure we shift in zeroes.
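      // Illustrative note (not from the original source): shifting a v8i1
      // right by 3 after widening to v16i1 first does KSHIFTL by 8 to move
      // the vector into the MSBs, then KSHIFTR by 8 + 3 so zeroes shift in.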
17697 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
17698 int WideElts = WideVT.getVectorNumElements();
17699 // Shift left to put the original vector in the MSBs of the new size.
17700 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
17701 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
17702 // Increase the shift amount to account for the left shift.
17703 ShiftAmt += WideElts - NumElts;
17704 }
17705
17706 Res = DAG.getNode(Opcode, DL, WideVT, Res,
17707 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
17708 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
17709 DAG.getIntPtrConstant(0, DL));
17710 }
17711 Offset += NumElts; // Increment for next iteration.
17712 }
17713
17714  // If we're performing a unary shuffle on a SETCC result, try to shuffle the
17715 // ops instead.
17716 // TODO: What other unary shuffles would benefit from this?
17717 if (NumV2Elements == 0 && V1.getOpcode() == ISD::SETCC && V1->hasOneUse()) {
17718 SDValue Op0 = V1.getOperand(0);
17719 SDValue Op1 = V1.getOperand(1);
17720 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
17721 EVT OpVT = Op0.getValueType();
17722 if (OpVT.getScalarSizeInBits() >= 32 || isBroadcastShuffleMask(Mask))
17723 return DAG.getSetCC(
17724 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
17725 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
17726 }
17727
17728 MVT ExtVT;
17729 switch (VT.SimpleTy) {
17730 default:
17731 llvm_unreachable("Expected a vector of i1 elements");
17732 case MVT::v2i1:
17733 ExtVT = MVT::v2i64;
17734 break;
17735 case MVT::v4i1:
17736 ExtVT = MVT::v4i32;
17737 break;
17738 case MVT::v8i1:
17739 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
17740 // shuffle.
17741 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
17742 break;
17743 case MVT::v16i1:
17744 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17745 // 256-bit operation available.
17746 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
17747 break;
17748 case MVT::v32i1:
17749 // Take 512-bit type, unless we are avoiding 512-bit types and have the
17750 // 256-bit operation available.
17751 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
17752 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
17753 break;
17754 case MVT::v64i1:
17755 // Fall back to scalarization. FIXME: We can do better if the shuffle
17756 // can be partitioned cleanly.
17757 if (!Subtarget.useBWIRegs())
17758 return SDValue();
17759 ExtVT = MVT::v64i8;
17760 break;
17761 }
17762
17763 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
17764 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
17765
17766 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
17767  // i1 was sign extended, so we can use X86ISD::CVT2MASK.
17768 int NumElems = VT.getVectorNumElements();
17769 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
17770 (Subtarget.hasDQI() && (NumElems < 32)))
17771 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
17772 Shuffle, ISD::SETGT);
17773
17774 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
17775}
17776
17777/// Helper function that returns true if the shuffle mask should be
17778/// commuted to improve canonicalization.
17779static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
17780 int NumElements = Mask.size();
17781
17782 int NumV1Elements = 0, NumV2Elements = 0;
17783 for (int M : Mask)
17784 if (M < 0)
17785 continue;
17786 else if (M < NumElements)
17787 ++NumV1Elements;
17788 else
17789 ++NumV2Elements;
17790
17791 // Commute the shuffle as needed such that more elements come from V1 than
17792 // V2. This allows us to match the shuffle pattern strictly on how many
17793 // elements come from V1 without handling the symmetric cases.
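  // Illustrative note (not from the original source): a 4-element mask
  // <0,5,6,7> takes one element from V1 and three from V2, so it is commuted
  // to <4,1,2,3> with the operands swapped.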
17794 if (NumV2Elements > NumV1Elements)
17795 return true;
17796
17797 assert(NumV1Elements > 0 && "No V1 indices");
17798
17799 if (NumV2Elements == 0)
17800 return false;
17801
17802  // When the numbers of V1 and V2 elements are the same, try to minimize the
17803  // number of uses of V2 in the low half of the vector. When that is tied,
17804  // ensure that the sum of indices for V1 is equal to or lower than the sum
17805  // of indices for V2. When those are equal, try to ensure that the number of
17806  // odd indices for V1 is lower than the number of odd indices for V2.
17807 if (NumV1Elements == NumV2Elements) {
17808 int LowV1Elements = 0, LowV2Elements = 0;
17809 for (int M : Mask.slice(0, NumElements / 2))
17810 if (M >= NumElements)
17811 ++LowV2Elements;
17812 else if (M >= 0)
17813 ++LowV1Elements;
17814 if (LowV2Elements > LowV1Elements)
17815 return true;
17816 if (LowV2Elements == LowV1Elements) {
17817 int SumV1Indices = 0, SumV2Indices = 0;
17818 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17819 if (Mask[i] >= NumElements)
17820 SumV2Indices += i;
17821 else if (Mask[i] >= 0)
17822 SumV1Indices += i;
17823 if (SumV2Indices < SumV1Indices)
17824 return true;
17825 if (SumV2Indices == SumV1Indices) {
17826 int NumV1OddIndices = 0, NumV2OddIndices = 0;
17827 for (int i = 0, Size = Mask.size(); i < Size; ++i)
17828 if (Mask[i] >= NumElements)
17829 NumV2OddIndices += i % 2;
17830 else if (Mask[i] >= 0)
17831 NumV1OddIndices += i % 2;
17832 if (NumV2OddIndices < NumV1OddIndices)
17833 return true;
17834 }
17835 }
17836 }
17837
17838 return false;
17839}
17840
17841static bool canCombineAsMaskOperation(SDValue V,
17842 const X86Subtarget &Subtarget) {
17843 if (!Subtarget.hasAVX512())
17844 return false;
17845
17846 if (!V.getValueType().isSimple())
17847 return false;
17848
17849 MVT VT = V.getSimpleValueType().getScalarType();
17850 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
17851 return false;
17852
17853 // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd
17854 // are preferable to blendw/blendvb/masked-mov.
17855 if ((VT == MVT::i16 || VT == MVT::i8) &&
17856 V.getSimpleValueType().getSizeInBits() < 512)
17857 return false;
17858
17859 auto HasMaskOperation = [&](SDValue V) {
17860    // TODO: Currently we only check a limited set of opcodes. We could
17861    // probably extend this to all binary operations by checking TLI.isBinOp().
17862 switch (V->getOpcode()) {
17863 default:
17864 return false;
17865 case ISD::ADD:
17866 case ISD::SUB:
17867 case ISD::AND:
17868 case ISD::XOR:
17869 case ISD::OR:
17870 case ISD::SMAX:
17871 case ISD::SMIN:
17872 case ISD::UMAX:
17873 case ISD::UMIN:
17874 case ISD::ABS:
17875 case ISD::SHL:
17876 case ISD::SRL:
17877 case ISD::SRA:
17878 case ISD::MUL:
17879 break;
17880 }
17881 if (!V->hasOneUse())
17882 return false;
17883
17884 return true;
17885 };
17886
17887 if (HasMaskOperation(V))
17888 return true;
17889
17890 return false;
17891}
17892
17893// Forward declaration.
17894static SDValue canonicalizeShuffleMaskWithHorizOp(
17895    MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
17896 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
17897 const X86Subtarget &Subtarget);
17898
17899/// Top-level lowering for x86 vector shuffles.
17900///
17901/// This handles decomposition, canonicalization, and lowering of all x86
17902/// vector shuffles. Most of the specific lowering strategies are encapsulated
17903/// above in helper routines. The canonicalization attempts to widen shuffles
17904/// to involve fewer lanes of wider elements, consolidate symmetric patterns
17905/// s.t. only one of the two inputs needs to be tested, etc.
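/// Illustrative note (not from the original source): a v4i32 mask <0,1,4,5>
/// pairs consecutive elements, for instance, so it can be widened to the
/// v2i64 mask <0,2> before dispatching to the type-specific lowering.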
17906static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
17907                                   SelectionDAG &DAG) {
17908 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
17909 ArrayRef<int> OrigMask = SVOp->getMask();
17910 SDValue V1 = Op.getOperand(0);
17911 SDValue V2 = Op.getOperand(1);
17912 MVT VT = Op.getSimpleValueType();
17913 int NumElements = VT.getVectorNumElements();
17914 SDLoc DL(Op);
17915 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
17916
17917 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
17918 "Can't lower MMX shuffles");
17919
17920 bool V1IsUndef = V1.isUndef();
17921 bool V2IsUndef = V2.isUndef();
17922 if (V1IsUndef && V2IsUndef)
17923 return DAG.getUNDEF(VT);
17924
17925  // When we create a shuffle node we put the UNDEF node as the second operand,
17926 // but in some cases the first operand may be transformed to UNDEF.
17927 // In this case we should just commute the node.
17928 if (V1IsUndef)
17929 return DAG.getCommutedVectorShuffle(*SVOp);
17930
17931 // Check for non-undef masks pointing at an undef vector and make the masks
17932 // undef as well. This makes it easier to match the shuffle based solely on
17933 // the mask.
17934 if (V2IsUndef &&
17935 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
17936 SmallVector<int, 8> NewMask(OrigMask);
17937 for (int &M : NewMask)
17938 if (M >= NumElements)
17939 M = -1;
17940 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17941 }
17942
17943 // Check for illegal shuffle mask element index values.
17944 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
17945 (void)MaskUpperLimit;
17946 assert(llvm::all_of(OrigMask,
17947 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
17948 "Out of bounds shuffle index");
17949
17950 // We actually see shuffles that are entirely re-arrangements of a set of
17951 // zero inputs. This mostly happens while decomposing complex shuffles into
17952 // simple ones. Directly lower these as a buildvector of zeros.
17953 APInt KnownUndef, KnownZero;
17954 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
17955
17956 APInt Zeroable = KnownUndef | KnownZero;
17957 if (Zeroable.isAllOnes())
17958 return getZeroVector(VT, Subtarget, DAG, DL);
17959
17960 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
17961
17962 // Try to collapse shuffles into using a vector type with fewer elements but
17963 // wider element types. We cap this to not form integers or floating point
17964 // elements wider than 64 bits. It does not seem beneficial to form i128
17965 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
17966 SmallVector<int, 16> WidenedMask;
17967 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
17968 !canCombineAsMaskOperation(V1, Subtarget) &&
17969 !canCombineAsMaskOperation(V2, Subtarget) &&
17970 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
17971 // Shuffle mask widening should not interfere with a broadcast opportunity
17972 // by obfuscating the operands with bitcasts.
17973 // TODO: Avoid lowering directly from this top-level function: make this
17974 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
17975 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
17976 Subtarget, DAG))
17977 return Broadcast;
17978
17979    MVT NewEltVT = VT.isFloatingPoint()
17980                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
17981                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
17982 int NewNumElts = NumElements / 2;
17983 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
17984 // Make sure that the new vector type is legal. For example, v2f64 isn't
17985 // legal on SSE1.
17986 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
17987 if (V2IsZero) {
17988 // Modify the new Mask to take all zeros from the all-zero vector.
17989 // Choose indices that are blend-friendly.
17990 bool UsedZeroVector = false;
17991 assert(is_contained(WidenedMask, SM_SentinelZero) &&
17992 "V2's non-undef elements are used?!");
17993 for (int i = 0; i != NewNumElts; ++i)
17994 if (WidenedMask[i] == SM_SentinelZero) {
17995 WidenedMask[i] = i + NewNumElts;
17996 UsedZeroVector = true;
17997 }
17998 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
17999 // some elements to be undef.
18000 if (UsedZeroVector)
18001 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
18002 }
18003 V1 = DAG.getBitcast(NewVT, V1);
18004 V2 = DAG.getBitcast(NewVT, V2);
18005 return DAG.getBitcast(
18006 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
18007 }
18008 }
18009
18010 SmallVector<SDValue> Ops = {V1, V2};
18011 SmallVector<int> Mask(OrigMask);
18012
18013 // Canonicalize the shuffle with any horizontal ops inputs.
18014  // NOTE: This may update Ops and Mask.
18015  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
18016 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
18017 return DAG.getBitcast(VT, HOp);
18018
18019 V1 = DAG.getBitcast(VT, Ops[0]);
18020 V2 = DAG.getBitcast(VT, Ops[1]);
18021 assert(NumElements == (int)Mask.size() &&
18022 "canonicalizeShuffleMaskWithHorizOp "
18023 "shouldn't alter the shuffle mask size");
18024
18025  // Commute the shuffle if it will improve canonicalization.
18026  if (canonicalizeShuffleMaskWithCommute(Mask)) {
18027    ShuffleVectorSDNode::commuteMask(Mask);
18028 std::swap(V1, V2);
18029 }
18030
18031 // For each vector width, delegate to a specialized lowering routine.
18032 if (VT.is128BitVector())
18033 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18034
18035 if (VT.is256BitVector())
18036 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18037
18038 if (VT.is512BitVector())
18039 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18040
18041 if (Is1BitVector)
18042 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
18043
18044 llvm_unreachable("Unimplemented!");
18045}
18046
18047// As legal vpcompress instructions depend on various AVX512 extensions, try to
18048// convert illegal vector sizes to legal ones to avoid expansion.
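// Illustrative note (not from the original source): a v4f32 VECTOR_COMPRESS,
// for example, is widened to v16f32 with a zero-extended v16i1 mask,
// compressed with the legal 512-bit operation, and the low 128 bits are then
// extracted again.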
18049static SDValue LowerVECTOR_COMPRESS(SDValue Op, const X86Subtarget &Subtarget,
18050                                    SelectionDAG &DAG) {
18051 assert(Subtarget.hasAVX512() &&
18052 "Need AVX512 for custom VECTOR_COMPRESS lowering.");
18053
18054 SDLoc DL(Op);
18055 SDValue Vec = Op.getOperand(0);
18056 SDValue Mask = Op.getOperand(1);
18057 SDValue Passthru = Op.getOperand(2);
18058
18059 EVT VecVT = Vec.getValueType();
18060 EVT ElementVT = VecVT.getVectorElementType();
18061 unsigned NumElements = VecVT.getVectorNumElements();
18062 unsigned NumVecBits = VecVT.getFixedSizeInBits();
18063 unsigned NumElementBits = ElementVT.getFixedSizeInBits();
18064
18065 // 128- and 256-bit vectors with <= 16 elements can be converted to and
18066 // compressed as 512-bit vectors in AVX512F.
18067 if (NumVecBits != 128 && NumVecBits != 256)
18068 return SDValue();
18069
18070 if (NumElementBits == 32 || NumElementBits == 64) {
18071 unsigned NumLargeElements = 512 / NumElementBits;
18072 MVT LargeVecVT =
18073 MVT::getVectorVT(ElementVT.getSimpleVT(), NumLargeElements);
18074 MVT LargeMaskVT = MVT::getVectorVT(MVT::i1, NumLargeElements);
18075
18076 Vec = widenSubVector(LargeVecVT, Vec, /*ZeroNewElements=*/false, Subtarget,
18077 DAG, DL);
18078 Mask = widenSubVector(LargeMaskVT, Mask, /*ZeroNewElements=*/true,
18079 Subtarget, DAG, DL);
18080 Passthru = Passthru.isUndef() ? DAG.getUNDEF(LargeVecVT)
18081 : widenSubVector(LargeVecVT, Passthru,
18082 /*ZeroNewElements=*/false,
18083 Subtarget, DAG, DL);
18084
18085 SDValue Compressed =
18086 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18087 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, Compressed,
18088 DAG.getConstant(0, DL, MVT::i64));
18089 }
18090
18091 if (VecVT == MVT::v8i16 || VecVT == MVT::v8i8 || VecVT == MVT::v16i8 ||
18092 VecVT == MVT::v16i16) {
18093 MVT LageElementVT = MVT::getIntegerVT(512 / NumElements);
18094 EVT LargeVecVT = MVT::getVectorVT(LageElementVT, NumElements);
18095
18096 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Vec);
18097 Passthru = Passthru.isUndef()
18098 ? DAG.getUNDEF(LargeVecVT)
18099 : DAG.getNode(ISD::ANY_EXTEND, DL, LargeVecVT, Passthru);
18100
18101 SDValue Compressed =
18102 DAG.getNode(ISD::VECTOR_COMPRESS, DL, LargeVecVT, Vec, Mask, Passthru);
18103 return DAG.getNode(ISD::TRUNCATE, DL, VecVT, Compressed);
18104 }
18105
18106 return SDValue();
18107}
18108
18109/// Try to lower a VSELECT instruction to a vector shuffle.
18110static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
18111 const X86Subtarget &Subtarget,
18112 SelectionDAG &DAG) {
18113 SDValue Cond = Op.getOperand(0);
18114 SDValue LHS = Op.getOperand(1);
18115 SDValue RHS = Op.getOperand(2);
18116 MVT VT = Op.getSimpleValueType();
18117
18118  // Only non-legal VSELECTs reach this lowering; convert those into generic
18119 // shuffles and re-use the shuffle lowering path for blends.
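  // Illustrative note (not from the original source): a v4i32 vselect with
  // the constant condition <-1,0,0,-1> becomes the shuffle mask <0,5,6,3>,
  // taking elements 0 and 3 from LHS and elements 1 and 2 from RHS.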
18120  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
18121    SmallVector<int, 32> Mask;
18122    if (createShuffleMaskFromVSELECT(Mask, Cond))
18123      return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
18124 }
18125
18126 return SDValue();
18127}
18128
18129SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
18130 SDValue Cond = Op.getOperand(0);
18131 SDValue LHS = Op.getOperand(1);
18132 SDValue RHS = Op.getOperand(2);
18133
18134 SDLoc dl(Op);
18135 MVT VT = Op.getSimpleValueType();
18136  if (isSoftF16(VT, Subtarget)) {
18137    MVT NVT = VT.changeVectorElementTypeToInteger();
18138 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
18139 DAG.getBitcast(NVT, LHS),
18140 DAG.getBitcast(NVT, RHS)));
18141 }
18142
18143 // A vselect where all conditions and data are constants can be optimized into
18144 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
18145  if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
18146      ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
18147      ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
18148    return SDValue();
18149
18150 // Try to lower this to a blend-style vector shuffle. This can handle all
18151 // constant condition cases.
18152 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
18153 return BlendOp;
18154
18155  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
18156 // with patterns on the mask registers on AVX-512.
18157 MVT CondVT = Cond.getSimpleValueType();
18158 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
18159 if (CondEltSize == 1)
18160 return Op;
18161
18162 // Variable blends are only legal from SSE4.1 onward.
18163 if (!Subtarget.hasSSE41())
18164 return SDValue();
18165
18166 unsigned EltSize = VT.getScalarSizeInBits();
18167 unsigned NumElts = VT.getVectorNumElements();
18168
18169 // Expand v32i16/v64i8 without BWI.
18170 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
18171 return SDValue();
18172
18173 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
18174 // into an i1 condition so that we can use the mask-based 512-bit blend
18175 // instructions.
18176 if (VT.getSizeInBits() == 512) {
18177 // Build a mask by testing the condition against zero.
18178 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
18179 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
18180 DAG.getConstant(0, dl, CondVT),
18181 ISD::SETNE);
18182 // Now return a new VSELECT using the mask.
18183 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
18184 }
18185
18186 // SEXT/TRUNC cases where the mask doesn't match the destination size.
18187 if (CondEltSize != EltSize) {
18188 // If we don't have a sign splat, rely on the expansion.
18189 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
18190 return SDValue();
18191
18192 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
18193 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
18194 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
18195 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
18196 }
18197
18198  // For v16i16/v32i8 selects without AVX2, if the condition and another
18199  // operand are free to split, then it is better to split before expanding the
18200 // select. Don't bother with XOP as it has the fast VPCMOV instruction.
18201 // TODO: This is very similar to narrowVectorSelect.
18202 // TODO: Add Load splitting to isFreeToSplitVector ?
18203 if (EltSize < 32 && VT.is256BitVector() && !Subtarget.hasAVX2() &&
18204 !Subtarget.hasXOP()) {
18205 bool FreeCond = isFreeToSplitVector(Cond.getNode(), DAG);
18206 bool FreeLHS = isFreeToSplitVector(LHS.getNode(), DAG) ||
18207 (ISD::isNormalLoad(LHS.getNode()) && LHS.hasOneUse());
18208 bool FreeRHS = isFreeToSplitVector(RHS.getNode(), DAG) ||
18209 (ISD::isNormalLoad(RHS.getNode()) && RHS.hasOneUse());
18210 if (FreeCond && (FreeLHS || FreeRHS))
18211 return splitVectorOp(Op, DAG, dl);
18212 }
18213
18214 // Only some types will be legal on some subtargets. If we can emit a legal
18215  // VSELECT-matching blend, return Op, but if we need to expand, return
18216 // a null value.
18217 switch (VT.SimpleTy) {
18218 default:
18219 // Most of the vector types have blends past SSE4.1.
18220 return Op;
18221
18222 case MVT::v32i8:
18223 // The byte blends for AVX vectors were introduced only in AVX2.
18224 if (Subtarget.hasAVX2())
18225 return Op;
18226
18227 return SDValue();
18228
18229 case MVT::v8i16:
18230 case MVT::v16i16: {
18231 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
18232 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
18233 Cond = DAG.getBitcast(CastVT, Cond);
18234 LHS = DAG.getBitcast(CastVT, LHS);
18235 RHS = DAG.getBitcast(CastVT, RHS);
18236 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
18237 return DAG.getBitcast(VT, Select);
18238 }
18239 }
18240}
18241
18242static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
18243 MVT VT = Op.getSimpleValueType();
18244 SDValue Vec = Op.getOperand(0);
18245 SDValue Idx = Op.getOperand(1);
18246 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
18247 SDLoc dl(Op);
18248
18249  if (!Vec.getSimpleValueType().is128BitVector())
18250 return SDValue();
18251
18252 if (VT.getSizeInBits() == 8) {
18253 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
18254    // we're going to zero extend the register or fold the store.
18255    if (isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
18256        !X86::mayFoldIntoStore(Op))
18257 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
18258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18259 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18260
18261 unsigned IdxVal = Idx->getAsZExtVal();
18262 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
18263 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18264 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18265 }
18266
18267 if (VT == MVT::f32) {
18268 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
18269 // the result back to FR32 register. It's only worth matching if the
18270 // result has a single use which is a store or a bitcast to i32. And in
18271 // the case of a store, it's not worth it if the index is a constant 0,
18272 // because a MOVSSmr can be used instead, which is smaller and faster.
18273 if (!Op.hasOneUse())
18274 return SDValue();
18275 SDNode *User = *Op.getNode()->user_begin();
18276 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
18277 (User->getOpcode() != ISD::BITCAST ||
18278 User->getValueType(0) != MVT::i32))
18279 return SDValue();
18280 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18281 DAG.getBitcast(MVT::v4i32, Vec), Idx);
18282 return DAG.getBitcast(MVT::f32, Extract);
18283 }
18284
18285 if (VT == MVT::i32 || VT == MVT::i64)
18286 return Op;
18287
18288 return SDValue();
18289}
18290
18291/// Extract one bit from mask vector, like v16i1 or v8i1.
18292/// AVX-512 feature.
18293static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
18294 const X86Subtarget &Subtarget) {
18295 SDValue Vec = Op.getOperand(0);
18296 SDLoc dl(Vec);
18297 MVT VecVT = Vec.getSimpleValueType();
18298 SDValue Idx = Op.getOperand(1);
18299 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18300 MVT EltVT = Op.getSimpleValueType();
18301
18302 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
18303 "Unexpected vector type in ExtractBitFromMaskVector");
18304
18305  // A variable index can't be handled in mask registers, so
18306  // extend the vector to VR512/128.
18307 if (!IdxC) {
18308 unsigned NumElts = VecVT.getVectorNumElements();
18309    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
18310    // than extending to 128/256-bit.
18311 if (NumElts == 1) {
18312      Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18313      MVT IntVT = MVT::getIntegerVT(Vec.getValueType().getVectorNumElements());
18314 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, DAG.getBitcast(IntVT, Vec));
18315 }
18316 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18317 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18318 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
18319 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
18320 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
18321 }
18322
18323 unsigned IdxVal = IdxC->getZExtValue();
18324 if (IdxVal == 0) // the operation is legal
18325 return Op;
18326
18327 // Extend to natively supported kshift.
18328 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18329
18330 // Use kshiftr instruction to move to the lower element.
18331 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18332 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18333
18334 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18335 DAG.getIntPtrConstant(0, dl));
18336}
18337
18338// Helper to find all the extracted elements from a vector.
18339static APInt getExtractedDemandedElts(SDNode *N) {
18340 MVT VT = N->getSimpleValueType(0);
18341 unsigned NumElts = VT.getVectorNumElements();
18342 APInt DemandedElts = APInt::getZero(NumElts);
18343 for (SDNode *User : N->users()) {
18344 switch (User->getOpcode()) {
18345 case X86ISD::PEXTRB:
18346    case X86ISD::PEXTRW:
18347    case ISD::EXTRACT_VECTOR_ELT:
18348 if (!isa<ConstantSDNode>(User->getOperand(1))) {
18349 DemandedElts.setAllBits();
18350 return DemandedElts;
18351 }
18352 DemandedElts.setBit(User->getConstantOperandVal(1));
18353 break;
18354 case ISD::BITCAST: {
18355 if (!User->getValueType(0).isSimple() ||
18356 !User->getValueType(0).isVector()) {
18357 DemandedElts.setAllBits();
18358 return DemandedElts;
18359 }
18360 APInt DemandedSrcElts = getExtractedDemandedElts(User);
18361 DemandedElts |= APIntOps::ScaleBitMask(DemandedSrcElts, NumElts);
18362 break;
18363 }
18364 default:
18365 DemandedElts.setAllBits();
18366 return DemandedElts;
18367 }
18368 }
18369 return DemandedElts;
18370}
18371
18372SDValue
18373X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
18374 SelectionDAG &DAG) const {
18375 SDLoc dl(Op);
18376 SDValue Vec = Op.getOperand(0);
18377 MVT VecVT = Vec.getSimpleValueType();
18378 SDValue Idx = Op.getOperand(1);
18379 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
18380
18381 if (VecVT.getVectorElementType() == MVT::i1)
18382 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
18383
18384 if (!IdxC) {
18385    // It's more profitable to go through memory (1 cycle throughput)
18386 // than using VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput)
18387 // IACA tool was used to get performance estimation
18388 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
18389 //
18390 // example : extractelement <16 x i8> %a, i32 %i
18391 //
18392 // Block Throughput: 3.00 Cycles
18393 // Throughput Bottleneck: Port5
18394 //
18395 // | Num Of | Ports pressure in cycles | |
18396 // | Uops | 0 - DV | 5 | 6 | 7 | |
18397 // ---------------------------------------------
18398 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
18399 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
18400 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
18401 // Total Num Of Uops: 4
18402 //
18403 //
18404 // Block Throughput: 1.00 Cycles
18405 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
18406 //
18407 // | | Ports pressure in cycles | |
18408 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
18409 // ---------------------------------------------------------
18410 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
18411 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
18412 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
18413 // Total Num Of Uops: 4
18414
18415 return SDValue();
18416 }
18417
18418 unsigned IdxVal = IdxC->getZExtValue();
18419
18420 // If this is a 256-bit vector result, first extract the 128-bit vector and
18421 // then extract the element from the 128-bit vector.
18422 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
18423 // Get the 128-bit vector.
18424 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
18425 MVT EltVT = VecVT.getVectorElementType();
18426
18427 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
18428 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
18429
18430 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
18431 // this can be done with a mask.
18432 IdxVal &= ElemsPerChunk - 1;
18433 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
18434 DAG.getIntPtrConstant(IdxVal, dl));
18435 }
18436
18437 assert(VecVT.is128BitVector() && "Unexpected vector length");
18438
18439 MVT VT = Op.getSimpleValueType();
18440
18441 if (VT == MVT::i16) {
18442 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
18443 // we're going to zero extend the register or fold the store (SSE41 only).
18444 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
18445 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
18446 if (Subtarget.hasFP16())
18447 return Op;
18448
18449 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
18450 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18451 DAG.getBitcast(MVT::v4i32, Vec), Idx));
18452 }
18453
18454 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
18455 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18456 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
18457 }
18458
18459 if (Subtarget.hasSSE41())
18460 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
18461 return Res;
18462
18463 // Only extract a single element from a v16i8 source - determine the common
18464 // DWORD/WORD that all extractions share, and extract the sub-byte.
18465 // TODO: Add QWORD MOVQ extraction?
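  // Illustrative note (not from the original source): if byte 5 is the only
  // demanded element of a v16i8, the code below extracts word 2 of the v8i16
  // bitcast, shifts right by 8 and truncates to i8.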
18466 if (VT == MVT::i8) {
18467 APInt DemandedElts = getExtractedDemandedElts(Vec.getNode());
18468 assert(DemandedElts.getBitWidth() == 16 && "Vector width mismatch");
18469
18470 // Extract either the lowest i32 or any i16, and extract the sub-byte.
18471 int DWordIdx = IdxVal / 4;
18472 if (DWordIdx == 0 && DemandedElts == (DemandedElts & 15)) {
18473 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
18474 DAG.getBitcast(MVT::v4i32, Vec),
18475 DAG.getIntPtrConstant(DWordIdx, dl));
18476 int ShiftVal = (IdxVal % 4) * 8;
18477 if (ShiftVal != 0)
18478 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
18479 DAG.getConstant(ShiftVal, dl, MVT::i8));
18480 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18481 }
18482
18483 int WordIdx = IdxVal / 2;
18484 if (DemandedElts == (DemandedElts & (3 << (WordIdx * 2)))) {
18485 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
18486 DAG.getBitcast(MVT::v8i16, Vec),
18487 DAG.getIntPtrConstant(WordIdx, dl));
18488 int ShiftVal = (IdxVal % 2) * 8;
18489 if (ShiftVal != 0)
18490 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
18491 DAG.getConstant(ShiftVal, dl, MVT::i8));
18492 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
18493 }
18494 }
18495
18496 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
18497 if (IdxVal == 0)
18498 return Op;
18499
18500    // Shuffle the element to the lowest element, then movss or movsh.
18501    SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
18502 Mask[0] = static_cast<int>(IdxVal);
18503 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18504 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18505 DAG.getIntPtrConstant(0, dl));
18506 }
18507
18508 if (VT.getSizeInBits() == 64) {
18509 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
18510 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
18511 // to match extract_elt for f64.
18512 if (IdxVal == 0)
18513 return Op;
18514
18515 // UNPCKHPD the element to the lowest double word, then movsd.
18516 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
18517 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
18518 int Mask[2] = { 1, -1 };
18519 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
18520 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
18521 DAG.getIntPtrConstant(0, dl));
18522 }
18523
18524 return SDValue();
18525}
18526
18527/// Insert one bit to mask vector, like v16i1 or v8i1.
18528/// AVX-512 feature.
18529static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
18530 const X86Subtarget &Subtarget) {
18531 SDLoc dl(Op);
18532 SDValue Vec = Op.getOperand(0);
18533 SDValue Elt = Op.getOperand(1);
18534 SDValue Idx = Op.getOperand(2);
18535 MVT VecVT = Vec.getSimpleValueType();
18536
18537 if (!isa<ConstantSDNode>(Idx)) {
18538    // Non-constant index. Extend source and destination,
18539 // insert element and then truncate the result.
18540 unsigned NumElts = VecVT.getVectorNumElements();
18541 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
18542 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
18543 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
18544 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
18545 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
18546 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
18547 }
18548
18549 // Copy into a k-register, extract to v1i1 and insert_subvector.
18550 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
18551 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
18552}
18553
18554SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
18555 SelectionDAG &DAG) const {
18556 MVT VT = Op.getSimpleValueType();
18557 MVT EltVT = VT.getVectorElementType();
18558 unsigned NumElts = VT.getVectorNumElements();
18559 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
18560
18561 if (EltVT == MVT::i1)
18562 return InsertBitToMaskVector(Op, DAG, Subtarget);
18563
18564 SDLoc dl(Op);
18565 SDValue N0 = Op.getOperand(0);
18566 SDValue N1 = Op.getOperand(1);
18567 SDValue N2 = Op.getOperand(2);
18568 auto *N2C = dyn_cast<ConstantSDNode>(N2);
18569
18570  if (EltVT == MVT::bf16) {
18571    MVT IVT = VT.changeVectorElementTypeToInteger();
18572 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVT,
18573 DAG.getBitcast(IVT, N0),
18574 DAG.getBitcast(MVT::i16, N1), N2);
18575 return DAG.getBitcast(VT, Res);
18576 }
18577
18578 if (!N2C) {
18579 // Variable insertion indices, usually we're better off spilling to stack,
18580 // but AVX512 can use a variable compare+select by comparing against all
18581 // possible vector indices, and FP insertion has less gpr->simd traffic.
18582 if (!(Subtarget.hasBWI() ||
18583 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
18584 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
18585 return SDValue();
18586
18587 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
18588 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
18589 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
18590 return SDValue();
18591
18592 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
18593 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
18594 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
18595
18596 SmallVector<SDValue, 16> RawIndices;
18597 for (unsigned I = 0; I != NumElts; ++I)
18598 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
18599 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
18600
18601 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
18602    return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
18603                           ISD::CondCode::SETEQ);
18604 }
18605
18606 if (N2C->getAPIntValue().uge(NumElts))
18607 return SDValue();
18608 uint64_t IdxVal = N2C->getZExtValue();
18609
18610 bool IsZeroElt = X86::isZeroNode(N1);
18611 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
18612
18613 if (IsZeroElt || IsAllOnesElt) {
18614 // Lower insertion of v16i8/v32i8/v64i16 -1 elts as an 'OR' blend.
18615 // We don't deal with i8 0 since it appears to be handled elsewhere.
18616 if (IsAllOnesElt &&
18617 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
18618 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
18619 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
18620 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
18621 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
18622 CstVectorElts[IdxVal] = OnesCst;
18623 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
18624 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
18625 }
18626 // See if we can do this more efficiently with a blend shuffle with a
18627 // rematerializable vector.
18628 if (Subtarget.hasSSE41() &&
18629 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
18630 SmallVector<int, 8> BlendMask;
18631 for (unsigned i = 0; i != NumElts; ++i)
18632 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18633 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
18634 : getOnesVector(VT, DAG, dl);
18635 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
18636 }
18637 }
18638
18639 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
18640 // into that, and then insert the subvector back into the result.
18641 if (VT.is256BitVector() || VT.is512BitVector()) {
18642 // With a 256-bit vector, we can insert into the zero element efficiently
18643 // using a blend if we have AVX or AVX2 and the right data type.
18644 if (VT.is256BitVector() && IdxVal == 0) {
18645 // TODO: It is worthwhile to cast integer to floating point and back
18646 // and incur a domain crossing penalty if that's what we'll end up
18647 // doing anyway after extracting to a 128-bit vector.
18648 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
18649 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
18650 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18651 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
18652 DAG.getTargetConstant(1, dl, MVT::i8));
18653 }
18654 }
18655
18656 unsigned NumEltsIn128 = 128 / EltSizeInBits;
18657 assert(isPowerOf2_32(NumEltsIn128) &&
18658 "Vectors will always have power-of-two number of elements.");
18659
18660 // If we are not inserting into the low 128-bit vector chunk,
18661 // then prefer the broadcast+blend sequence.
18662 // FIXME: relax the profitability check iff all N1 uses are insertions.
18663 if (IdxVal >= NumEltsIn128 &&
18664 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
18665 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
18666 X86::mayFoldLoad(N1, Subtarget)))) {
18667 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
18668 SmallVector<int, 8> BlendMask;
18669 for (unsigned i = 0; i != NumElts; ++i)
18670 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
18671 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
18672 }
18673
18674 // Get the desired 128-bit vector chunk.
18675 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
18676
18677 // Insert the element into the desired chunk.
18678 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
18679 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
18680
18681 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
18682 DAG.getIntPtrConstant(IdxIn128, dl));
18683
18684 // Insert the changed part back into the bigger vector
18685 return insert128BitVector(N0, V, IdxVal, DAG, dl);
18686 }
18687 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
18688
18689 // This will be just movw/movd/movq/movsh/movss/movsd.
18690 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
18691 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
18692 EltVT == MVT::f16 || EltVT == MVT::i64) {
18693 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
18694 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18695 }
18696
18697 // We can't directly insert an i8 or i16 into a vector, so zero extend
18698 // it to i32 first.
18699 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
18700 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
18701 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
18702 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
18703 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
18704 return DAG.getBitcast(VT, N1);
18705 }
18706 }
18707
18708  // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
18709 // argument. SSE41 required for pinsrb.
18710 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
18711 unsigned Opc;
18712 if (VT == MVT::v8i16) {
18713 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
18714 Opc = X86ISD::PINSRW;
18715 } else {
18716 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
18717 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
18718 Opc = X86ISD::PINSRB;
18719 }
18720
18721 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
18722 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
18723 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
18724 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
18725 }
18726
18727 if (Subtarget.hasSSE41()) {
18728 if (EltVT == MVT::f32) {
18729 // Bits [7:6] of the constant are the source select. This will always be
18730 // zero here. The DAG Combiner may combine an extract_elt index into
18731 // these bits. For example (insert (extract, 3), 2) could be matched by
18732 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
18733 // Bits [5:4] of the constant are the destination select. This is the
18734 // value of the incoming immediate.
18735 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
18736 // combine either bitwise AND or insert of float 0.0 to set these bits.
18737
18738 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
18739 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
18740 // If this is an insertion of 32-bits into the low 32-bits of
18741 // a vector, we prefer to generate a blend with immediate rather
18742 // than an insertps. Blends are simpler operations in hardware and so
18743 // will always have equal or better performance than insertps.
18744 // But if optimizing for size and there's a load folding opportunity,
18745 // generate insertps because blendps does not have a 32-bit memory
18746 // operand form.
18747 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18748 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
18749 DAG.getTargetConstant(1, dl, MVT::i8));
18750 }
18751      // Create this as a scalar-to-vector insertion.
18752 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
18753 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
18754 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
18755 }
18756
18757 // PINSR* works with constant index.
18758 if (EltVT == MVT::i32 || EltVT == MVT::i64)
18759 return Op;
18760 }
18761
18762 return SDValue();
18763}
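// Illustrative sketch (hypothetical helper, not used by the lowering above):
// how the INSERTPS immediate described earlier is assembled. Bits [7:6] pick
// the source element, bits [5:4] pick the destination lane, and bits [3:0]
// form the zero mask; the lowering above passes IdxVal << 4, i.e. a
// destination-lane-only immediate.
static constexpr unsigned makeInsertPSImmSketch(unsigned SrcIdx, unsigned DstIdx,
                                                unsigned ZeroMask) {
  return ((SrcIdx & 0x3) << 6) | ((DstIdx & 0x3) << 4) | (ZeroMask & 0xf);
}
// Worked example: source element 3 into destination lane 2, nothing zeroed.
static_assert(makeInsertPSImmSketch(3, 2, 0x0) == 0xE0, "0xC0 | 0x20");
static_assert(makeInsertPSImmSketch(0, 1, 0x0) == (1u << 4), "IdxVal << 4 form");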
18764
18765static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
18766                                     SelectionDAG &DAG) {
18767 SDLoc dl(Op);
18768 MVT OpVT = Op.getSimpleValueType();
18769
18770  // It's always cheaper to replace an xor+movd with xorps, and it simplifies
18771  // further combines.
18772 if (X86::isZeroNode(Op.getOperand(0)))
18773 return getZeroVector(OpVT, Subtarget, DAG, dl);
18774
18775 // If this is a 256-bit vector result, first insert into a 128-bit
18776 // vector and then insert into the 256-bit vector.
18777 if (!OpVT.is128BitVector()) {
18778 // Insert into a 128-bit vector.
18779 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
18780    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
18781                                 OpVT.getVectorNumElements() / SizeFactor);
18782
18783 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
18784
18785 // Insert the 128-bit vector.
18786 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
18787 }
18788 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
18789 "Expected an SSE type!");
18790
18791  // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR, as that's what we use in
18792 // tblgen.
18793 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
18794 return Op;
18795
18796 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
18797 return DAG.getBitcast(
18798 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
18799}
18800
18801// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
18802// simple superregister reference or explicit instructions to insert
18803// the upper bits of a vector.
18804static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18805                                     SelectionDAG &DAG) {
18806 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
18807
18808 return insert1BitVector(Op, DAG, Subtarget);
18809}
18810
18811static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
18812                                      SelectionDAG &DAG) {
18813 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
18814 "Only vXi1 extract_subvectors need custom lowering");
18815
18816 SDLoc dl(Op);
18817 SDValue Vec = Op.getOperand(0);
18818 uint64_t IdxVal = Op.getConstantOperandVal(1);
18819
18820 if (IdxVal == 0) // the operation is legal
18821 return Op;
18822
18823 // Extend to natively supported kshift.
18824 Vec = widenMaskVector(Vec, false, Subtarget, DAG, dl);
18825
18826 // Shift to the LSB.
18827 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, Vec.getSimpleValueType(), Vec,
18828 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
18829
18830 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
18831 DAG.getIntPtrConstant(0, dl));
18832}
18833
18834// Returns the appropriate wrapper opcode for a global reference.
18835unsigned X86TargetLowering::getGlobalWrapperKind(
18836 const GlobalValue *GV, const unsigned char OpFlags) const {
18837 // References to absolute symbols are never PC-relative.
18838 if (GV && GV->isAbsoluteSymbolRef())
18839 return X86ISD::Wrapper;
18840
18841 // The following OpFlags under RIP-rel PIC use RIP.
18842 if (Subtarget.isPICStyleRIPRel() &&
18843 (OpFlags == X86II::MO_NO_FLAG || OpFlags == X86II::MO_COFFSTUB ||
18844 OpFlags == X86II::MO_DLLIMPORT))
18845 return X86ISD::WrapperRIP;
18846
18847 // GOTPCREL references must always use RIP.
18848 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
18849 return X86ISD::WrapperRIP;
18850
18851 return X86ISD::Wrapper;
18852}
18853
18854// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
18855// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
18856// one of the above-mentioned nodes. It has to be wrapped because otherwise
18857// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
18858// be used to form an addressing mode. These wrapped nodes will be selected
18859// into MOV32ri.
18860SDValue
18861X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
18862 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
18863
18864 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18865 // global base reg.
18866 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18867
18868 auto PtrVT = getPointerTy(DAG.getDataLayout());
18869  SDValue Result = DAG.getTargetConstantPool(
18870      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
18871 SDLoc DL(CP);
18872 Result =
18873 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18874 // With PIC, the address is actually $g + Offset.
18875 if (OpFlag) {
18876 Result =
18877 DAG.getNode(ISD::ADD, DL, PtrVT,
18878 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18879 }
18880
18881 return Result;
18882}
18883
18884SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
18885 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
18886
18887 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
18888 // global base reg.
18889 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
18890
18891 auto PtrVT = getPointerTy(DAG.getDataLayout());
18892 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
18893 SDLoc DL(JT);
18894 Result =
18895 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlag), DL, PtrVT, Result);
18896
18897 // With PIC, the address is actually $g + Offset.
18898 if (OpFlag)
18899 Result =
18900 DAG.getNode(ISD::ADD, DL, PtrVT,
18901 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
18902
18903 return Result;
18904}
18905
18906SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
18907 SelectionDAG &DAG) const {
18908 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
18909}
18910
18911SDValue
18912X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
18913 // Create the TargetBlockAddressAddress node.
18914  unsigned char OpFlags =
18915      Subtarget.classifyBlockAddressReference();
18916 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
18917 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
18918 SDLoc dl(Op);
18919 auto PtrVT = getPointerTy(DAG.getDataLayout());
18920 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
18921 Result =
18922 DAG.getNode(getGlobalWrapperKind(nullptr, OpFlags), dl, PtrVT, Result);
18923
18924 // With PIC, the address is actually $g + Offset.
18925 if (isGlobalRelativeToPICBase(OpFlags)) {
18926 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18927 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18928 }
18929
18930 return Result;
18931}
18932
18933/// Creates target global address or external symbol nodes for calls or
18934/// other uses.
18935SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
18936 bool ForCall) const {
18937 // Unpack the global address or external symbol.
18938 SDLoc dl(Op);
18939 const GlobalValue *GV = nullptr;
18940 int64_t Offset = 0;
18941 const char *ExternalSym = nullptr;
18942 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
18943 GV = G->getGlobal();
18944 Offset = G->getOffset();
18945 } else {
18946 const auto *ES = cast<ExternalSymbolSDNode>(Op);
18947 ExternalSym = ES->getSymbol();
18948 }
18949
18950 // Calculate some flags for address lowering.
18951  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
18952  unsigned char OpFlags;
18953 if (ForCall)
18954 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
18955 else
18956 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
18957 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
18958 bool NeedsLoad = isGlobalStubReference(OpFlags);
18959
18961 auto PtrVT = getPointerTy(DAG.getDataLayout());
18962  SDValue Result;
18963
18964 if (GV) {
18965 // Create a target global address if this is a global. If possible, fold the
18966 // offset into the global address reference. Otherwise, ADD it on later.
18967 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
18968 // allowed because if the address of foo is 0, the ELF R_X86_64_32
18969 // relocation will compute to a negative value, which is invalid.
18970 int64_t GlobalOffset = 0;
18971 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
18973 std::swap(GlobalOffset, Offset);
18974 }
18975 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
18976 } else {
18977 // If this is not a global address, this must be an external symbol.
18978 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
18979 }
18980
18981 // If this is a direct call, avoid the wrapper if we don't need to do any
18982 // loads or adds. This allows SDAG ISel to match direct calls.
18983 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
18984 return Result;
18985
18986 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
18987
18988 // With PIC, the address is actually $g + Offset.
18989 if (HasPICReg) {
18990 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
18991 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
18992 }
18993
18994 // For globals that require a load from a stub to get the address, emit the
18995 // load.
18996 if (NeedsLoad)
18997 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
18998                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
18999
19000 // If there was a non-zero offset that we didn't fold, create an explicit
19001 // addition for it.
19002 if (Offset != 0)
19003 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
19004 DAG.getSignedConstant(Offset, dl, PtrVT));
19005
19006 return Result;
19007}
19008
19009SDValue
19010X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
19011 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
19012}
19013
19014static SDValue GetTLSADDR(SelectionDAG &DAG, GlobalAddressSDNode *GA,
19015                          const EVT PtrVT, unsigned ReturnReg,
19016 unsigned char OperandFlags,
19017 bool LoadGlobalBaseReg = false,
19018 bool LocalDynamic = false) {
19019  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19020  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19021 SDLoc dl(GA);
19022 SDValue TGA;
19023 bool UseTLSDESC = DAG.getTarget().useTLSDESC();
19024 SDValue Chain = DAG.getEntryNode();
19025 SDValue Ret;
19026 if (LocalDynamic && UseTLSDESC) {
19027 TGA = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, OperandFlags);
19028 // Reuse existing GetTLSADDR node if we can find it.
19029 if (TGA->hasOneUse()) {
19030 // TLSDESC uses TGA.
19031 SDNode *TLSDescOp = *TGA->user_begin();
19032 assert(TLSDescOp->getOpcode() == X86ISD::TLSDESC &&
19033 "Unexpected TLSDESC DAG");
19034 // CALLSEQ_END uses TGA via a chain and glue.
19035 auto *CallSeqEndOp = TLSDescOp->getGluedUser();
19036 assert(CallSeqEndOp && CallSeqEndOp->getOpcode() == ISD::CALLSEQ_END &&
19037 "Unexpected TLSDESC DAG");
19038 // CopyFromReg uses CALLSEQ_END via a chain and glue.
19039 auto *CopyFromRegOp = CallSeqEndOp->getGluedUser();
19040 assert(CopyFromRegOp && CopyFromRegOp->getOpcode() == ISD::CopyFromReg &&
19041 "Unexpected TLSDESC DAG");
19042 Ret = SDValue(CopyFromRegOp, 0);
19043 }
19044 } else {
19045 TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19046 GA->getOffset(), OperandFlags);
19047 }
19048
19049 if (!Ret) {
19050 X86ISD::NodeType CallType = UseTLSDESC ? X86ISD::TLSDESC
19051 : LocalDynamic ? X86ISD::TLSBASEADDR
19052                                               : X86ISD::TLSADDR;
19053
19054 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19055 if (LoadGlobalBaseReg) {
19056 SDValue InGlue;
19057 Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
19058 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT),
19059 InGlue);
19060 InGlue = Chain.getValue(1);
19061 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA, InGlue});
19062 } else {
19063 Chain = DAG.getNode(CallType, dl, NodeTys, {Chain, TGA});
19064 }
19065 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), dl);
19066
19067 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
19068 MFI.setHasCalls(true);
19069
19070 SDValue Glue = Chain.getValue(1);
19071 Ret = DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
19072 }
19073
19074 if (!UseTLSDESC)
19075 return Ret;
19076
19077 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
19078 unsigned Seg = Subtarget.is64Bit() ? X86AS::FS : X86AS::GS;
19079
19080  Value *Ptr = Constant::getNullValue(PointerType::get(*DAG.getContext(), Seg));
19081  SDValue Offset =
19082      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19083                  MachinePointerInfo(Ptr));
19084  return DAG.getNode(ISD::ADD, dl, PtrVT, Ret, Offset);
19085}
19086
19087// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
19088static SDValue
19089LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19090                                const EVT PtrVT) {
19091 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD,
19092 /*LoadGlobalBaseReg=*/true);
19093}
19094
19095// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
19096static SDValue
19097LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19098                                const EVT PtrVT) {
19099 return GetTLSADDR(DAG, GA, PtrVT, X86::RAX, X86II::MO_TLSGD);
19100}
19101
19102// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
19103static SDValue
19104LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19105                                 const EVT PtrVT) {
19106 return GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSGD);
19107}
19108
19109static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
19110                                           SelectionDAG &DAG, const EVT PtrVT,
19111 bool Is64Bit, bool Is64BitLP64) {
19112 SDLoc dl(GA);
19113
19114 // Get the start address of the TLS block for this module.
19118
19119 SDValue Base;
19120 if (Is64Bit) {
19121 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
19122 Base = GetTLSADDR(DAG, GA, PtrVT, ReturnReg, X86II::MO_TLSLD,
19123 /*LoadGlobalBaseReg=*/false,
19124 /*LocalDynamic=*/true);
19125 } else {
19126 Base = GetTLSADDR(DAG, GA, PtrVT, X86::EAX, X86II::MO_TLSLDM,
19127 /*LoadGlobalBaseReg=*/true,
19128 /*LocalDynamic=*/true);
19129 }
19130
19131 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
19132 // of Base.
19133
19134 // Build x@dtpoff.
19135 unsigned char OperandFlags = X86II::MO_DTPOFF;
19136 unsigned WrapperKind = X86ISD::Wrapper;
19137 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19138 GA->getValueType(0),
19139 GA->getOffset(), OperandFlags);
19140 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19141
19142 // Add x@dtpoff with the base.
19143 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
19144}
19145
19146// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
19147static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
19148                                   const EVT PtrVT, TLSModel::Model model,
19149 bool is64Bit, bool isPIC) {
19150 SDLoc dl(GA);
19151
19152 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
19155
19156 SDValue ThreadPointer =
19157 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
19159
19160 unsigned char OperandFlags = 0;
19161 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
19162  // initial exec.
19163 unsigned WrapperKind = X86ISD::Wrapper;
19164 if (model == TLSModel::LocalExec) {
19165 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
19166 } else if (model == TLSModel::InitialExec) {
19167 if (is64Bit) {
19168 OperandFlags = X86II::MO_GOTTPOFF;
19169 WrapperKind = X86ISD::WrapperRIP;
19170 } else {
19171 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
19172 }
19173 } else {
19174 llvm_unreachable("Unexpected model");
19175 }
19176
19177 // emit "addl x@ntpoff,%eax" (local exec)
19178 // or "addl x@indntpoff,%eax" (initial exec)
19179 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
19180 SDValue TGA =
19181 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
19182 GA->getOffset(), OperandFlags);
19183 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
19184
19185 if (model == TLSModel::InitialExec) {
19186 if (isPIC && !is64Bit) {
19187 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
19188 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19189 Offset);
19190 }
19191
19192 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
19194 }
19195
19196 // The address of the thread local variable is the add of the thread
19197 // pointer with the offset of the variable.
19198 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
19199}
19200
19201SDValue
19202X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
19203
19204 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
19205
19206 if (DAG.getTarget().useEmulatedTLS())
19207 return LowerToTLSEmulatedModel(GA, DAG);
19208
19209 const GlobalValue *GV = GA->getGlobal();
19210 auto PtrVT = getPointerTy(DAG.getDataLayout());
19211 bool PositionIndependent = isPositionIndependent();
19212
19213 if (Subtarget.isTargetELF()) {
19214 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
19215 switch (model) {
19216    case TLSModel::GeneralDynamic:
19217      if (Subtarget.is64Bit()) {
19218 if (Subtarget.isTarget64BitLP64())
19219 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
19220 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
19221 }
19222 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
19223    case TLSModel::LocalDynamic:
19224      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
19225 Subtarget.isTarget64BitLP64());
19226    case TLSModel::InitialExec:
19227    case TLSModel::LocalExec:
19228      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
19229 PositionIndependent);
19230 }
19231 llvm_unreachable("Unknown TLS model.");
19232 }
19233
19234 if (Subtarget.isTargetDarwin()) {
19235 // Darwin only has one model of TLS. Lower to that.
19236 unsigned char OpFlag = 0;
19237 unsigned WrapperKind = 0;
19238
19239 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
19240 // global base reg.
19241 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
19242 if (PIC32) {
19243 OpFlag = X86II::MO_TLVP_PIC_BASE;
19244 WrapperKind = X86ISD::Wrapper;
19245 } else {
19246 OpFlag = X86II::MO_TLVP;
19247 WrapperKind = X86ISD::WrapperRIP;
19248 }
19249 SDLoc DL(Op);
19250    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
19251                                                GA->getValueType(0),
19252 GA->getOffset(), OpFlag);
19253 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
19254
19255 // With PIC32, the address is actually $g + Offset.
19256 if (PIC32)
19257 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
19258 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
19259 Offset);
19260
19261 // Lowering the machine isd will make sure everything is in the right
19262 // location.
19263 SDValue Chain = DAG.getEntryNode();
19264 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19265 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
19266 SDValue Args[] = { Chain, Offset };
19267 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
19268 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
19269
19270 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
19271    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19272    MFI.setAdjustsStack(true);
19273
19274 // And our return value (tls address) is in the standard call return value
19275 // location.
19276 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
19277 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
19278 }
19279
19280 if (Subtarget.isOSWindows()) {
19281 // Just use the implicit TLS architecture
19282 // Need to generate something similar to:
19283 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
19284 // ; from TEB
19285 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
19286 // mov rcx, qword [rdx+rcx*8]
19287 // mov eax, .tls$:tlsvar
19288 // [rax+rcx] contains the address
19289 // Windows 64bit: gs:0x58
19290 // Windows 32bit: fs:__tls_array
19291
19292 SDLoc dl(GA);
19293 SDValue Chain = DAG.getEntryNode();
19294
19295 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
19296 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
19297 // use its literal value of 0x2C.
19298    Value *Ptr = Constant::getNullValue(
19299        Subtarget.is64Bit() ? PointerType::get(*DAG.getContext(), X86AS::GS)
19300                            : PointerType::get(*DAG.getContext(), X86AS::FS));
19301
19302 SDValue TlsArray = Subtarget.is64Bit()
19303 ? DAG.getIntPtrConstant(0x58, dl)
19304 : (Subtarget.isTargetWindowsGNU()
19305 ? DAG.getIntPtrConstant(0x2C, dl)
19306 : DAG.getExternalSymbol("_tls_array", PtrVT));
19307
19308    SDValue ThreadPointer =
19309        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
19310
19311 SDValue res;
19312    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
19313      res = ThreadPointer;
19314 } else {
19315 // Load the _tls_index variable
19316 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
19317 if (Subtarget.is64Bit())
19318 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
19319 MachinePointerInfo(), MVT::i32);
19320 else
19321 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
19322
19323 const DataLayout &DL = DAG.getDataLayout();
19324 SDValue Scale =
19325 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
19326 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
19327
19328 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
19329 }
19330
19331 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
19332
19333 // Get the offset of start of .tls section
19334 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
19335 GA->getValueType(0),
19336                                             X86II::MO_SECREL);
19337    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
19338
19339 // The address of the thread local variable is the add of the thread
19340 // pointer with the offset of the variable.
19341 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
19342 }
19343
19344 llvm_unreachable("TLS not implemented for this target.");
19345}
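// Illustrative sketch of the Windows implicit-TLS address computation that the
// branch above emits (generic case, not the thread-pointer shortcut). All names
// are hypothetical stand-ins: TlsPointer models ThreadLocalStoragePointer
// (gs:[0x58] / fs:[__tls_array]), TlsIndex models the loaded _tls_index value,
// and SecRelOffset models the variable's SECREL offset within the .tls section.
static inline char *windowsTlsAddressSketch(char **TlsPointer, unsigned TlsIndex,
                                            unsigned long long SecRelOffset) {
  char *TlsBlock = TlsPointer[TlsIndex]; // mov rcx, qword [rdx+rcx*8]
  return TlsBlock + SecRelOffset;        // [rax+rcx] is the variable's address
}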
19346
19347bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
19348  if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
19349 const TargetMachine &TM = getTargetMachine();
19350 TLSModel::Model Model = TM.getTLSModel(&GV);
19351 switch (Model) {
19352    case TLSModel::LocalExec:
19353    case TLSModel::InitialExec:
19354      // We can include the %fs segment register in addressing modes.
19355 return true;
19356    case TLSModel::GeneralDynamic:
19357    case TLSModel::LocalDynamic:
19358      // These models do not result in %fs relative addresses unless
19359      // TLS descriptors are used.
19360      //
19361      // Even in the case of TLS descriptors, we currently have no way to model
19362      // the difference between the %fs access and the computation needed for
19363      // the offset, and returning `true` for TLS-desc currently duplicates
19364      // both, which is detrimental :-/
19365 return false;
19366 }
19367 }
19368 return false;
19369}
19370
19371/// Lower SRA_PARTS and friends, which return two i32 values
19372/// and take a 2 x i32 value to shift plus a shift amount.
19373/// TODO: Can this be moved to general expansion code?
19374static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
19375  SDValue Lo, Hi;
19376 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
19377 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
19378}
19379
19380// Try to use a packed vector operation to handle i64 on 32-bit targets when
19381// AVX512DQ is enabled.
19382static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, const SDLoc &dl,
19383                                        SelectionDAG &DAG,
19384 const X86Subtarget &Subtarget) {
19385 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19386 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19387 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19388 Op.getOpcode() == ISD::UINT_TO_FP) &&
19389 "Unexpected opcode!");
19390 bool IsStrict = Op->isStrictFPOpcode();
19391 unsigned OpNo = IsStrict ? 1 : 0;
19392 SDValue Src = Op.getOperand(OpNo);
19393 MVT SrcVT = Src.getSimpleValueType();
19394 MVT VT = Op.getSimpleValueType();
19395
19396 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
19397 (VT != MVT::f32 && VT != MVT::f64))
19398 return SDValue();
19399
19400 // Pack the i64 into a vector, do the operation and extract.
19401
19402 // Using 256-bit to ensure result is 128-bits for f32 case.
19403 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
19404 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
19405 MVT VecVT = MVT::getVectorVT(VT, NumElts);
19406
19407 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
19408 if (IsStrict) {
19409 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
19410 {Op.getOperand(0), InVec});
19411 SDValue Chain = CvtVec.getValue(1);
19412 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19413 DAG.getIntPtrConstant(0, dl));
19414 return DAG.getMergeValues({Value, Chain}, dl);
19415 }
19416
19417 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
19418
19419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19420 DAG.getIntPtrConstant(0, dl));
19421}
19422
19423// Try to use a packed vector operation to handle i64 on 32-bit targets.
19424static SDValue LowerI64IntToFP16(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
19425                                 const X86Subtarget &Subtarget) {
19426 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
19427 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
19428 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
19429 Op.getOpcode() == ISD::UINT_TO_FP) &&
19430 "Unexpected opcode!");
19431 bool IsStrict = Op->isStrictFPOpcode();
19432 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19433 MVT SrcVT = Src.getSimpleValueType();
19434 MVT VT = Op.getSimpleValueType();
19435
19436 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
19437 return SDValue();
19438
19439 // Pack the i64 into a vector, do the operation and extract.
19440
19441 assert(Subtarget.hasFP16() && "Expected FP16");
19442
19443 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
19444 if (IsStrict) {
19445 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
19446 {Op.getOperand(0), InVec});
19447 SDValue Chain = CvtVec.getValue(1);
19448 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19449 DAG.getIntPtrConstant(0, dl));
19450 return DAG.getMergeValues({Value, Chain}, dl);
19451 }
19452
19453 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
19454
19455 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
19456 DAG.getIntPtrConstant(0, dl));
19457}
19458
19459static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
19460 const X86Subtarget &Subtarget) {
19461 switch (Opcode) {
19462 case ISD::SINT_TO_FP:
19463 // TODO: Handle wider types with AVX/AVX512.
19464 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
19465 return false;
19466 // CVTDQ2PS or (V)CVTDQ2PD
19467 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
19468
19469 case ISD::UINT_TO_FP:
19470 // TODO: Handle wider types and i64 elements.
19471 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
19472 return false;
19473 // VCVTUDQ2PS or VCVTUDQ2PD
19474 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
19475
19476 default:
19477 return false;
19478 }
19479}
19480
19481/// Given a scalar cast operation that is extracted from a vector, try to
19482/// vectorize the cast op followed by extraction. This will avoid an expensive
19483/// round-trip between XMM and GPR.
19484static SDValue vectorizeExtractedCast(SDValue Cast, const SDLoc &DL,
19485                                      SelectionDAG &DAG,
19486 const X86Subtarget &Subtarget) {
19487 // TODO: This could be enhanced to handle smaller integer types by peeking
19488 // through an extend.
19489 SDValue Extract = Cast.getOperand(0);
19490 MVT DestVT = Cast.getSimpleValueType();
19491 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19492 !isa<ConstantSDNode>(Extract.getOperand(1)))
19493 return SDValue();
19494
19495 // See if we have a 128-bit vector cast op for this type of cast.
19496 SDValue VecOp = Extract.getOperand(0);
19497 MVT FromVT = VecOp.getSimpleValueType();
19498 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
19499 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
19500 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
19501 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
19502 return SDValue();
19503
19504 // If we are extracting from a non-zero element, first shuffle the source
19505 // vector to allow extracting from element zero.
19506 if (!isNullConstant(Extract.getOperand(1))) {
19507 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
19508 Mask[0] = Extract.getConstantOperandVal(1);
19509 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
19510 }
19511 // If the source vector is wider than 128-bits, extract the low part. Do not
19512 // create an unnecessarily wide vector cast op.
19513 if (FromVT != Vec128VT)
19514 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
19515
19516 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
19517 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
19518 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
19519 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
19520 DAG.getIntPtrConstant(0, DL));
19521}
19522
19523/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
19524/// try to vectorize the cast ops. This will avoid an expensive round-trip
19525/// between XMM and GPR.
19526static SDValue lowerFPToIntToFP(SDValue CastToFP, const SDLoc &DL,
19527 SelectionDAG &DAG,
19528 const X86Subtarget &Subtarget) {
19529 // TODO: Allow FP_TO_UINT.
19530 SDValue CastToInt = CastToFP.getOperand(0);
19531 MVT VT = CastToFP.getSimpleValueType();
19532 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
19533 return SDValue();
19534
19535 MVT IntVT = CastToInt.getSimpleValueType();
19536 SDValue X = CastToInt.getOperand(0);
19537 MVT SrcVT = X.getSimpleValueType();
19538 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
19539 return SDValue();
19540
19541 // See if we have 128-bit vector cast instructions for this type of cast.
19542 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
19543 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
19544 IntVT != MVT::i32)
19545 return SDValue();
19546
19547 unsigned SrcSize = SrcVT.getSizeInBits();
19548 unsigned IntSize = IntVT.getSizeInBits();
19549 unsigned VTSize = VT.getSizeInBits();
19550 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
19551 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
19552 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
19553
19554 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
19555 unsigned ToIntOpcode =
19556 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
19557 unsigned ToFPOpcode =
19558 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
19559
19560 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
19561 //
19562 // We are not defining the high elements (for example, zero them) because
19563 // that could nullify any performance advantage that we hoped to gain from
19564 // this vector op hack. We do not expect any adverse effects (like denorm
19565 // penalties) with cast ops.
19566 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
19567 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
19568 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
19569 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
19570 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
19571}
19572
19573static SDValue lowerINT_TO_FP_vXi64(SDValue Op, const SDLoc &DL,
19574                                    SelectionDAG &DAG,
19575 const X86Subtarget &Subtarget) {
19576 bool IsStrict = Op->isStrictFPOpcode();
19577 MVT VT = Op->getSimpleValueType(0);
19578 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
19579
19580 if (Subtarget.hasDQI()) {
19581 assert(!Subtarget.hasVLX() && "Unexpected features");
19582
19583 assert((Src.getSimpleValueType() == MVT::v2i64 ||
19584 Src.getSimpleValueType() == MVT::v4i64) &&
19585 "Unsupported custom type");
19586
19587 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
19588 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
19589 "Unexpected VT!");
19590 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
19591
19592 // Need to concat with zero vector for strict fp to avoid spurious
19593 // exceptions.
19594 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
19595 : DAG.getUNDEF(MVT::v8i64);
19596 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
19597 DAG.getIntPtrConstant(0, DL));
19598 SDValue Res, Chain;
19599 if (IsStrict) {
19600 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
19601 {Op->getOperand(0), Src});
19602 Chain = Res.getValue(1);
19603 } else {
19604 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
19605 }
19606
19607 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19608 DAG.getIntPtrConstant(0, DL));
19609
19610 if (IsStrict)
19611 return DAG.getMergeValues({Res, Chain}, DL);
19612 return Res;
19613 }
19614
19615 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
19616 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
19617 if (VT != MVT::v4f32 || IsSigned)
19618 return SDValue();
19619
19620 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
19621 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
19622 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
19623 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
19624 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
19625 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
19626 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
19627 SmallVector<SDValue, 4> SignCvts(4);
19628 SmallVector<SDValue, 4> Chains(4);
19629 for (int i = 0; i != 4; ++i) {
19630 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
19631 DAG.getIntPtrConstant(i, DL));
19632 if (IsStrict) {
19633 SignCvts[i] =
19634 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
19635 {Op.getOperand(0), Elt});
19636 Chains[i] = SignCvts[i].getValue(1);
19637 } else {
19638 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
19639 }
19640 }
19641 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
19642
19643 SDValue Slow, Chain;
19644 if (IsStrict) {
19645 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
19646 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
19647 {Chain, SignCvt, SignCvt});
19648 Chain = Slow.getValue(1);
19649 } else {
19650 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
19651 }
19652
19653 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
19654 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
19655
19656 if (IsStrict)
19657 return DAG.getMergeValues({Cvt, Chain}, DL);
19658
19659 return Cvt;
19660}
19661
19662static SDValue promoteXINT_TO_FP(SDValue Op, const SDLoc &dl,
19663                                 SelectionDAG &DAG) {
19664 bool IsStrict = Op->isStrictFPOpcode();
19665 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
19666 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19667 MVT VT = Op.getSimpleValueType();
19668 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
19669
19670 SDValue Rnd = DAG.getIntPtrConstant(0, dl, /*isTarget=*/true);
19671 if (IsStrict)
19672 return DAG.getNode(
19673 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
19674 {Chain,
19675 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
19676 Rnd});
19677 return DAG.getNode(ISD::FP_ROUND, dl, VT,
19678 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
19679}
19680
19681static bool isLegalConversion(MVT VT, bool IsSigned,
19682 const X86Subtarget &Subtarget) {
19683 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
19684 return true;
19685 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
19686 return true;
19687 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
19688 return true;
19689 if (Subtarget.useAVX512Regs()) {
19690 if (VT == MVT::v16i32)
19691 return true;
19692 if (VT == MVT::v8i64 && Subtarget.hasDQI())
19693 return true;
19694 }
19695 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
19696 (VT == MVT::v2i64 || VT == MVT::v4i64))
19697 return true;
19698 return false;
19699}
19700
19701SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
19702 SelectionDAG &DAG) const {
19703 bool IsStrict = Op->isStrictFPOpcode();
19704 unsigned OpNo = IsStrict ? 1 : 0;
19705 SDValue Src = Op.getOperand(OpNo);
19706 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
19707 MVT SrcVT = Src.getSimpleValueType();
19708 MVT VT = Op.getSimpleValueType();
19709 SDLoc dl(Op);
19710
19711 if (isSoftF16(VT, Subtarget))
19712 return promoteXINT_TO_FP(Op, dl, DAG);
19713 else if (isLegalConversion(SrcVT, true, Subtarget))
19714 return Op;
19715
19716 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
19717 return LowerWin64_INT128_TO_FP(Op, DAG);
19718
19719 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
19720 return Extract;
19721
19722 if (SDValue R = lowerFPToIntToFP(Op, dl, DAG, Subtarget))
19723 return R;
19724
19725 if (SrcVT.isVector()) {
19726 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
19727      // Note: Since v2f64 is a legal type, we don't need to zero extend the
19728 // source for strict FP.
19729 if (IsStrict)
19730 return DAG.getNode(
19731 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
19732 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19733 DAG.getUNDEF(SrcVT))});
19734 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
19735 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
19736 DAG.getUNDEF(SrcVT)));
19737 }
19738 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
19739 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
19740
19741 return SDValue();
19742 }
19743
19744 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
19745 "Unknown SINT_TO_FP to lower!");
19746
19747 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
19748
19749 // These are really Legal; return the operand so the caller accepts it as
19750 // Legal.
19751 if (SrcVT == MVT::i32 && UseSSEReg)
19752 return Op;
19753 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
19754 return Op;
19755
19756 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
19757 return V;
19758 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
19759 return V;
19760
19761 // SSE doesn't have an i16 conversion so we need to promote.
19762 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
19763 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
19764 if (IsStrict)
19765 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
19766 {Chain, Ext});
19767
19768 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
19769 }
19770
19771 if (VT == MVT::f128 || !Subtarget.hasX87())
19772 return SDValue();
19773
19774 SDValue ValueToStore = Src;
19775 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
19776 // Bitcasting to f64 here allows us to do a single 64-bit store from
19777 // an SSE register, avoiding the store forwarding penalty that would come
19778 // with two 32-bit stores.
19779 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
19780
19781 unsigned Size = SrcVT.getStoreSize();
19782 Align Alignment(Size);
19783  MachineFunction &MF = DAG.getMachineFunction();
19784  auto PtrVT = getPointerTy(MF.getDataLayout());
19785 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
19786 MachinePointerInfo MPI =
19787      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
19788  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19789 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
19790 std::pair<SDValue, SDValue> Tmp =
19791 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
19792
19793 if (IsStrict)
19794 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
19795
19796 return Tmp.first;
19797}
19798
19799std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
19800 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
19801 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
19802 // Build the FILD
19803 SDVTList Tys;
19804 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
19805 if (useSSE)
19806 Tys = DAG.getVTList(MVT::f80, MVT::Other);
19807 else
19808 Tys = DAG.getVTList(DstVT, MVT::Other);
19809
19810 SDValue FILDOps[] = {Chain, Pointer};
19811 SDValue Result =
19812 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
19813 Alignment, MachineMemOperand::MOLoad);
19814 Chain = Result.getValue(1);
19815
19816 if (useSSE) {
19818 unsigned SSFISize = DstVT.getStoreSize();
19819 int SSFI =
19820 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
19821 auto PtrVT = getPointerTy(MF.getDataLayout());
19822 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
19823 Tys = DAG.getVTList(MVT::Other);
19824 SDValue FSTOps[] = {Chain, Result, StackSlot};
19825    MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
19826        MachinePointerInfo::getFixedStack(MF, SSFI),
19827        MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
19828
19829 Chain =
19830 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
19831 Result = DAG.getLoad(
19832 DstVT, DL, Chain, StackSlot,
19833        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
19834    Chain = Result.getValue(1);
19835 }
19836
19837 return { Result, Chain };
19838}
19839
19840/// Horizontal vector math instructions may be slower than normal math with
19841/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
19842/// implementation, and likely shuffle complexity of the alternate sequence.
19843static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
19844 const X86Subtarget &Subtarget) {
19845 bool IsOptimizingSize = DAG.shouldOptForSize();
19846 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
19847 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
19848}
19849
19850/// 64-bit unsigned integer to double expansion.
19851static SDValue LowerUINT_TO_FP_i64(SDValue Op, const SDLoc &dl,
19852                                   SelectionDAG &DAG,
19853 const X86Subtarget &Subtarget) {
19854  // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
19855  // when converting 0 while rounding toward negative infinity. The caller will
19856  // fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
19857 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
19858  // This algorithm is not obvious. Here is what we're trying to output:
19859 /*
19860 movq %rax, %xmm0
19861 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
19862 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
19863 #ifdef __SSE3__
19864 haddpd %xmm0, %xmm0
19865 #else
19866 pshufd $0x4e, %xmm0, %xmm1
19867 addpd %xmm1, %xmm0
19868 #endif
19869 */
19870
19871 LLVMContext *Context = DAG.getContext();
19872
19873 // Build some magic constants.
19874 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
19875 Constant *C0 = ConstantDataVector::get(*Context, CV0);
19876 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
19877 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
19878
19879  SmallVector<Constant*,2> CV1;
19880  CV1.push_back(
19881 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19882 APInt(64, 0x4330000000000000ULL))));
19883 CV1.push_back(
19884 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
19885 APInt(64, 0x4530000000000000ULL))));
19886 Constant *C1 = ConstantVector::get(CV1);
19887 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
19888
19889 // Load the 64-bit value into an XMM register.
19890 SDValue XR1 =
19891 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
19892 SDValue CLod0 = DAG.getLoad(
19893 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
19894      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19895  SDValue Unpck1 =
19896 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
19897
19898 SDValue CLod1 = DAG.getLoad(
19899 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
19900      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
19901  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
19902 // TODO: Are there any fast-math-flags to propagate here?
19903 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
19904 SDValue Result;
19905
19906 if (Subtarget.hasSSE3() &&
19907 shouldUseHorizontalOp(true, DAG, Subtarget)) {
19908 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
19909 } else {
19910 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
19911 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
19912 }
19913 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
19914 DAG.getIntPtrConstant(0, dl));
19915 return Result;
19916}
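// Scalar sketch of the magic-constant expansion above, valid for
// round-to-nearest only (hence the strict-fp restriction). Pasting the low and
// high 32-bit halves into the mantissas of 2^52 and 2^84 yields exactly
// 2^52 + lo32 and 2^84 + hi32 * 2^32; subtracting the biases and summing the
// two halves (the subpd plus haddpd/addpd above) reconstructs the value with a
// single final rounding. The helper name is hypothetical.
static inline double uint64ToDoubleSketch(uint64_t X) {
  double Lo = llvm::bit_cast<double>((X & 0xffffffffULL) | 0x4330000000000000ULL);
  double Hi = llvm::bit_cast<double>((X >> 32) | 0x4530000000000000ULL);
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52);
}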
19917
19918/// 32-bit unsigned integer to float expansion.
19919static SDValue LowerUINT_TO_FP_i32(SDValue Op, const SDLoc &dl,
19920                                   SelectionDAG &DAG,
19921 const X86Subtarget &Subtarget) {
19922 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
19923 // FP constant to bias correct the final result.
19924 SDValue Bias = DAG.getConstantFP(
19925 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
19926
19927 // Load the 32-bit value into an XMM register.
19928 SDValue Load =
19929 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
19930
19931 // Zero out the upper parts of the register.
19932 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
19933
19934 // Or the load with the bias.
19935 SDValue Or = DAG.getNode(
19936 ISD::OR, dl, MVT::v2i64,
19937 DAG.getBitcast(MVT::v2i64, Load),
19938 DAG.getBitcast(MVT::v2i64,
19939 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
19940 Or =
19941 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
19942 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
19943
19944 if (Op.getNode()->isStrictFPOpcode()) {
19945 // Subtract the bias.
19946 // TODO: Are there any fast-math-flags to propagate here?
19947 SDValue Chain = Op.getOperand(0);
19948 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
19949 {Chain, Or, Bias});
19950
19951 if (Op.getValueType() == Sub.getValueType())
19952 return Sub;
19953
19954 // Handle final rounding.
19955 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
19956 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
19957
19958 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
19959 }
19960
19961 // Subtract the bias.
19962 // TODO: Are there any fast-math-flags to propagate here?
19963 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
19964
19965 // Handle final rounding.
19966 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
19967}
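// Scalar sketch of the bias trick above, assuming round-to-nearest. OR-ing the
// 32-bit value into the low mantissa bits of 2^52 produces exactly 2^52 + X as
// a double, so subtracting the bias recovers X without rounding error. The
// helper name is hypothetical.
static inline double uint32ToDoubleSketch(uint32_t X) {
  double Biased = llvm::bit_cast<double>(0x4330000000000000ULL | uint64_t(X));
  return Biased - 0x1.0p52; // exact: any 32-bit value fits in the 52-bit mantissa
}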
19968
19969static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, const SDLoc &DL,
19970                                     SelectionDAG &DAG,
19971 const X86Subtarget &Subtarget) {
19972 if (Op.getSimpleValueType() != MVT::v2f64)
19973 return SDValue();
19974
19975 bool IsStrict = Op->isStrictFPOpcode();
19976
19977 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
19978 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
19979
19980 if (Subtarget.hasAVX512()) {
19981 if (!Subtarget.hasVLX()) {
19982 // Let generic type legalization widen this.
19983 if (!IsStrict)
19984 return SDValue();
19985 // Otherwise pad the integer input with 0s and widen the operation.
19986 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19987 DAG.getConstant(0, DL, MVT::v2i32));
19988 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
19989 {Op.getOperand(0), N0});
19990 SDValue Chain = Res.getValue(1);
19991 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
19992 DAG.getIntPtrConstant(0, DL));
19993 return DAG.getMergeValues({Res, Chain}, DL);
19994 }
19995
19996 // Legalize to v4i32 type.
19997 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
19998 DAG.getUNDEF(MVT::v2i32));
19999 if (IsStrict)
20000 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
20001 {Op.getOperand(0), N0});
20002 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
20003 }
20004
20005  // Zero extend to v2i64, OR with the floating point representation of 2^52.
20006  // This gives us the floating point equivalent of 2^52 + the i32 integer,
20007  // since double has 52 bits of mantissa. Then subtract 2^52 in floating
20008  // point, leaving just our i32 integers in double format.
20009 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
20010 SDValue VBias = DAG.getConstantFP(
20011 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
20012 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
20013 DAG.getBitcast(MVT::v2i64, VBias));
20014 Or = DAG.getBitcast(MVT::v2f64, Or);
20015
20016 if (IsStrict)
20017 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
20018 {Op.getOperand(0), Or, VBias});
20019 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
20020}
20021
20022static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, const SDLoc &DL,
20023                                     SelectionDAG &DAG,
20024 const X86Subtarget &Subtarget) {
20025 bool IsStrict = Op->isStrictFPOpcode();
20026 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
20027 MVT VecIntVT = V.getSimpleValueType();
20028 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
20029 "Unsupported custom type");
20030
20031 if (Subtarget.hasAVX512()) {
20032 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
20033 assert(!Subtarget.hasVLX() && "Unexpected features");
20034 MVT VT = Op->getSimpleValueType(0);
20035
20036 // v8i32->v8f64 is legal with AVX512 so just return it.
20037 if (VT == MVT::v8f64)
20038 return Op;
20039
20040 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
20041 "Unexpected VT!");
20042 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
20043 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
20044 // Need to concat with zero vector for strict fp to avoid spurious
20045 // exceptions.
20046 SDValue Tmp =
20047 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
20048 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
20049 DAG.getIntPtrConstant(0, DL));
20050 SDValue Res, Chain;
20051 if (IsStrict) {
20052 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
20053 {Op->getOperand(0), V});
20054 Chain = Res.getValue(1);
20055 } else {
20056 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
20057 }
20058
20059 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
20060 DAG.getIntPtrConstant(0, DL));
20061
20062 if (IsStrict)
20063 return DAG.getMergeValues({Res, Chain}, DL);
20064 return Res;
20065 }
20066
20067 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
20068 Op->getSimpleValueType(0) == MVT::v4f64) {
20069 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
20070 Constant *Bias = ConstantFP::get(
20071 *DAG.getContext(),
20072 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
20073 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
20074 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
20075 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
20076 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
20077 SDValue VBias = DAG.getMemIntrinsicNode(
20078 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
20079        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
20080        Align(8), MachineMemOperand::MOLoad);
20081
20082 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
20083 DAG.getBitcast(MVT::v4i64, VBias));
20084 Or = DAG.getBitcast(MVT::v4f64, Or);
20085
20086 if (IsStrict)
20087 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
20088 {Op.getOperand(0), Or, VBias});
20089 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
20090 }
20091
20092 // The algorithm is the following:
20093 // #ifdef __SSE4_1__
20094 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20095 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20096 // (uint4) 0x53000000, 0xaa);
20097 // #else
20098 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20099 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20100 // #endif
20101 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20102 // return (float4) lo + fhi;
20103
20104 bool Is128 = VecIntVT == MVT::v4i32;
20105 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
20106  // If we convert to something other than the supported type, e.g., to v4f64,
20107 // abort early.
20108 if (VecFloatVT != Op->getSimpleValueType(0))
20109 return SDValue();
20110
20111  // In the #ifdef/#else code, we have in common:
20112 // - The vector of constants:
20113 // -- 0x4b000000
20114 // -- 0x53000000
20115 // - A shift:
20116 // -- v >> 16
20117
20118 // Create the splat vector for 0x4b000000.
20119 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
20120 // Create the splat vector for 0x53000000.
20121 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
20122
20123 // Create the right shift.
20124 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
20125 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
20126
20127 SDValue Low, High;
20128 if (Subtarget.hasSSE41()) {
20129 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
20130 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
20131 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
20132 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
20133 // Low will be bitcasted right away, so do not bother bitcasting back to its
20134 // original type.
20135 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
20136 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20137 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
20138 // (uint4) 0x53000000, 0xaa);
20139 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
20140 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
20141 // High will be bitcasted right away, so do not bother bitcasting back to
20142 // its original type.
20143 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
20144 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
20145 } else {
20146 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
20147 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
20148 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
20149 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
20150
20151 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
20152 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
20153 }
20154
20155 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
20156 SDValue VecCstFSub = DAG.getConstantFP(
20157 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
20158
20159 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
20160 // NOTE: By using fsub of a positive constant instead of fadd of a negative
20161 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
20162 // enabled. See PR24512.
20163 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
20164 // TODO: Are there any fast-math-flags to propagate here?
20165 // (float4) lo;
20166 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
20167 // return (float4) lo + fhi;
20168 if (IsStrict) {
20169 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
20170 {Op.getOperand(0), HighBitcast, VecCstFSub});
20171 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
20172 {FHigh.getValue(1), LowBitcast, FHigh});
20173 }
20174
20175 SDValue FHigh =
20176 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
20177 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
20178}
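// Per-lane sketch of the lo/hi split implemented above, assuming
// round-to-nearest. The low 16 bits are planted in the mantissa of 2^23
// (0x4b000000) and the high 16 bits in the mantissa of 2^39 (0x53000000);
// removing the combined bias (0x1.0p39f + 0x1.0p23f) is exact, so only the
// final addition rounds. The helper name is hypothetical.
static inline float uint32ToFloatSketch(uint32_t V) {
  float Lo = llvm::bit_cast<float>(0x4b000000u | (V & 0xffffu)); // 2^23 + lo16
  float Hi = llvm::bit_cast<float>(0x53000000u | (V >> 16));     // 2^39 + hi16*2^16
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);
  return Lo + FHi;
}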
20179
20180static SDValue lowerUINT_TO_FP_vec(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20181                                   const X86Subtarget &Subtarget) {
20182 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
20183 SDValue N0 = Op.getOperand(OpNo);
20184 MVT SrcVT = N0.getSimpleValueType();
20185
20186 switch (SrcVT.SimpleTy) {
20187 default:
20188 llvm_unreachable("Custom UINT_TO_FP is not supported!");
20189 case MVT::v2i32:
20190 return lowerUINT_TO_FP_v2i32(Op, dl, DAG, Subtarget);
20191 case MVT::v4i32:
20192 case MVT::v8i32:
20193 return lowerUINT_TO_FP_vXi32(Op, dl, DAG, Subtarget);
20194 case MVT::v2i64:
20195 case MVT::v4i64:
20196 return lowerINT_TO_FP_vXi64(Op, dl, DAG, Subtarget);
20197 }
20198}
20199
20200SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
20201 SelectionDAG &DAG) const {
20202 bool IsStrict = Op->isStrictFPOpcode();
20203 unsigned OpNo = IsStrict ? 1 : 0;
20204 SDValue Src = Op.getOperand(OpNo);
20205 SDLoc dl(Op);
20206 auto PtrVT = getPointerTy(DAG.getDataLayout());
20207 MVT SrcVT = Src.getSimpleValueType();
20208 MVT DstVT = Op->getSimpleValueType(0);
20209 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20210
20211 // Bail out when we don't have native conversion instructions.
20212 if (DstVT == MVT::f128)
20213 return SDValue();
20214
20215 if (isSoftF16(DstVT, Subtarget))
20216 return promoteXINT_TO_FP(Op, dl, DAG);
20217 else if (isLegalConversion(SrcVT, false, Subtarget))
20218 return Op;
20219
20220 if (DstVT.isVector())
20221 return lowerUINT_TO_FP_vec(Op, dl, DAG, Subtarget);
20222
20223 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
20224 return LowerWin64_INT128_TO_FP(Op, DAG);
20225
20226 if (SDValue Extract = vectorizeExtractedCast(Op, dl, DAG, Subtarget))
20227 return Extract;
20228
20229 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
20230 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
20231 // Conversions from unsigned i32 to f32/f64 are legal,
20232 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
20233 return Op;
20234 }
20235
20236 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
20237 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
20238 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
20239 if (IsStrict)
20240 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
20241 {Chain, Src});
20242 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
20243 }
20244
20245 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, dl, DAG, Subtarget))
20246 return V;
20247 if (SDValue V = LowerI64IntToFP16(Op, dl, DAG, Subtarget))
20248 return V;
20249
20250 // The transform for i64->f64 isn't correct for 0 when rounding to negative
20251 // infinity. It produces -0.0, so disable under strictfp.
20252 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
20253 !IsStrict)
20254 return LowerUINT_TO_FP_i64(Op, dl, DAG, Subtarget);
20255 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
20256  // negative infinity, so disable it under strictfp and use FILD instead.
20257 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
20258 !IsStrict)
20259 return LowerUINT_TO_FP_i32(Op, dl, DAG, Subtarget);
20260 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
20261 (DstVT == MVT::f32 || DstVT == MVT::f64))
20262 return SDValue();
20263
20264 // Make a 64-bit buffer, and use it to build an FILD.
20265 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
20266 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
20267 Align SlotAlign(8);
20268 MachinePointerInfo MPI =
20269      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
20270  if (SrcVT == MVT::i32) {
20271 SDValue OffsetSlot =
20272 DAG.getMemBasePlusOffset(StackSlot, TypeSize::getFixed(4), dl);
20273 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
20274 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
20275 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
20276 std::pair<SDValue, SDValue> Tmp =
20277 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
20278 if (IsStrict)
20279 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
20280
20281 return Tmp.first;
20282 }
20283
20284 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
20285 SDValue ValueToStore = Src;
20286 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
20287 // Bitcasting to f64 here allows us to do a single 64-bit store from
20288 // an SSE register, avoiding the store forwarding penalty that would come
20289 // with two 32-bit stores.
20290 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
20291 }
20292 SDValue Store =
20293 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
20294 // For i64 source, we need to add the appropriate power of 2 if the input
20295 // was negative. We must be careful to do the computation in x87 extended
20296 // precision, not in SSE.
20297 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20298 SDValue Ops[] = {Store, StackSlot};
20299 SDValue Fild =
20300 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
20301 SlotAlign, MachineMemOperand::MOLoad);
20302 Chain = Fild.getValue(1);
20303
20304 // Check whether the sign bit is set.
20305 SDValue SignSet = DAG.getSetCC(
20306 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
20307 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
20308
20309 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
20310 APInt FF(64, 0x5F80000000000000ULL);
20311 SDValue FudgePtr =
20312 DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF), PtrVT);
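  // Note (added for exposition): 0x5F800000 is 2^64 as an IEEE-754 float. If the
  // unsigned source had its sign bit set, FILD interpreted it as value - 2^64,
  // so adding this fudge factor in x87 precision restores the intended value;
  // otherwise 0.0f is added and nothing changes.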
20313 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
20314
20315 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
20316 SDValue Zero = DAG.getIntPtrConstant(0, dl);
20317 SDValue Four = DAG.getIntPtrConstant(4, dl);
20318 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
20319 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
20320
20321 // Load the value out, extending it from f32 to f80.
20322 SDValue Fudge = DAG.getExtLoad(
20323 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
20324      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
20325      CPAlignment);
20326 Chain = Fudge.getValue(1);
20327 // Extend everything to 80 bits to force it to be done on x87.
20328 // TODO: Are there any fast-math-flags to propagate here?
20329 if (IsStrict) {
20330 unsigned Opc = ISD::STRICT_FADD;
20331    // Windows needs the precision control changed to 80 bits around this add.
20332 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20333      Opc = X86ISD::STRICT_FP80_ADD;
20334
20335 SDValue Add =
20336 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
20337 // STRICT_FP_ROUND can't handle equal types.
20338 if (DstVT == MVT::f80)
20339 return Add;
20340 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
20341 {Add.getValue(1), Add,
20342 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)});
20343 }
20344 unsigned Opc = ISD::FADD;
20345  // Windows needs the precision control changed to 80 bits around this add.
20346 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
20347 Opc = X86ISD::FP80_ADD;
20348
20349 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
20350 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
20351 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
20352}
20353
20354// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
20355// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
20356// just return an SDValue().
20357// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
20358// to i16, i32 or i64, and we lower it to a legal sequence and return the
20359// result.
20360SDValue X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
20361 bool IsSigned,
20362 SDValue &Chain) const {
20363 bool IsStrict = Op->isStrictFPOpcode();
20364 SDLoc DL(Op);
20365
20366 EVT DstTy = Op.getValueType();
20367 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
20368 EVT TheVT = Value.getValueType();
20369 auto PtrVT = getPointerTy(DAG.getDataLayout());
20370
20371 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
20372 // f16 must be promoted before using the lowering in this routine.
20373 // fp128 does not use this lowering.
20374 return SDValue();
20375 }
20376
20377 // If using FIST to compute an unsigned i64, we'll need some fixup
20378 // to handle values above the maximum signed i64. A FIST is always
20379 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
20380 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
20381
20382 // FIXME: This does not generate an invalid exception if the input does not
20383 // fit in i32. PR44019
20384 if (!IsSigned && DstTy != MVT::i64) {
20385 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
20386 // The low 32 bits of the fist result will have the correct uint32 result.
20387 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
20388 DstTy = MVT::i64;
20389 }
20390
20391 assert(DstTy.getSimpleVT() <= MVT::i64 &&
20392 DstTy.getSimpleVT() >= MVT::i16 &&
20393 "Unknown FP_TO_INT to lower!");
20394
20395 // We lower FP->int64 into FISTP64 followed by a load from a temporary
20396 // stack slot.
20397  MachineFunction &MF = DAG.getMachineFunction();
20398  unsigned MemSize = DstTy.getStoreSize();
20399 int SSFI =
20400 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
20401 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
20402
20403 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
20404
20405 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
20406
20407 if (UnsignedFixup) {
20408 //
20409 // Conversion to unsigned i64 is implemented with a select,
20410 // depending on whether the source value fits in the range
20411 // of a signed i64. Let Thresh be the FP equivalent of
20412 // 0x8000000000000000ULL.
20413 //
20414    // Adjust = (Value >= Thresh) ? 0x8000000000000000 : 0;
20415    // FltOfs = (Value >= Thresh) ? Thresh : 0.0;
20416    // FistSrc = (Value - FltOfs);
20417    // Fist-to-mem64 FistSrc
20418    // Add 0 or 0x8000000000000000 to the 64-bit result, which is
20419    // equivalent to XOR'ing the result with Adjust.
20420 //
20421 // Being a power of 2, Thresh is exactly representable in all FP formats.
20422 // For X87 we'd like to use the smallest FP type for this constant, but
20423 // for DAG type consistency we have to match the FP operand type.
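    // Worked example (added for exposition): for Value == 2^63 + 5, Cmp is true,
    // FltOfs == 2^63, FistSrc == 5, the FIST stores 5, and the final XOR with
    // Adjust (0x8000000000000000) yields 2^63 + 5 as an unsigned result.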
20424
20425 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
20426    APFloat::opStatus Status = APFloat::opOK;
20427    bool LosesInfo = false;
20428 if (TheVT == MVT::f64)
20429 // The rounding mode is irrelevant as the conversion should be exact.
20430      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
20431                              &LosesInfo);
20432 else if (TheVT == MVT::f80)
20433 Status = Thresh.convert(APFloat::x87DoubleExtended(),
20434 APFloat::rmNearestTiesToEven, &LosesInfo);
20435
20436 assert(Status == APFloat::opOK && !LosesInfo &&
20437 "FP conversion should have been exact");
20438
20439 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
20440
20441 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
20442 *DAG.getContext(), TheVT);
20443 SDValue Cmp;
20444 if (IsStrict) {
20445 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
20446 /*IsSignaling*/ true);
20447 Chain = Cmp.getValue(1);
20448 } else {
20449 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
20450 }
20451
20452 // Our preferred lowering of
20453 //
20454 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
20455 //
20456 // is
20457 //
20458 // (Value >= Thresh) << 63
20459 //
20460 // but since we can get here after LegalOperations, DAGCombine might do the
20461 // wrong thing if we create a select. So, directly create the preferred
20462 // version.
20463 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
20464 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
20465 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
20466
20467 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
20468 DAG.getConstantFP(0.0, DL, TheVT));
20469
20470 if (IsStrict) {
20471 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
20472 { Chain, Value, FltOfs });
20473 Chain = Value.getValue(1);
20474 } else
20475 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
20476 }
20477
20478  MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
20479
20480 // FIXME This causes a redundant load/store if the SSE-class value is already
20481 // in memory, such as if it is on the callstack.
20482 if (isScalarFPTypeInSSEReg(TheVT)) {
20483 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
20484 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
20485 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
20486 SDValue Ops[] = { Chain, StackSlot };
20487
20488 unsigned FLDSize = TheVT.getStoreSize();
20489 assert(FLDSize <= MemSize && "Stack slot not big enough");
20490    MachineMemOperand *MMO = MF.getMachineMemOperand(
20491        MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
20492 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
20493 Chain = Value.getValue(1);
20494 }
20495
20496 // Build the FP_TO_INT*_IN_MEM
20497  MachineMemOperand *MMO = MF.getMachineMemOperand(
20498      MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
20499 SDValue Ops[] = { Chain, Value, StackSlot };
20500  SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
20501                                         DAG.getVTList(MVT::Other),
20502 Ops, DstTy, MMO);
20503
20504 SDValue Res = DAG.getLoad(Op.getValueType(), DL, FIST, StackSlot, MPI);
20505 Chain = Res.getValue(1);
20506
20507 // If we need an unsigned fixup, XOR the result with adjust.
20508 if (UnsignedFixup)
20509 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
20510
20511 return Res;
20512}
20513
20514static SDValue LowerAVXExtend(SDValue Op, const SDLoc &dl, SelectionDAG &DAG,
20515                              const X86Subtarget &Subtarget) {
20516 MVT VT = Op.getSimpleValueType();
20517 SDValue In = Op.getOperand(0);
20518 MVT InVT = In.getSimpleValueType();
20519 unsigned Opc = Op.getOpcode();
20520
20521 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
20522 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
20523 "Unexpected extension opcode");
20524  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
20525         "Expected same number of elements");
20526 assert((VT.getVectorElementType() == MVT::i16 ||
20527 VT.getVectorElementType() == MVT::i32 ||
20528 VT.getVectorElementType() == MVT::i64) &&
20529 "Unexpected element type");
20530 assert((InVT.getVectorElementType() == MVT::i8 ||
20531 InVT.getVectorElementType() == MVT::i16 ||
20532 InVT.getVectorElementType() == MVT::i32) &&
20533 "Unexpected element type");
20534
20535 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
20536
20537 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
20538 assert(InVT == MVT::v32i8 && "Unexpected VT!");
20539 return splitVectorIntUnary(Op, DAG, dl);
20540 }
20541
20542 if (Subtarget.hasInt256())
20543 return Op;
20544
20545 // Optimize vectors in AVX mode:
20546 //
20547 // v8i16 -> v8i32
20548  //   Use vpmovzxwd for 4 lower elements  v8i16 -> v4i32.
20549 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
20550 // Concat upper and lower parts.
20551 //
20552 // v4i32 -> v4i64
20553  //   Use vpmovzxdq for 4 lower elements  v4i32 -> v2i64.
20554 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
20555 // Concat upper and lower parts.
20556 //
20557 MVT HalfVT = VT.getHalfNumVectorElementsVT();
20558 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
20559
20560 // Short-circuit if we can determine that each 128-bit half is the same value.
20561 // Otherwise, this is difficult to match and optimize.
20562 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
20563 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
20564 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
20565
20566 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
20567 SDValue Undef = DAG.getUNDEF(InVT);
20568 bool NeedZero = Opc == ISD::ZERO_EXTEND;
20569 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
20570 OpHi = DAG.getBitcast(HalfVT, OpHi);
20571
20572 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
20573}
20574
20575// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
20576static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
20577 const SDLoc &dl, SelectionDAG &DAG) {
20578 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
20579 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20580 DAG.getIntPtrConstant(0, dl));
20581 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
20582 DAG.getIntPtrConstant(8, dl));
20583 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
20584 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
20585 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
20586 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20587}
20588
20589static SDValue LowerZERO_EXTEND_Mask(SDValue Op, const SDLoc &DL,
20590                                     const X86Subtarget &Subtarget,
20591 SelectionDAG &DAG) {
20592 MVT VT = Op->getSimpleValueType(0);
20593 SDValue In = Op->getOperand(0);
20594 MVT InVT = In.getSimpleValueType();
20595 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
20596 unsigned NumElts = VT.getVectorNumElements();
20597
20598  // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
20599 // avoids a constant pool load.
20600 if (VT.getVectorElementType() != MVT::i8) {
20601 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
20602 return DAG.getNode(ISD::SRL, DL, VT, Extend,
20603 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
20604 }
20605
20606 // Extend VT if BWI is not supported.
20607 MVT ExtVT = VT;
20608 if (!Subtarget.hasBWI()) {
20609 // If v16i32 is to be avoided, we'll need to split and concatenate.
20610 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
20611 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
20612
20613 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
20614 }
20615
20616 // Widen to 512-bits if VLX is not supported.
20617 MVT WideVT = ExtVT;
20618 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
20619 NumElts *= 512 / ExtVT.getSizeInBits();
20620 InVT = MVT::getVectorVT(MVT::i1, NumElts);
20621 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
20622 In, DAG.getIntPtrConstant(0, DL));
20623 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
20624 NumElts);
20625 }
20626
20627 SDValue One = DAG.getConstant(1, DL, WideVT);
20628 SDValue Zero = DAG.getConstant(0, DL, WideVT);
20629
20630 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
20631
20632 // Truncate if we had to extend above.
20633 if (VT != ExtVT) {
20634 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
20635 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
20636 }
20637
20638 // Extract back to 128/256-bit if we widened.
20639 if (WideVT != VT)
20640 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
20641 DAG.getIntPtrConstant(0, DL));
20642
20643 return SelectedVal;
20644}
20645
20646static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
20647                                SelectionDAG &DAG) {
20648 SDValue In = Op.getOperand(0);
20649 MVT SVT = In.getSimpleValueType();
20650 SDLoc DL(Op);
20651
20652 if (SVT.getVectorElementType() == MVT::i1)
20653 return LowerZERO_EXTEND_Mask(Op, DL, Subtarget, DAG);
20654
20655 assert(Subtarget.hasAVX() && "Expected AVX support");
20656 return LowerAVXExtend(Op, DL, DAG, Subtarget);
20657}
20658
20659/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
20660/// It makes use of the fact that vectors with enough leading sign/zero bits
20661/// prevent the PACKSS/PACKUS from saturating the results.
20662/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
20663/// within each 128-bit lane.
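/// Illustrative example (added note): each call halves the element width with a
/// single PACK stage and then recurses, so e.g. a vXi32 -> vXi8 truncation is
/// performed as vXi32 -> vXi16 (PACK*SDW) followed by vXi16 -> vXi8 (PACK*SWB).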
20664static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
20665 const SDLoc &DL, SelectionDAG &DAG,
20666 const X86Subtarget &Subtarget) {
20667 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
20668 "Unexpected PACK opcode");
20669 assert(DstVT.isVector() && "VT not a vector?");
20670
20671 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
20672 if (!Subtarget.hasSSE2())
20673 return SDValue();
20674
20675 EVT SrcVT = In.getValueType();
20676
20677 // No truncation required, we might get here due to recursive calls.
20678 if (SrcVT == DstVT)
20679 return In;
20680
20681 unsigned NumElems = SrcVT.getVectorNumElements();
20682  if (NumElems < 2 || !isPowerOf2_32(NumElems))
20683 return SDValue();
20684
20685 unsigned DstSizeInBits = DstVT.getSizeInBits();
20686 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
20687 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
20688 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
20689
20690 LLVMContext &Ctx = *DAG.getContext();
20691 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
20692 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
20693
20694 // Pack to the largest type possible:
20695 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
20696 EVT InVT = MVT::i16, OutVT = MVT::i8;
20697 if (SrcVT.getScalarSizeInBits() > 16 &&
20698 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
20699 InVT = MVT::i32;
20700 OutVT = MVT::i16;
20701 }
20702
20703 // Sub-128-bit truncation - widen to 128-bit src and pack in the lower half.
20704 // On pre-AVX512, pack the src in both halves to help value tracking.
20705 if (SrcSizeInBits <= 128) {
20706 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
20707 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
20708 In = widenSubVector(In, false, Subtarget, DAG, DL, 128);
20709 SDValue LHS = DAG.getBitcast(InVT, In);
20710 SDValue RHS = Subtarget.hasAVX512() ? DAG.getUNDEF(InVT) : LHS;
20711 SDValue Res = DAG.getNode(Opcode, DL, OutVT, LHS, RHS);
20712 Res = extractSubVector(Res, 0, DAG, DL, SrcSizeInBits / 2);
20713 Res = DAG.getBitcast(PackedVT, Res);
20714 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20715 }
20716
20717 // Split lower/upper subvectors.
20718 SDValue Lo, Hi;
20719 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
20720
20721 // If Hi is undef, then don't bother packing it and widen the result instead.
20722 if (Hi.isUndef()) {
20723 EVT DstHalfVT = DstVT.getHalfNumVectorElementsVT(Ctx);
20724 if (SDValue Res =
20725 truncateVectorWithPACK(Opcode, DstHalfVT, Lo, DL, DAG, Subtarget))
20726 return widenSubVector(Res, false, Subtarget, DAG, DL, DstSizeInBits);
20727 }
20728
20729 unsigned SubSizeInBits = SrcSizeInBits / 2;
20730 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
20731 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
20732
20733 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
20734 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
20735 Lo = DAG.getBitcast(InVT, Lo);
20736 Hi = DAG.getBitcast(InVT, Hi);
20737 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20738 return DAG.getBitcast(DstVT, Res);
20739 }
20740
20741 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
20742 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
20743 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
20744 Lo = DAG.getBitcast(InVT, Lo);
20745 Hi = DAG.getBitcast(InVT, Hi);
20746 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
20747
20748 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
20749 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
20750 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
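    // For example (added note): for a vXi16 result, Scale == 4 and the scaled
    // mask is {0,1,2,3, 8,9,10,11, 4,5,6,7, 12,13,14,15}, i.e. a swap of the
    // middle 64-bit chunks.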
20751    SmallVector<int, 64> Mask;
20752    int Scale = 64 / OutVT.getScalarSizeInBits();
20753 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
20754 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
20755
20756 if (DstVT.is256BitVector())
20757 return DAG.getBitcast(DstVT, Res);
20758
20759 // If 512bit -> 128bit truncate another stage.
20760 Res = DAG.getBitcast(PackedVT, Res);
20761 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20762 }
20763
20764 // Recursively pack lower/upper subvectors, concat result and pack again.
20765 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
20766
20767 if (PackedVT.is128BitVector()) {
20768 // Avoid CONCAT_VECTORS on sub-128bit nodes as these can fail after
20769 // type legalization.
20770 SDValue Res =
20771 truncateVectorWithPACK(Opcode, PackedVT, In, DL, DAG, Subtarget);
20772 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20773 }
20774
20775 EVT HalfPackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
20776 Lo = truncateVectorWithPACK(Opcode, HalfPackedVT, Lo, DL, DAG, Subtarget);
20777 Hi = truncateVectorWithPACK(Opcode, HalfPackedVT, Hi, DL, DAG, Subtarget);
20778 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
20779 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
20780}
20781
20782/// Truncate using inreg zero extension (AND mask) and X86ISD::PACKUS.
20783/// e.g. trunc <8 x i32> X to <8 x i16> -->
20784/// MaskX = X & 0xffff (clear high bits to prevent saturation)
20785/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
20786static SDValue truncateVectorWithPACKUS(EVT DstVT, SDValue In, const SDLoc &DL,
20787                                        const X86Subtarget &Subtarget,
20788 SelectionDAG &DAG) {
20789 In = DAG.getZeroExtendInReg(In, DL, DstVT);
20790 return truncateVectorWithPACK(X86ISD::PACKUS, DstVT, In, DL, DAG, Subtarget);
20791}
20792
20793/// Truncate using inreg sign extension and X86ISD::PACKSS.
20794static SDValue truncateVectorWithPACKSS(EVT DstVT, SDValue In, const SDLoc &DL,
20795                                        const X86Subtarget &Subtarget,
20796 SelectionDAG &DAG) {
20797 EVT SrcVT = In.getValueType();
20798 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, SrcVT, In,
20799 DAG.getValueType(DstVT));
20800 return truncateVectorWithPACK(X86ISD::PACKSS, DstVT, In, DL, DAG, Subtarget);
20801}
20802
20803/// Helper to determine if \p In truncated to \p DstVT has the necessary
20804/// signbits / leading zero bits to be truncated with PACKSS / PACKUS,
20805/// possibly by converting a SRL node to SRA for sign extension.
20806static SDValue matchTruncateWithPACK(unsigned &PackOpcode, EVT DstVT,
20807 SDValue In, const SDLoc &DL,
20808 SelectionDAG &DAG,
20809 const X86Subtarget &Subtarget) {
20810 // Requires SSE2.
20811 if (!Subtarget.hasSSE2())
20812 return SDValue();
20813
20814 EVT SrcVT = In.getValueType();
20815 EVT DstSVT = DstVT.getVectorElementType();
20816 EVT SrcSVT = SrcVT.getVectorElementType();
20817 unsigned NumDstEltBits = DstSVT.getSizeInBits();
20818 unsigned NumSrcEltBits = SrcSVT.getSizeInBits();
20819
20820 // Check we have a truncation suited for PACKSS/PACKUS.
20821 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20822 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20823 return SDValue();
20824
20825 assert(NumSrcEltBits > NumDstEltBits && "Bad truncation");
20826 unsigned NumStages = Log2_32(NumSrcEltBits / NumDstEltBits);
20827
20828 // Truncation from 128-bit to vXi32 can be better handled with PSHUFD.
20829 // Truncation to sub-64-bit vXi16 can be better handled with PSHUFD/PSHUFLW.
20830 // Truncation from v2i64 to v2i8 can be better handled with PSHUFB.
20831 if ((DstSVT == MVT::i32 && SrcVT.getSizeInBits() <= 128) ||
20832 (DstSVT == MVT::i16 && SrcVT.getSizeInBits() <= (64 * NumStages)) ||
20833 (DstVT == MVT::v2i8 && SrcVT == MVT::v2i64 && Subtarget.hasSSSE3()))
20834 return SDValue();
20835
20836 // Prefer to lower v4i64 -> v4i32 as a shuffle unless we can cheaply
20837 // split this for packing.
20838 if (SrcVT == MVT::v4i64 && DstVT == MVT::v4i32 &&
20839 !isFreeToSplitVector(In.getNode(), DAG) &&
20840 (!Subtarget.hasAVX() || DAG.ComputeNumSignBits(In) != 64))
20841 return SDValue();
20842
20843  // Don't truncate AVX512 targets with multiple PACK node stages.
20844 if (Subtarget.hasAVX512() && NumStages > 1)
20845 return SDValue();
20846
20847 unsigned NumPackedSignBits = std::min<unsigned>(NumDstEltBits, 16);
20848 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
20849
20850 // Truncate with PACKUS if we are truncating a vector with leading zero
20851 // bits that extend all the way to the packed/truncated value.
20852 // e.g. Masks, zext_in_reg, etc.
20853 // Pre-SSE41 we can only use PACKUSWB.
20854 KnownBits Known = DAG.computeKnownBits(In);
20855 if ((NumSrcEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) {
20856 PackOpcode = X86ISD::PACKUS;
20857 return In;
20858 }
20859
20860 // Truncate with PACKSS if we are truncating a vector with sign-bits
20861 // that extend all the way to the packed/truncated value.
20862 // e.g. Comparison result, sext_in_reg, etc.
20863 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
20864
20865 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
20866 // a sign splat (or AVX512 VPSRAQ support). ComputeNumSignBits struggles to
20867 // see through BITCASTs later on and combines/simplifications can't then use
20868 // it.
20869 if (DstSVT == MVT::i32 && NumSignBits != NumSrcEltBits &&
20870 !Subtarget.hasAVX512())
20871 return SDValue();
20872
20873 unsigned MinSignBits = NumSrcEltBits - NumPackedSignBits;
20874 if (MinSignBits < NumSignBits) {
20875 PackOpcode = X86ISD::PACKSS;
20876 return In;
20877 }
20878
20879 // If we have a srl that only generates signbits that we will discard in
20880 // the truncation then we can use PACKSS by converting the srl to a sra.
20881 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
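  // For instance (added note): when truncating vXi32 -> vXi16, a srl by exactly
  // 16 leaves only bits that the truncation discards, so rewriting it as an sra
  // by 16 keeps the low 16 bits identical while providing the sign bits that
  // make PACKSS safe.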
20882 if (In.getOpcode() == ISD::SRL && In->hasOneUse())
20883 if (std::optional<uint64_t> ShAmt = DAG.getValidShiftAmount(In)) {
20884 if (*ShAmt == MinSignBits) {
20885 PackOpcode = X86ISD::PACKSS;
20886 return DAG.getNode(ISD::SRA, DL, SrcVT, In->ops());
20887 }
20888 }
20889
20890 return SDValue();
20891}
20892
20893/// This function lowers a vector truncation of 'extended sign-bits' or
20894/// 'extended zero-bits' values.
20895/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
20896static SDValue LowerTruncateVecPackWithSignBits(MVT DstVT, SDValue In,
20897                                                const SDLoc &DL,
20898 const X86Subtarget &Subtarget,
20899 SelectionDAG &DAG) {
20900 MVT SrcVT = In.getSimpleValueType();
20901 MVT DstSVT = DstVT.getVectorElementType();
20902 MVT SrcSVT = SrcVT.getVectorElementType();
20903 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20904 (DstSVT == MVT::i8 || DstSVT == MVT::i16 || DstSVT == MVT::i32)))
20905 return SDValue();
20906
20907 // If the upper half of the source is undef, then attempt to split and
20908 // only truncate the lower half.
20909 if (DstVT.getSizeInBits() >= 128) {
20910 SmallVector<SDValue> LowerOps;
20911 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20912 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20913 if (SDValue Res = LowerTruncateVecPackWithSignBits(DstHalfVT, Lo, DL,
20914 Subtarget, DAG))
20915 return widenSubVector(Res, false, Subtarget, DAG, DL,
20916 DstVT.getSizeInBits());
20917 }
20918 }
20919
20920 unsigned PackOpcode;
20921 if (SDValue Src =
20922 matchTruncateWithPACK(PackOpcode, DstVT, In, DL, DAG, Subtarget))
20923 return truncateVectorWithPACK(PackOpcode, DstVT, Src, DL, DAG, Subtarget);
20924
20925 return SDValue();
20926}
20927
20928/// This function lowers a vector truncation from vXi32/vXi64 to vXi8/vXi16 into
20929/// X86ISD::PACKUS/X86ISD::PACKSS operations.
20930static SDValue LowerTruncateVecPack(MVT DstVT, SDValue In, const SDLoc &DL,
20931                                    const X86Subtarget &Subtarget,
20932 SelectionDAG &DAG) {
20933 MVT SrcVT = In.getSimpleValueType();
20934 MVT DstSVT = DstVT.getVectorElementType();
20935 MVT SrcSVT = SrcVT.getVectorElementType();
20936 unsigned NumElems = DstVT.getVectorNumElements();
20937 if (!((SrcSVT == MVT::i16 || SrcSVT == MVT::i32 || SrcSVT == MVT::i64) &&
20938 (DstSVT == MVT::i8 || DstSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
20939 NumElems >= 8))
20940 return SDValue();
20941
20942  // SSSE3's pshufb results in fewer instructions in the cases below.
20943 if (Subtarget.hasSSSE3() && NumElems == 8) {
20944 if (SrcSVT == MVT::i16)
20945 return SDValue();
20946 if (SrcSVT == MVT::i32 && (DstSVT == MVT::i8 || !Subtarget.hasSSE41()))
20947 return SDValue();
20948 }
20949
20950 // If the upper half of the source is undef, then attempt to split and
20951 // only truncate the lower half.
20952 if (DstVT.getSizeInBits() >= 128) {
20953 SmallVector<SDValue> LowerOps;
20954 if (SDValue Lo = isUpperSubvectorUndef(In, DL, DAG)) {
20955 MVT DstHalfVT = DstVT.getHalfNumVectorElementsVT();
20956 if (SDValue Res = LowerTruncateVecPack(DstHalfVT, Lo, DL, Subtarget, DAG))
20957 return widenSubVector(Res, false, Subtarget, DAG, DL,
20958 DstVT.getSizeInBits());
20959 }
20960 }
20961
20962 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
20963 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
20964 // truncate 2 x v4i32 to v8i16.
20965 if (Subtarget.hasSSE41() || DstSVT == MVT::i8)
20966 return truncateVectorWithPACKUS(DstVT, In, DL, Subtarget, DAG);
20967
20968 if (SrcSVT == MVT::i16 || SrcSVT == MVT::i32)
20969 return truncateVectorWithPACKSS(DstVT, In, DL, Subtarget, DAG);
20970
20971 // Special case vXi64 -> vXi16, shuffle to vXi32 and then use PACKSS.
20972 if (DstSVT == MVT::i16 && SrcSVT == MVT::i64) {
20973 MVT TruncVT = MVT::getVectorVT(MVT::i32, NumElems);
20974 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, In);
20975 return truncateVectorWithPACKSS(DstVT, Trunc, DL, Subtarget, DAG);
20976 }
20977
20978 return SDValue();
20979}
20980
20981static SDValue LowerTruncateVecI1(SDValue Op, const SDLoc &DL,
20982                                  SelectionDAG &DAG,
20983 const X86Subtarget &Subtarget) {
20984 MVT VT = Op.getSimpleValueType();
20985 SDValue In = Op.getOperand(0);
20986 MVT InVT = In.getSimpleValueType();
20987 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
20988
20989 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
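  // Illustrative note (added for exposition): e.g. for a v16i8 input the LSB of
  // each byte is moved to the sign-bit position by a shift of 7, after which the
  // signed compare against zero below is selected to VPMOVB2M/VPMOVW2M (or, on
  // the widened path, TESTD/Q).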
20990 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
20991 if (InVT.getScalarSizeInBits() <= 16) {
20992 if (Subtarget.hasBWI()) {
20993 // legal, will go to VPMOVB2M, VPMOVW2M
20994 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
20995 // We need to shift to get the lsb into sign position.
20996 // Shift packed bytes not supported natively, bitcast to word
20997 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
20998 In = DAG.getNode(ISD::SHL, DL, ExtVT,
20999 DAG.getBitcast(ExtVT, In),
21000 DAG.getConstant(ShiftInx, DL, ExtVT));
21001 In = DAG.getBitcast(InVT, In);
21002 }
21003 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
21004 In, ISD::SETGT);
21005 }
21006 // Use TESTD/Q, extended vector to packed dword/qword.
21007 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
21008 "Unexpected vector type.");
21009 unsigned NumElts = InVT.getVectorNumElements();
21010 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
21011 // We need to change to a wider element type that we have support for.
21012 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
21013 // For 16 element vectors we extend to v16i32 unless we are explicitly
21014 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
21015 // we need to split into two 8 element vectors which we can extend to v8i32,
21016 // truncate and concat the results. There's an additional complication if
21017 // the original type is v16i8. In that case we can't split the v16i8
21018 // directly, so we need to shuffle high elements to low and use
21019 // sign_extend_vector_inreg.
21020 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
21021 SDValue Lo, Hi;
21022 if (InVT == MVT::v16i8) {
21023 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
21024 Hi = DAG.getVectorShuffle(
21025 InVT, DL, In, In,
21026 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
21027 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
21028 } else {
21029 assert(InVT == MVT::v16i16 && "Unexpected VT!");
21030 Lo = extract128BitVector(In, 0, DAG, DL);
21031 Hi = extract128BitVector(In, 8, DAG, DL);
21032 }
21033 // We're split now, just emit two truncates and a concat. The two
21034 // truncates will trigger legalization to come back to this function.
21035 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
21036 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
21037 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21038 }
21039 // We either have 8 elements or we're allowed to use 512-bit vectors.
21040 // If we have VLX, we want to use the narrowest vector that can get the
21041 // job done so we use vXi32.
21042 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
21043 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
21044 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
21045 InVT = ExtVT;
21046 ShiftInx = InVT.getScalarSizeInBits() - 1;
21047 }
21048
21049 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
21050 // We need to shift to get the lsb into sign position.
21051 In = DAG.getNode(ISD::SHL, DL, InVT, In,
21052 DAG.getConstant(ShiftInx, DL, InVT));
21053 }
21054 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
21055 if (Subtarget.hasDQI())
21056 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
21057 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
21058}
21059
21060SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
21061 SDLoc DL(Op);
21062 MVT VT = Op.getSimpleValueType();
21063 SDValue In = Op.getOperand(0);
21064 MVT InVT = In.getSimpleValueType();
21065  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
21066         "Invalid TRUNCATE operation");
21067
21068 // If we're called by the type legalizer, handle a few cases.
21069 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21070 if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(InVT)) {
21071 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
21072 VT.is128BitVector() && Subtarget.hasAVX512()) {
21073 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
21074 "Unexpected subtarget!");
21075 // The default behavior is to truncate one step, concatenate, and then
21076 // truncate the remainder. We'd rather produce two 64-bit results and
21077 // concatenate those.
21078 SDValue Lo, Hi;
21079 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
21080
21081 EVT LoVT, HiVT;
21082 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
21083
21084 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
21085 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
21086 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
21087 }
21088
21089 // Pre-AVX512 (or prefer-256bit) see if we can make use of PACKSS/PACKUS.
21090 if (!Subtarget.hasAVX512() ||
21091 (InVT.is512BitVector() && VT.is256BitVector()))
21092 if (SDValue SignPack =
21093 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21094 return SignPack;
21095
21096 // Pre-AVX512 see if we can make use of PACKSS/PACKUS.
21097 if (!Subtarget.hasAVX512())
21098 return LowerTruncateVecPack(VT, In, DL, Subtarget, DAG);
21099
21100 // Otherwise let default legalization handle it.
21101 return SDValue();
21102 }
21103
21104 if (VT.getVectorElementType() == MVT::i1)
21105 return LowerTruncateVecI1(Op, DL, DAG, Subtarget);
21106
21107 // Attempt to truncate with PACKUS/PACKSS even on AVX512 if we'd have to
21108 // concat from subvectors to use VPTRUNC etc.
21109 if (!Subtarget.hasAVX512() || isFreeToSplitVector(In.getNode(), DAG))
21110 if (SDValue SignPack =
21111 LowerTruncateVecPackWithSignBits(VT, In, DL, Subtarget, DAG))
21112 return SignPack;
21113
21114 // vpmovqb/w/d, vpmovdb/w, vpmovwb
21115 if (Subtarget.hasAVX512()) {
21116 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
21117 assert(VT == MVT::v32i8 && "Unexpected VT!");
21118 return splitVectorIntUnary(Op, DAG, DL);
21119 }
21120
21121    // word to byte only under BWI. Otherwise we have to promote to v16i32
21122 // and then truncate that. But we should only do that if we haven't been
21123 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
21124 // handled by isel patterns.
21125 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
21126 Subtarget.canExtendTo512DQ())
21127 return Op;
21128 }
21129
21130 // Handle truncation of V256 to V128 using shuffles.
21131 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
21132
21133 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
21134 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
21135 if (Subtarget.hasInt256()) {
21136 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
21137 In = DAG.getBitcast(MVT::v8i32, In);
21138 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
21139 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
21140 DAG.getIntPtrConstant(0, DL));
21141 }
21142
21143 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21144 DAG.getIntPtrConstant(0, DL));
21145 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21146 DAG.getIntPtrConstant(2, DL));
21147 static const int ShufMask[] = {0, 2, 4, 6};
21148 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
21149 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
21150 }
21151
21152 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
21153 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
21154 if (Subtarget.hasInt256()) {
21155 // The PSHUFB mask:
21156 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
21157 -1, -1, -1, -1, -1, -1, -1, -1,
21158 16, 17, 20, 21, 24, 25, 28, 29,
21159 -1, -1, -1, -1, -1, -1, -1, -1 };
21160 In = DAG.getBitcast(MVT::v32i8, In);
21161 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
21162 In = DAG.getBitcast(MVT::v4i64, In);
21163
21164 static const int ShufMask2[] = {0, 2, -1, -1};
21165 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
21166 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
21167 DAG.getIntPtrConstant(0, DL));
21168 return DAG.getBitcast(MVT::v8i16, In);
21169 }
21170
21171 return Subtarget.hasSSE41()
21172 ? truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG)
21173 : truncateVectorWithPACKSS(VT, In, DL, Subtarget, DAG);
21174 }
21175
21176 if (VT == MVT::v16i8 && InVT == MVT::v16i16)
21177 return truncateVectorWithPACKUS(VT, In, DL, Subtarget, DAG);
21178
21179 llvm_unreachable("All 256->128 cases should have been handled above!");
21180}
21181
21182// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
21183// behaves on out of range inputs to generate optimized conversions.
21184static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
21185                                    SelectionDAG &DAG,
21186 const X86Subtarget &Subtarget) {
21187 MVT SrcVT = Src.getSimpleValueType();
21188 unsigned DstBits = VT.getScalarSizeInBits();
21189 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
21190
21191 // Calculate the converted result for values in the range 0 to
21192 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21193 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
21194 SDValue Big =
21195 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
21196 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
21197 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
21198
21199 // The "CVTTP2SI" instruction conveniently sets the sign bit if
21200 // and only if the value was out of range. So we can use that
21201 // as our indicator that we rather use "Big" instead of "Small".
21202 //
21203 // Use "Small" if "IsOverflown" has all bits cleared
21204 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
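  // Worked example (added for exposition): for a lane holding 3.0e9f,
  //   Small = cvttps2dq(3.0e9f)        = 0x80000000 (out-of-range indicator)
  //   Big   = cvttps2dq(3.0e9f - 2^31) = 0x32D05E00 (852516352)
  // IsOverflown = sra(Small, 31) is all-ones, so Small | (Big & IsOverflown)
  // gives 0xB2D05E00 = 3000000000.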
21205
21206 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
21207 // use the slightly slower blendv select instead.
21208 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
21209 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
21210 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
21211 }
21212
21213 SDValue IsOverflown =
21214 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
21215 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
21216 return DAG.getNode(ISD::OR, dl, VT, Small,
21217 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21218}
21219
21220SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
21221 bool IsStrict = Op->isStrictFPOpcode();
21222 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
21223 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
21224 MVT VT = Op->getSimpleValueType(0);
21225 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21226 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
21227 MVT SrcVT = Src.getSimpleValueType();
21228 SDLoc dl(Op);
21229
21230 SDValue Res;
21231 if (isSoftF16(SrcVT, Subtarget)) {
21232 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21233 if (IsStrict)
21234 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
21235 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
21236 {NVT, MVT::Other}, {Chain, Src})});
21237 return DAG.getNode(Op.getOpcode(), dl, VT,
21238 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
21239 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
21240 return Op;
21241 }
21242
21243 if (VT.isVector()) {
21244 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
21245 MVT ResVT = MVT::v4i32;
21246 MVT TruncVT = MVT::v4i1;
21247 unsigned Opc;
21248 if (IsStrict)
21249        Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
21250      else
21251 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21252
21253 if (!IsSigned && !Subtarget.hasVLX()) {
21254 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
21255 // Widen to 512-bits.
21256 ResVT = MVT::v8i32;
21257 TruncVT = MVT::v8i1;
21258 Opc = Op.getOpcode();
21259 // Need to concat with zero vector for strict fp to avoid spurious
21260 // exceptions.
21261 // TODO: Should we just do this for non-strict as well?
21262 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
21263 : DAG.getUNDEF(MVT::v8f64);
21264 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
21265 DAG.getIntPtrConstant(0, dl));
21266 }
21267 if (IsStrict) {
21268 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
21269 Chain = Res.getValue(1);
21270 } else {
21271 Res = DAG.getNode(Opc, dl, ResVT, Src);
21272 }
21273
21274 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
21275 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
21276 DAG.getIntPtrConstant(0, dl));
21277 if (IsStrict)
21278 return DAG.getMergeValues({Res, Chain}, dl);
21279 return Res;
21280 }
21281
21282 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
21283 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
21284 return Op;
21285
21286 MVT ResVT = VT;
21287 MVT EleVT = VT.getVectorElementType();
21288 if (EleVT != MVT::i64)
21289 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
21290
21291 if (SrcVT != MVT::v8f16) {
21292 SDValue Tmp =
21293 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
21294 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
21295 Ops[0] = Src;
21296 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
21297 }
21298
21299 if (IsStrict) {
21300 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
21301                                   : X86ISD::STRICT_CVTTP2UI,
21302                        dl, {ResVT, MVT::Other}, {Chain, Src});
21303 Chain = Res.getValue(1);
21304 } else {
21305 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
21306 ResVT, Src);
21307 }
21308
21309 // TODO: Need to add exception check code for strict FP.
21310 if (EleVT.getSizeInBits() < 16) {
21311 ResVT = MVT::getVectorVT(EleVT, 8);
21312 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
21313 }
21314
21315 if (ResVT != VT)
21316 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21317 DAG.getIntPtrConstant(0, dl));
21318
21319 if (IsStrict)
21320 return DAG.getMergeValues({Res, Chain}, dl);
21321 return Res;
21322 }
21323
21324 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
21325 if (VT.getVectorElementType() == MVT::i16) {
21326 assert((SrcVT.getVectorElementType() == MVT::f32 ||
21327 SrcVT.getVectorElementType() == MVT::f64) &&
21328 "Expected f32/f64 vector!");
21329 MVT NVT = VT.changeVectorElementType(MVT::i32);
21330 if (IsStrict) {
21331 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
21332                                   : ISD::STRICT_FP_TO_UINT,
21333                        dl, {NVT, MVT::Other}, {Chain, Src});
21334 Chain = Res.getValue(1);
21335 } else {
21336 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
21337 NVT, Src);
21338 }
21339
21340 // TODO: Need to add exception check code for strict FP.
21341 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21342
21343 if (IsStrict)
21344 return DAG.getMergeValues({Res, Chain}, dl);
21345 return Res;
21346 }
21347
21348 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
21349 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
21350 assert(!IsSigned && "Expected unsigned conversion!");
21351 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
21352 return Op;
21353 }
21354
21355 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
21356 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
21357 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
21358 Subtarget.useAVX512Regs()) {
21359 assert(!IsSigned && "Expected unsigned conversion!");
21360 assert(!Subtarget.hasVLX() && "Unexpected features!");
21361 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21362 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21363 // Need to concat with zero vector for strict fp to avoid spurious
21364 // exceptions.
21365 // TODO: Should we just do this for non-strict as well?
21366 SDValue Tmp =
21367 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21368 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21369 DAG.getIntPtrConstant(0, dl));
21370
21371 if (IsStrict) {
21372 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
21373 {Chain, Src});
21374 Chain = Res.getValue(1);
21375 } else {
21376 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
21377 }
21378
21379 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21380 DAG.getIntPtrConstant(0, dl));
21381
21382 if (IsStrict)
21383 return DAG.getMergeValues({Res, Chain}, dl);
21384 return Res;
21385 }
21386
21387 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
21388 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
21389 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
21390 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
21391 assert(!Subtarget.hasVLX() && "Unexpected features!");
21392 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21393 // Need to concat with zero vector for strict fp to avoid spurious
21394 // exceptions.
21395 // TODO: Should we just do this for non-strict as well?
21396 SDValue Tmp =
21397 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
21398 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
21399 DAG.getIntPtrConstant(0, dl));
21400
21401 if (IsStrict) {
21402 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21403 {Chain, Src});
21404 Chain = Res.getValue(1);
21405 } else {
21406 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
21407 }
21408
21409 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
21410 DAG.getIntPtrConstant(0, dl));
21411
21412 if (IsStrict)
21413 return DAG.getMergeValues({Res, Chain}, dl);
21414 return Res;
21415 }
21416
21417 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
21418 if (!Subtarget.hasVLX()) {
21419      // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
21420 // legalizer and then widened again by vector op legalization.
21421 if (!IsStrict)
21422 return SDValue();
21423
21424 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
21425 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
21426 {Src, Zero, Zero, Zero});
21427 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
21428 {Chain, Tmp});
21429 SDValue Chain = Tmp.getValue(1);
21430 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
21431 DAG.getIntPtrConstant(0, dl));
21432 return DAG.getMergeValues({Tmp, Chain}, dl);
21433 }
21434
21435 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
21436 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
21437 DAG.getUNDEF(MVT::v2f32));
21438 if (IsStrict) {
21439 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
21440                               : X86ISD::STRICT_CVTTP2UI;
21441      return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
21442 }
21443 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
21444 return DAG.getNode(Opc, dl, VT, Tmp);
21445 }
21446
21447 // Generate optimized instructions for pre AVX512 unsigned conversions from
21448 // vXf32 to vXi32.
21449 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
21450 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
21451 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
21452 assert(!IsSigned && "Expected unsigned conversion!");
21453 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
21454 }
21455
21456 return SDValue();
21457 }
21458
21459 assert(!VT.isVector());
21460
21461 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
21462
21463 if (!IsSigned && UseSSEReg) {
21464 // Conversions from f32/f64 with AVX512 should be legal.
21465 if (Subtarget.hasAVX512())
21466 return Op;
21467
21468 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
21469 // behaves on out of range inputs to generate optimized conversions.
21470 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
21471 (VT == MVT::i64 && Subtarget.is64Bit()))) {
21472 unsigned DstBits = VT.getScalarSizeInBits();
21473 APInt UIntLimit = APInt::getSignMask(DstBits);
21474 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
21475 DAG.getConstant(UIntLimit, dl, VT));
21476 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
21477
21478 // Calculate the converted result for values in the range:
21479 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
21480 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
21481 SDValue Small =
21482 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
21483 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
21484 SDValue Big = DAG.getNode(
21485 X86ISD::CVTTS2SI, dl, VT,
21486 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
21487 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
21488
21489 // The "CVTTS2SI" instruction conveniently sets the sign bit if
21490 // and only if the value was out of range. So we can use that
21491 // as our indicator that we rather use "Big" instead of "Small".
21492 //
21493 // Use "Small" if "IsOverflown" has all bits cleared
21494 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
21495 SDValue IsOverflown = DAG.getNode(
21496 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
21497 return DAG.getNode(ISD::OR, dl, VT, Small,
21498 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
21499 }
21500
21501 // Use default expansion for i64.
21502 if (VT == MVT::i64)
21503 return SDValue();
21504
21505 assert(VT == MVT::i32 && "Unexpected VT!");
21506
21507 // Promote i32 to i64 and use a signed operation on 64-bit targets.
21508 // FIXME: This does not generate an invalid exception if the input does not
21509 // fit in i32. PR44019
21510 if (Subtarget.is64Bit()) {
21511 if (IsStrict) {
21512 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
21513 {Chain, Src});
21514 Chain = Res.getValue(1);
21515 } else
21516 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
21517
21518 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21519 if (IsStrict)
21520 return DAG.getMergeValues({Res, Chain}, dl);
21521 return Res;
21522 }
21523
21524 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
21525 // use fisttp which will be handled later.
21526 if (!Subtarget.hasSSE3())
21527 return SDValue();
21528 }
21529
21530 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
21531 // FIXME: This does not generate an invalid exception if the input does not
21532 // fit in i16. PR44019
21533 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
21534 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
21535 if (IsStrict) {
21536 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
21537 {Chain, Src});
21538 Chain = Res.getValue(1);
21539 } else
21540 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
21541
21542 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
21543 if (IsStrict)
21544 return DAG.getMergeValues({Res, Chain}, dl);
21545 return Res;
21546 }
21547
21548 // If this is a FP_TO_SINT using SSEReg we're done.
21549 if (UseSSEReg && IsSigned)
21550 return Op;
21551
21552 // fp128 needs to use a libcall.
21553 if (SrcVT == MVT::f128) {
21554 RTLIB::Libcall LC;
21555 if (IsSigned)
21556 LC = RTLIB::getFPTOSINT(SrcVT, VT);
21557 else
21558 LC = RTLIB::getFPTOUINT(SrcVT, VT);
21559
21560 MakeLibCallOptions CallOptions;
21561 std::pair<SDValue, SDValue> Tmp =
21562 makeLibCall(DAG, LC, VT, Src, CallOptions, dl, Chain);
21563
21564 if (IsStrict)
21565 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
21566
21567 return Tmp.first;
21568 }
21569
21570 // Fall back to X87.
21571 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
21572 if (IsStrict)
21573 return DAG.getMergeValues({V, Chain}, dl);
21574 return V;
21575 }
21576
21577 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
21578}
21579
21580SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
21581 SelectionDAG &DAG) const {
21582 SDValue Src = Op.getOperand(0);
21583 EVT DstVT = Op.getSimpleValueType();
21584 MVT SrcVT = Src.getSimpleValueType();
21585
21586 if (SrcVT.isVector())
21587 return DstVT.getScalarType() == MVT::i32 ? Op : SDValue();
21588
21589 if (SrcVT == MVT::f16)
21590 return SDValue();
21591
21592 // If the source is in an SSE register, the node is Legal.
21593 if (isScalarFPTypeInSSEReg(SrcVT))
21594 return Op;
21595
21596 return LRINT_LLRINTHelper(Op.getNode(), DAG);
21597}
21598
21599SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
21600 SelectionDAG &DAG) const {
21601 EVT DstVT = N->getValueType(0);
21602 SDValue Src = N->getOperand(0);
21603 EVT SrcVT = Src.getValueType();
21604
21605 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
21606 // f16 must be promoted before using the lowering in this routine.
21607 // fp128 does not use this lowering.
21608 return SDValue();
21609 }
21610
21611 SDLoc DL(N);
21612 SDValue Chain = DAG.getEntryNode();
21613
21614 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
21615
21616 // If we're converting from SSE, the stack slot needs to hold both types.
21617 // Otherwise it only needs to hold the DstVT.
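  // For example (added note): with an SSE f64 source and an i64 result, the slot
  // first holds the 8-byte spill of the source (reloaded via X86ISD::FLD below)
  // and is then reused for the 8-byte FIST result; with an x87 f80 source only
  // the DstVT-sized result is ever stored there.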
21618 EVT OtherVT = UseSSE ? SrcVT : DstVT;
21619 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
21620 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
21621 MachinePointerInfo MPI =
21622      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
21623
21624 if (UseSSE) {
21625 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
21626 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
21627 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21628 SDValue Ops[] = { Chain, StackPtr };
21629
21630 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
21631 /*Align*/ std::nullopt,
21632                                  MachineMemOperand::MOLoad);
21633    Chain = Src.getValue(1);
21634 }
21635
21636 SDValue StoreOps[] = { Chain, Src, StackPtr };
21637 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
21638 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
21639                                  MachineMemOperand::MOStore);
21640  return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21641 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
21642}
21643
21644SDValue
21645X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
21646 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
21647 // but making use of X86 specifics to produce better instruction sequences.